In [1]:
import itertools
import re

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

## Function Definition

In [2]:
def train_pca(X, components = 3):
    """Fit a standardize-then-PCA projector on stacked feature vectors.

    The previous implementation standardized ``X`` before fitting the PCA but
    returned only the PCA, so callers that later ran ``.transform`` on raw
    vectors projected data on a different scale from the one the components
    were learned on. The scaler and the PCA are now returned as one fitted
    pipeline so the same standardization is applied at transform time.

    NOTE(review): this changes the numeric features produced downstream, so
    previously tuned values (PCA dimension, XGBoost parameters) should be
    re-validated.

    Parameters
    ----------
    X : 2-D array-like
        One feature vector per row.
    components : int, default 3
        Number of principal components to keep.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted ``StandardScaler -> PCA`` pipeline. It exposes ``.transform``
        like a bare PCA; the PCA step itself is available as
        ``named_steps['pca']``.
    """
    projector = make_pipeline(StandardScaler(), PCA(n_components = components))
    projector.fit(X)
    return projector

In [3]:
def dimensional_reduction(X, pca):
    """Summarise each group of feature vectors as one fixed-length row.

    Every element of ``X`` is projected with ``pca.transform`` and reduced to:
    the per-component mean, the per-component standard deviation, the number
    of vectors in the group, and the Pearson correlation of every pair of
    components (pairs ordered (0,1), (0,2), ..., (1,2), ...).

    Parameters
    ----------
    X : iterable of 2-D arrays
        One array of feature vectors per group.
    pca : object with a ``transform`` method
        Fitted projector returning a 2-D array (rows x components).

    Returns
    -------
    numpy.ndarray
        One row per group. Rows can contain NaN (for example a group with a
        single vector has undefined correlations); callers filter those out.
    """
    reduced_X = []
    for images in X:
        projected = pca.transform(images)
        n_components = projected.shape[1]
        parts = [
            np.mean(projected, axis = 0),
            np.std(projected, axis = 0),
            [len(images)],
        ]
        if n_components > 1:
            # One correlation matrix instead of one corrcoef call per pair.
            # Transposing explicitly keeps "one variable per row" unambiguous
            # even when the group holds a single vector.
            corr = np.corrcoef(projected.T)
            # Row-major upper triangle matches itertools.combinations order.
            parts.append(corr[np.triu_indices(n_components, k = 1)])
        reduced_X.append(np.concatenate(parts))
    return np.array(reduced_X)

In [4]:
def linear_regression(X, y, fold = 4, seed = 42):
    """Return the mean K-fold cross-validated R² of an XGBoost regressor.

    Despite the name, the model is ``xgb.XGBRegressor``, not a linear model.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one sample per row (indexed with integer arrays).
    y : numpy.ndarray
        Target values aligned with the rows of ``X``.
    fold : int, default 4
        Number of cross-validation splits. This was previously ignored in
        favour of a hard-coded 4.
    seed : int, default 42
        Seed for both the fold shuffling and the regressor.

    Returns
    -------
    float
        Mean of the per-fold R² scores on the held-out folds.
    """
    # NOTE(review): XGBoost documents "reg:linear" as a deprecated alias of
    # "reg:squarederror"; kept as-is to match the rest of the notebook.
    regr = xgb.XGBRegressor(objective="reg:linear", random_state=seed)

    # Shuffled K-fold split, honouring the requested number of folds.
    kf = KFold(n_splits = fold, shuffle = True, random_state=seed)
    scores = []
    for train_ids, valid_ids in kf.split(X):
        regr.fit(X[train_ids], y[train_ids])
        y_pred = regr.predict(X[valid_ids])
        scores.append(r2_score(y[valid_ids], y_pred))

    return np.mean(scores)

In [6]:
# Scratch check: position of the largest element (first occurrence on ties).
temp = [1, 2, 33, 3]
int(np.argmax(temp))

2

## Read in Data

In [None]:
# Load the raw 2017 feature strings and join the GDP labels onto them.
data = pd.read_csv('Data/2017_features.csv')
data = data.dropna()
label = pd.read_csv('Data/PAC_GDP17.csv')
# NOTE(review): a left join keeps feature rows that have no match in the label
# file, leaving their GDP/PAC as NaN — confirm every row is labelled.
data = pd.merge(data, label, how='left')
X = []
y = []
PAC = []
for i in data.index:
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    # 'features' is a bracketed, comma-separated list of floats: strip the
    # brackets/separators, then regroup the values into rows of 4096.
    x_i = [float(x) for x in re.split(r', |\[|\]', data['features'].loc[i]) if len(x) > 0]
    x_i = np.array(x_i).reshape(-1, 4096)
    X.append(x_i)
    y.append(data['GDP'].loc[i])
    PAC.append(data['PAC'].loc[i])
# Positions of rows that parsed to zero feature vectors. enumerate() is used
# because list.index() on ndarrays falls back to element-wise ==, which is
# ambiguous (or raises) for arrays.
zeros = [idx for idx, x in enumerate(X) if x.shape[0] == 0]
# Keep X as a plain list: the groups have different row counts, so np.array(X)
# would be ragged, and np.delete without an axis would flatten a regular stack.
X = [x for x in X if x.shape[0] > 0]
y = np.delete(np.array(y), zeros)
PAC = np.delete(np.array(PAC), zeros)
# All feature vectors stacked into one 2-D array, used to fit the PCA.
X_ = np.concatenate(X)

## Identifying the Optimal Dimension of PCA via K-Fold Cross-Validation

In [None]:
# Sweep the PCA dimensionality and record the cross-validated R² of each setting.
r2_scores = []
pca_range = list(range(3, 26))
for dimension in pca_range:
    print('=======', dimension, '=======')

    # Fit the reducer on all stacked feature vectors.
    pca = train_pca(X_, components = dimension)

    # Project every group and build its summary-statistics row.
    reduced_trainX = dimensional_reduction(X, pca)

    # Drop groups whose summary row contains NaN, keeping the labels aligned.
    has_nan = np.isnan(reduced_trainX).any(axis=1)
    yt = y[~has_nan]
    PACt = PAC[~has_nan]
    reduced_trainX = reduced_trainX[~has_nan]

    # Score this dimensionality on the log-transformed target.
    r2 = linear_regression(reduced_trainX, np.log(yt))
    r2_scores.append(r2)
    print('r2:', r2)

In [None]:
# Cross-validated R² recorded for each candidate PCA dimension, in pca_range order.
r2_scores

## Grid Search for the Optimal Parameters of XGBoost

In [None]:
# Dimension with the highest cross-validated R² (the first one wins on ties).
pca_opt = max(zip(pca_range, r2_scores), key=lambda pair: pair[1])[0] # optimal value in our experiments

In [None]:
# Refit the reducer with the selected dimensionality.
pca = train_pca(X_, components = pca_opt)

# Project every group and build its summary-statistics row.
reduced_trainX = dimensional_reduction(X, pca)

# Drop groups whose summary row contains NaN, keeping the labels aligned.
has_nan = np.isnan(reduced_trainX).any(axis=1)
yt = y[~has_nan]
PACt = PAC[~has_nan]
reduced_trainX = reduced_trainX[~has_nan]

In [None]:
# Persist the reduced training matrix together with its target and PAC columns.
tX_df = pd.DataFrame(reduced_trainX).assign(Y = yt, PAC = PACt)
tX_df.to_csv('Process/trainData2017.csv', index = False, header = True)
# Release the large raw feature containers; only the reduced matrix is used below.
X, X_ = [], []

#### Coarse Tuning + Fine Tuning

In [None]:
# Illustrative coarse grid only; the real tuning iterates with finer grids.
tree_params = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'random_state': 42,
}
# NOTE(review): XGBoost documents eta's range as [0, 1]; this grid also tries
# 3.1, 6.1 and 9.1 — confirm that is intended.
cv_params = {
    'eta': np.arange(0.1, 10, 3),
    'subsample': np.arange(0.5, 1, 0.2),
    'lambda': np.arange(1, 100, 30),
    'min_child_weight': np.arange(1, 100, 30),
}
model = xgb.XGBRegressor(**tree_params)
# Exhaustive search scored by R² with 4-fold cross-validation.
optimized_GBM = GridSearchCV(
    estimator=model,
    param_grid=cv_params,
    scoring='r2',
    cv=4,
    verbose=1,
)
optimized_GBM.fit(reduced_trainX, np.log(yt))

In [None]:
# Report the grid search outcome: the best parameter combination and its
# cross-validated score (R², per the scoring='r2' setting of the search).
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))  # "best parameter values"
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))  # "best model score"

In [7]:
### iterate the tuning process

In [None]:
# Hyper-parameters that performed best in our experiments (found with PCA = 21).
opt_params = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'random_state': 42,
    'subsample': 0.85,
    'eta': 0.09,
    'lambda': 0.1,
    'min_child_weight': 75,
}
opt_model = xgb.XGBRegressor(**opt_params)

## Prediction

In [None]:
# Load the raw 2018 feature strings and join the GDP labels onto them.
data1 = pd.read_csv('Data/2018_features.csv')
data1 = data1.dropna()
label1 = pd.read_csv('Data/PAC_GDP18.csv')
# NOTE(review): a left join keeps feature rows that have no match in the label
# file, leaving their GDP/PAC as NaN — confirm every row is labelled.
data1 = pd.merge(data1, label1, how='left')
X1 = []
y1 = []
PAC1 = []
for i in data1.index:
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    # 'features' is a bracketed, comma-separated list of floats: strip the
    # brackets/separators, then regroup the values into rows of 4096.
    x_i = [float(x) for x in re.split(r', |\[|\]', data1['features'].loc[i]) if len(x) > 0]
    x_i = np.array(x_i).reshape(-1, 4096)
    X1.append(x_i)
    y1.append(data1['GDP'].loc[i])
    PAC1.append(data1['PAC'].loc[i])
# Positions of rows that parsed to zero feature vectors. enumerate() is used
# because list.index() on ndarrays falls back to element-wise ==, which is
# ambiguous (or raises) for arrays.
zeros1 = [idx for idx, x in enumerate(X1) if x.shape[0] == 0]
# Keep X1 as a plain list: the groups have different row counts, so np.array(X1)
# would be ragged, and np.delete without an axis would flatten a regular stack.
X1 = [x for x in X1 if x.shape[0] > 0]
y1 = np.delete(np.array(y1), zeros1)
PAC1 = np.delete(np.array(PAC1), zeros1)

In [None]:
# Project the 2018 groups with the reducer fitted on the 2017 data.
reduced_validX = dimensional_reduction(X1, pca)

# Drop groups whose summary row contains NaN, keeping the labels aligned.
has_nan1 = np.isnan(reduced_validX).any(axis=1)
y1v = y1[~has_nan1]
PAC1v = PAC1[~has_nan1]
reduced_validX = reduced_validX[~has_nan1]

# Persist the reduced validation matrix with its target and PAC columns.
vX_df = pd.DataFrame(reduced_validX).assign(Y = y1v, PAC = PAC1v)
vX_df.to_csv('Process/validData2018.csv', index = False, header = True)

In [None]:
# Fit the tuned model on the 2017 features (log-GDP target), then score its
# 2018 predictions with R² on the log scale.
# The original referenced `reduced_trianX`, a typo that raised NameError.
opt_model.fit(reduced_trainX, np.log(yt))
y_pred = opt_model.predict(reduced_validX)
r2_score(np.log(y1v), y_pred)

In [None]:
# Save the 2018 predictions next to the observed GDP and each row's PAC value.
# The model is fit on log(GDP), so np.exp maps predictions back to the GDP scale.
pred_df = pd.DataFrame({'pred_y': np.exp(y_pred), 'y': y1v, 'PAC': PAC1v})
# The selected PCA dimension is embedded in the output file name.
pred_df.to_csv('Result/pred2018_PCA{}_XGB.csv'.format(pca_opt), header = True, index = False)