In [1]:
import pandas as pd
import re
import numpy as np
import itertools
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

## Function Definition

In [2]:
def train_pca(X, components = 3):
    """Fit a PCA model on a standardized copy of ``X``.

    Parameters
    ----------
    X : 2-D array-like
        Samples in rows, features in columns.
    components : int, default 3
        Number of principal components to keep.

    Returns
    -------
    sklearn.decomposition.PCA
        The fitted PCA object.
    """
    # NOTE(review): the fitted StandardScaler is not returned, so callers that
    # feed unscaled data to the returned PCA's transform() project data on a
    # different scale than the one the PCA was fitted on -- confirm intended.
    standardized = StandardScaler().fit_transform(X)
    return PCA(n_components = components).fit(standardized)

In [3]:
def dimensional_reduction_ablation(X, pca):
    """Summarize PCA projections per group, with four ablated feature variants.

    Every element of ``X`` is projected with ``pca.transform`` and summarized
    by the per-component mean, the per-component standard deviation, the
    number of rows in the element, and the correlation coefficient between
    every pair of projected components.

    Parameters
    ----------
    X : iterable of 2-D arrays
        One array per group; each is passed to ``pca.transform``.
    pca : object with a ``transform`` method
        Fitted dimensionality reducer.

    Returns
    -------
    tuple of five numpy arrays, one row per element of ``X``:
        reduced_X      -- [mean, std, row count, correlations]
        reduced_X_mean -- mean left out:         [std, row count, correlations]
        reduced_X_std  -- std left out:          [mean, row count, correlations]
        reduced_X_len  -- row count left out:    [mean, std, correlations]
        reduced_X_rho  -- correlations left out: [mean, std, row count]

    Correlations can be NaN (for example when a component has zero variance
    within a group); no filtering is done here.
    """
    reduced_X = []
    reduced_X_mean = []
    reduced_X_std = []
    reduced_X_len = []
    reduced_X_rho = []

    for images in X:
        projected = pca.transform(images)

        # Compute every statistic once and reuse it for all five variants.
        mean = np.mean(projected, axis = 0)
        std = np.std(projected, axis = 0)
        count = [len(images)]
        rho = np.array(
            [np.corrcoef(projected[:, a], projected[:, b])[0][1]
             for a, b in itertools.combinations(range(projected.shape[1]), 2)],
            dtype = float,
        )

        reduced_X.append(np.concatenate([mean, std, count, rho]))
        reduced_X_mean.append(np.concatenate([std, count, rho]))
        reduced_X_std.append(np.concatenate([mean, count, rho]))
        reduced_X_len.append(np.concatenate([mean, std, rho]))
        reduced_X_rho.append(np.concatenate([mean, std, count]))

    return (np.array(reduced_X),
            np.array(reduced_X_mean),
            np.array(reduced_X_std),
            np.array(reduced_X_len),
            np.array(reduced_X_rho))

In [4]:
def linear_regression(X, y, fold = 4, seed = 42):
    """Return the mean R^2 of an XGBoost regressor under shuffled K-fold CV.

    Despite its name, this function fits ``xgb.XGBRegressor``, not an
    ordinary linear model.

    Parameters
    ----------
    X : numpy array
        Feature matrix; must support indexing with an integer index array.
    y : numpy array
        Targets aligned with the rows of ``X``.
    fold : int, default 4
        Number of cross-validation splits.
    seed : int, default 42
        Random state for both the fold shuffling and the regressor.

    Returns
    -------
    float
        Mean of the per-fold R^2 scores on the held-out folds.
    """
    # NOTE(review): "reg:linear" is the legacy objective name; newer XGBoost
    # releases call it "reg:squarederror" -- confirm against installed version.
    regr = xgb.XGBRegressor(objective="reg:linear", random_state=seed)

    # Honour the ``fold`` argument (it used to be ignored in favour of a
    # hard-coded 4, which is still the default).
    kf = KFold(n_splits = fold, shuffle = True, random_state = seed)

    scores = []
    for train_ids, valid_ids in kf.split(X):
        regr.fit(X[train_ids], y[train_ids])
        y_pred = regr.predict(X[valid_ids])
        scores.append(r2_score(y[valid_ids], y_pred))

    return np.mean(scores)

In [5]:
def _ablation_feature_sets(X, pca):
    """Return the five ablation feature matrices for ``X`` keyed by name."""
    names = ('whole', 'mean', 'std', 'len', 'rho')
    return dict(zip(names, dimensional_reduction_ablation(X, pca)))


def _drop_nan_rows(features, *aligned):
    """Drop rows of ``features`` containing NaN and the matching entries of each aligned array.

    Returns a list ``[features_kept, aligned_0_kept, aligned_1_kept, ...]``.
    """
    # Compute the NaN mask once and reuse it for the features and every
    # aligned array, so they can never get out of step.
    has_nan = np.isnan(features).any(axis=1)
    nan_rows = np.flatnonzero(has_nan)
    kept = [features[~has_nan, :]]
    kept.extend(np.delete(values, nan_rows) for values in aligned)
    return kept


def pca_res_ablation_both(dimension, X, X1, y, y1, PAC, PAC1):
    """Build NaN-free ablation feature sets for a training and a validation collection.

    The PCA is fitted on the concatenation of ``X`` only and then applied to
    both ``X`` and ``X1``.

    Parameters
    ----------
    dimension : int
        Number of PCA components.
    X, X1 : sequences of 2-D arrays
        Training / validation groups.
    y, y1 : array-like
        Targets aligned with ``X`` / ``X1``.
    PAC, PAC1 : array-like
        Identifiers aligned with ``X`` / ``X1``.

    Returns
    -------
    (dict, dict)
        Training and validation dicts keyed by 'whole', 'mean', 'std', 'len'
        and 'rho'. Each value is a list ``[features, targets, PAC]`` from
        which the rows with NaN features have been removed.
    """
    # Fit the dimensionality reducer on the training groups.
    pca = train_pca(np.concatenate(X), components = dimension)

    # Reduce dimensionality and extract the summary features.
    trainSet = _ablation_feature_sets(X, pca)
    validSet = _ablation_feature_sets(X1, pca)

    # Remove NaN rows separately for every ablation variant.
    reduced_trainSet = {cls: _drop_nan_rows(feats, y, PAC)
                        for cls, feats in trainSet.items()}
    reduced_validSet = {cls: _drop_nan_rows(validSet[cls], y1, PAC1)
                        for cls in trainSet}

    return reduced_trainSet, reduced_validSet

def pca_res_ablation(dimension, X, y, PAC):
    """Build NaN-free ablation feature sets for a single collection.

    The PCA is fitted on the concatenation of ``X`` and applied to ``X``.

    Parameters
    ----------
    dimension : int
        Number of PCA components.
    X : sequence of 2-D arrays
        Groups to summarize.
    y : array-like
        Targets aligned with ``X``.
    PAC : array-like
        Identifiers aligned with ``X``.

    Returns
    -------
    dict
        Keyed by 'whole', 'mean', 'std', 'len' and 'rho'. Each value is a
        list ``[features, targets, PAC]`` from which the rows with NaN
        features have been removed.
    """
    # Fit the dimensionality reducer.
    pca = train_pca(np.concatenate(X), components = dimension)

    # Reduce dimensionality and extract the summary features.
    trainSet = _ablation_feature_sets(X, pca)

    # Remove NaN rows separately for every ablation variant.
    reduced_trainSet = {cls: _drop_nan_rows(feats, y, PAC)
                        for cls, feats in trainSet.items()}

    return reduced_trainSet

## Read in Data

In [None]:
# Read in the raw 2017 data and join the PAC/GDP labels onto the features.
data = pd.read_csv('Data/2017_features.csv')
data = data.dropna()
label = pd.read_csv('Data/PAC_GDP17.csv')
data = pd.merge(data, label, how='left')
X = []
y = []
PAC = []
for i in data.index:
    if i % 100 == 0:
        print(i)
    # Split the 'features' string on ', ', '[' and ']' and parse the remaining
    # tokens as floats, then arrange them in rows of 4096 values.
    x_i = [float(x) for x in re.split(r', |\[|\]', data['features'].loc[i]) if len(x) > 0]
    x_i = np.array(x_i).reshape(-1, 4096)
    X.append(x_i)
    y.append(data['GDP'].loc[i])
    PAC.append(data['PAC'].loc[i])

# Positions of entries that have no feature rows. enumerate() is used because
# list.index() compares ndarrays with ==, which is ambiguous or raises.
zeros = [pos for pos, x in enumerate(X) if x.shape[0] == 0]

# Store the kept feature blocks in a 1-D object array, filled element by
# element. np.array(X) would raise on ragged input with recent NumPy, and on
# equal-shaped input it would build a 3-D array that np.delete then flattens.
kept = [x for x in X if x.shape[0] > 0]
X = np.empty(len(kept), dtype=object)
for pos, x in enumerate(kept):
    X[pos] = x
y = np.delete(np.array(y), zeros)
PAC = np.delete(np.array(PAC), zeros)

## Identifying the Optimal Dimension of PCA via K-Fold Cross-Validation

In [None]:
# Cross-validated R^2 for every candidate PCA dimension and every ablation
# feature set; all_res collects one list of scores per dimension.
pca_range = list(range(10, 26))
all_res = []
for d in pca_range:
    # Reduce to d dimensions and rebuild the feature sets.
    print('==========', d, '==========')
    reduced_trainSet = pca_res_ablation(d, X, y, PAC)

    # Score each way of computing the statistics; the target is log-transformed.
    cur_res = []
    for cls in reduced_trainSet:
        cv_X, cv_y = reduced_trainSet[cls][:2]
        score = linear_regression(cv_X, np.log(cv_y))
        cur_res.append(score)
        print(cls, score)

    all_res.append(cur_res)
    print('dimension:', d, ' results:', cur_res)

In [None]:
# Convert the per-dimension score lists into an array: one row per PCA
# dimension in pca_range, one column per ablation feature set.
all_res = np.array(all_res)

In [None]:
# Display the cross-validation score table.
all_res

## Grid Search for the Optimal Parameters of XGBoost

In [None]:
# Read in the 2018 data and join the PAC/GDP labels onto the features.
data1 = pd.read_csv('Data/2018_features.csv')
data1 = data1.dropna()
label1 = pd.read_csv('Data/PAC_GDP18.csv')
data1 = pd.merge(data1, label1, how='left')
X1 = []
y1 = []
PAC1 = []
for i in data1.index:
    if i % 100 == 0:
        print(i)
    # Split the 'features' string on ', ', '[' and ']' and parse the remaining
    # tokens as floats, then arrange them in rows of 4096 values.
    x_i = [float(x) for x in re.split(r', |\[|\]', data1['features'].loc[i]) if len(x) > 0]
    x_i = np.array(x_i).reshape(-1, 4096)
    X1.append(x_i)
    y1.append(data1['GDP'].loc[i])
    PAC1.append(data1['PAC'].loc[i])

# Positions of entries that have no feature rows. enumerate() is used because
# list.index() compares ndarrays with ==, which is ambiguous or raises.
zeros1 = [pos for pos, x in enumerate(X1) if x.shape[0] == 0]

# Store the kept feature blocks in a 1-D object array, filled element by
# element. np.array(X1) would raise on ragged input with recent NumPy, and on
# equal-shaped input it would build a 3-D array that np.delete then flattens.
kept1 = [x for x in X1 if x.shape[0] > 0]
X1 = np.empty(len(kept1), dtype=object)
for pos, x in enumerate(kept1):
    X1[pos] = x
y1 = np.delete(np.array(y1), zeros1)
PAC1 = np.delete(np.array(PAC1), zeros1)

In [None]:
### Search for the optimal parameters for each ablation condition separately.

# Example only: build the train (X, y, PAC) and validation (X1, y1, PAC1)
# feature sets with a PCA fitted on X at the chosen dimension.
pca_opt = 18 # sample
reduced_trainSet, reduced_validSet = pca_res_ablation_both(pca_opt, X, X1, y, y1, PAC, PAC1)

# Example only: take the 'mean' entry (the variant built without the
# per-component means); each entry unpacks to features, targets and PAC.
tX_mean, tY_mean, tPAC_mean = reduced_trainSet['mean']
vX_mean, vY_mean, vPAC_mean = reduced_validSet['mean']

In [None]:
# Example only: exhaustive grid search over a coarse parameter grid, scored by
# 4-fold cross-validated R^2 on the log-transformed target.
tree_params = {
    'objective': 'reg:linear',
    'booster': 'gbtree',
    'random_state': 42,
}
# NOTE(review): np.arange(0.1, 10, 3) yields eta values 0.1, 3.1, 6.1 and 9.1;
# the XGBoost docs give the range of eta as [0, 1] -- confirm the grid.
cv_params = {
    'eta': np.arange(0.1, 10, 3),
    'subsample': np.arange(0.5, 1, 0.2),
    'lambda': np.arange(1, 100, 30),
    'min_child_weight': np.arange(1, 100, 30),
}
model = xgb.XGBRegressor(**tree_params)
optimized_GBM = GridSearchCV(model, cv_params, scoring='r2', cv=4, verbose=1)
optimized_GBM.fit(tX_mean, np.log(tY_mean))

In [None]:
# Report the best parameter combination found by the grid search and its
# score. The printed labels are Chinese for "best parameter values" and
# "best model score".
print('参数的最佳取值：{0}'.format(optimized_GBM.best_params_))
print('最佳模型得分:{0}'.format(optimized_GBM.best_score_))

In [None]:
### iterate the tuning process

In [None]:
# Optimal parameters found in our experiments.

# Ablation variant: 'mean'
# PCA dimension used: 18
# NOTE(review): 'reg:linear' is the legacy objective name; newer XGBoost
# releases call it 'reg:squarederror' -- confirm against installed version.
opt_params_mean = {'objective': 'reg:linear', 'booster': 'gbtree', 'random_state': 42,
              'subsample': 0.7, 'min_child_weight': 30, 'eta': 0.07, 'lambda': 3.5}
opt_model_mean = xgb.XGBRegressor(**opt_params_mean)

In [None]:
# Fit the tuned model on the training 'mean' feature set with a
# log-transformed target.
opt_model_mean.fit(tX_mean, np.log(tY_mean))
# Predict on the validation 'mean' feature set; predictions are in log space.
y_pred_mean = opt_model_mean.predict(vX_mean)
# R^2 on the validation set, also measured in log space.
r2_score(np.log(vY_mean), y_pred_mean)