In [1]:
import itertools
import os
import re

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

## Function Definition

In [2]:
def train_pca(X, components = 3):
    """Fit a standardise-then-PCA reducer on the stacked feature rows.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature rows used to fit both the scaler and the PCA.
    components : int, default 3
        Number of principal components to keep.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Fitted pipeline with the steps ``'scaler'`` (StandardScaler) and
        ``'pca'`` (PCA). Calling ``.transform(rows)`` standardises ``rows``
        with the statistics learned here and then projects them, so new data
        goes through exactly the preprocessing the PCA was fitted on. The bare
        PCA object is available as ``reducer.named_steps['pca']``.
    """
    # The scaler has to travel with the PCA: a PCA fitted on standardised
    # data must never be applied to raw, unscaled rows.
    reducer = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components = components)),
    ])
    reducer.fit(X)
    return reducer

In [3]:
def dimensional_reduction(X, pca):
    """Project every group in ``X`` with ``pca`` and summarise it as one row.

    For each element of ``X`` (a 2-D array of feature rows) the projected
    components are condensed into a single vector laid out as:

    * the per-component mean,
    * the per-component standard deviation,
    * the number of rows in the group,
    * the Pearson correlation of every pair of components, in
      ``itertools.combinations`` order.

    Parameters
    ----------
    X : iterable of 2-D arrays
        Groups of feature rows accepted by ``pca.transform``.
    pca : object exposing ``transform``
        Fitted reducer returning a 2-D array per group.

    Returns
    -------
    numpy.ndarray
        One summary row per group. Rows can contain NaN when a correlation
        is undefined (e.g. a component with zero variance in that group).
    """
    feature_rows = []
    for group in X:
        projected = pca.transform(group)
        n_components = projected.shape[1]

        # Pairwise correlations between the projected components.
        pair_corrs = [
            np.corrcoef(projected[:, a], projected[:, b])[0, 1]
            for a, b in itertools.combinations(range(n_components), 2)
        ]

        feature_rows.append(np.concatenate([
            np.mean(projected, axis = 0),
            np.std(projected, axis = 0),
            [len(group)],
            pair_corrs,
        ]))

    return np.array(feature_rows)

In [4]:
def pca_res(dimension, X, X1, y, y1, PAC, PAC1):
    """Reduce the training and validation groups and drop unusable samples.

    A reducer with ``dimension`` components is fitted via ``train_pca`` on all
    training rows stacked together. Both ``X`` and ``X1`` are then summarised
    with ``dimensional_reduction``. Any sample whose summary row contains a
    NaN is removed, together with its entries in the matching label arrays.

    Parameters
    ----------
    dimension : int
        Number of PCA components.
    X, X1 : sequences of 2-D arrays
        Training and validation groups of feature rows.
    y, y1 : array-like
        Targets aligned with ``X`` and ``X1``.
    PAC, PAC1 : array-like
        Additional per-sample values aligned with ``X`` and ``X1``.

    Returns
    -------
    tuple
        ``(reduced_trainX, reduced_validX, yt, y1v, PACt, PAC1v)`` with the
        NaN-containing samples removed from every element.
    """
    # Fit the reducer on every training row at once.
    pca = train_pca(np.concatenate(X), components = dimension)

    # Project each group and extract its summary features.
    train_feats = dimensional_reduction(X, pca)
    valid_feats = dimensional_reduction(X1, pca)

    # Flag the samples whose feature row contains at least one NaN.
    train_has_nan = np.isnan(train_feats).any(axis=1)
    valid_has_nan = np.isnan(valid_feats).any(axis=1)
    train_drop = np.flatnonzero(train_has_nan)
    valid_drop = np.flatnonzero(valid_has_nan)

    # Remove the flagged samples from the labels ...
    yt = np.delete(y, train_drop)
    y1v = np.delete(y1, valid_drop)
    PACt = np.delete(PAC, train_drop)
    PAC1v = np.delete(PAC1, valid_drop)

    # ... and from the feature matrices.
    reduced_trainX = train_feats[~train_has_nan, :]
    reduced_validX = valid_feats[~valid_has_nan, :]

    return reduced_trainX, reduced_validX, yt, y1v, PACt, PAC1v

In [5]:
def regr_r2(model, tX, tY, vX, vY):
    """Fit ``model`` on ``(tX, tY)`` and return its R^2 score on ``(vX, vY)``.

    ``model`` is fitted in place; it must expose ``fit`` and ``predict``.
    """
    model.fit(tX, tY)
    predictions = model.predict(vX)
    score = r2_score(vY, predictions)
    return score

## Read in Data

In [None]:
# Load the raw 2017 data: the feature file is joined with the GDP / PAC labels
# on the columns the two files share.
data = pd.read_csv('Data/2017_features.csv')
data = data.dropna()
label = pd.read_csv('Data/PAC_GDP17.csv')
data = pd.merge(data, label, how='left')

X = []
y = []
PAC = []
for i in data.index:
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    # 'features' is a bracketed, comma-separated list of floats stored as a
    # string; strip the brackets/separators and regroup into rows of 4096.
    x_i = [float(x) for x in re.split(r', |\[|\]', data.loc[i, 'features']) if len(x) > 0]
    x_i = np.array(x_i).reshape(-1, 4096)
    X.append(x_i)
    y.append(data.loc[i, 'GDP'])
    PAC.append(data.loc[i, 'PAC'])

# Positions of the samples that ended up with no feature rows at all.
# enumerate() is used instead of list.index(), which would compare whole
# arrays with == and cannot tell duplicate entries apart.
zeros = [pos for pos, group in enumerate(X) if group.shape[0] == 0]
dropped = set(zeros)
kept_groups = [group for pos, group in enumerate(X) if pos not in dropped]

# Keep X as a 1-D object array holding one 2-D array per sample. np.array(X)
# must not be used here: it raises on ragged input in recent NumPy and builds
# a 3-D numeric array when every group happens to have the same shape.
X = np.empty(len(kept_groups), dtype=object)
for slot, group in enumerate(kept_groups):
    X[slot] = group
y = np.delete(np.array(y), zeros)
PAC = np.delete(np.array(PAC), zeros)

In [None]:
# Load the 2018 data: the feature file is joined with the GDP / PAC labels
# on the columns the two files share.
data1 = pd.read_csv('Data/2018_features.csv')
data1 = data1.dropna()
label1 = pd.read_csv('Data/PAC_GDP18.csv')
data1 = pd.merge(data1, label1, how='left')

X1 = []
y1 = []
PAC1 = []
for i in data1.index:
    if i % 100 == 0:
        print(i)  # coarse progress indicator
    # 'features' is a bracketed, comma-separated list of floats stored as a
    # string; strip the brackets/separators and regroup into rows of 4096.
    x_i = [float(x) for x in re.split(r', |\[|\]', data1.loc[i, 'features']) if len(x) > 0]
    x_i = np.array(x_i).reshape(-1, 4096)
    X1.append(x_i)
    y1.append(data1.loc[i, 'GDP'])
    PAC1.append(data1.loc[i, 'PAC'])

# Positions of the samples that ended up with no feature rows at all.
# enumerate() is used instead of list.index(), which would compare whole
# arrays with == and cannot tell duplicate entries apart.
zeros1 = [pos for pos, group in enumerate(X1) if group.shape[0] == 0]
dropped1 = set(zeros1)
kept_groups1 = [group for pos, group in enumerate(X1) if pos not in dropped1]

# Keep X1 as a 1-D object array holding one 2-D array per sample. np.array(X1)
# must not be used here: it raises on ragged input in recent NumPy and builds
# a 3-D numeric array when every group happens to have the same shape.
X1 = np.empty(len(kept_groups1), dtype=object)
for slot, group in enumerate(kept_groups1):
    X1[slot] = group
y1 = np.delete(np.array(y1), zeros1)
PAC1 = np.delete(np.array(PAC1), zeros1)

In [None]:
# Validation R^2 scores, one entry per evaluated PCA dimension.
# The plain name holds scores on the reduced features; the "2" variant holds
# scores on the polynomial-expanded features.
ridgel, ridgel2 = [], []    # Ridge
lassol, lassol2 = [], []    # Lasso
rfl, rfl2 = [], []          # random forest
gbll, gbll2 = [], []        # XGBoost, gblinear booster
gbtl, gbtl2 = [], []        # XGBoost, gbtree booster

In [None]:
# PCA dimensionalities to evaluate: 3 through 25 inclusive.
pca_range = list(range(3, 26))

# Empty the score lists so that re-running this cell starts from scratch
# instead of appending a second set of results (which would no longer line
# up with pca_range when the results table is built).
for scores in (ridgel, ridgel2, lassol, lassol2, rfl, rfl2, gbll, gbll2, gbtl, gbtl2):
    scores.clear()

# XGBoost settings are the same for every PCA dimension, so build them once.
# NOTE(review): 'reg:linear' is the legacy name of the squared-error objective;
# newer XGBoost releases call it 'reg:squarederror' - confirm against the
# installed version before changing it.
linear_params_opt = {'objective': 'reg:linear', 'booster': 'gblinear', 'random_state': 42,
                     'updater': 'shotgun', 'feature_selector': 'cyclic', 'alpha': 0, 'lambda': 99}
tree_params_opt = {'objective': 'reg:linear', 'booster': 'gbtree', 'random_state': 42,
                   'subsample': 0.85, 'eta': 0.09, 'lambda': 0.1, 'min_child_weight': 75}

# (label, factory for a fresh estimator, scores on reduced features,
#  scores on polynomial features)
model_specs = [
    ('ridge', Ridge, ridgel, ridgel2),
    ('lasso', Lasso, lassol, lassol2),
    # Seeded so that the random-forest scores are reproducible between runs.
    ('random forest', lambda: RandomForestRegressor(n_estimators=10, random_state=42), rfl, rfl2),
    ('gblinear', lambda: xgb.XGBRegressor(**linear_params_opt), gbll, gbll2),
    ('gbtree', lambda: xgb.XGBRegressor(**tree_params_opt), gbtl, gbtl2),
]

for d in pca_range:
    ### Dimensionality reduction
    print('=======', d, '=======')
    reduced_trainX, reduced_validX, yt, y1v, PACt, PAC1v = pca_res(d, X, X1, y, y1, PAC, PAC1)

    ### Feature construction: default (degree-2) polynomial expansion.
    # The expansion is fitted on the training features only and then applied
    # unchanged to the validation features.
    poly = PolynomialFeatures(interaction_only = False, include_bias = False)
    tX_interact = poly.fit_transform(reduced_trainX)
    vX_interact = poly.transform(reduced_validX)

    # Every model is trained and scored on the log of the target.
    log_yt = np.log(yt)
    log_y1v = np.log(y1v)

    for model_label, make_model, plain_scores, interact_scores in model_specs:
        plain_scores.append(regr_r2(make_model(), reduced_trainX, log_yt, reduced_validX, log_y1v))
        interact_scores.append(regr_r2(make_model(), tX_interact, log_yt, vX_interact, log_y1v))
        print(model_label + ':', plain_scores[-1], interact_scores[-1])

In [None]:
# Store the results: one row per PCA dimension, one column per model variant
# (the '_i' columns are the scores on the polynomial-expanded features).
regr_df = pd.DataFrame({'PCA': pca_range,
                       'ridge': ridgel,
                       'ridge_i': ridgel2,
                       'lasso': lassol,
                       'lasso_i': lassol2,
                       'rf': rfl,
                       'rf_i': rfl2,
                       'gblinear': gbll,
                       'gblinear_i': gbll2,
                       'gbtree': gbtl,
                       'gbtree_i': gbtl2})

# Create the output directory first so that a missing folder cannot make the
# write fail after the long model-fitting run.
result_path = 'Result/regr3-25_aug.csv'
os.makedirs(os.path.dirname(result_path), exist_ok=True)
regr_df.to_csv(result_path, index = False, header = True)
print(regr_df)

In [None]:
# Show the results table as this cell's output.
regr_df