# Model Validation

In [1]:
# import the relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

In [2]:
# Both tables are tab-separated, UTF-8 encoded text files, so they share
# the same parser options.
tsv_read_options = dict(sep='\t', encoding='utf-8')

train_data_file = 'dataset/zhengqi_train.txt'
train_data = pd.read_csv(train_data_file, **tsv_read_options)

test_data_file = 'dataset/zhengqi_test.txt'
test_data = pd.read_csv(test_data_file, **tsv_read_options)

In [3]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

# Every column except the label is treated as a feature.
feature_columns = [col for col in train_data.columns if col not in ['target']]

# Fit the scaler on the training features only and apply that same fitted
# scaler to the test features. Fitting a second scaler on the test set would
# rescale it with its own per-column min/max, so the two sets would no longer
# share a common scale.
min_max_scaler = MinMaxScaler().fit(train_data[feature_columns])
train_data_scaler = min_max_scaler.transform(train_data[feature_columns])
test_data_scaler = min_max_scaler.transform(test_data[feature_columns])

# transform() returns plain arrays; wrap them back into labelled frames.
train_data_scaler = pd.DataFrame(train_data_scaler, columns=feature_columns)
test_data_scaler = pd.DataFrame(test_data_scaler, columns=feature_columns)

# Re-attach the (unscaled) label as the last column of the training frame.
train_data_scaler['target'] = train_data['target']

In [4]:
# PCA: reduce the scaled features to 16 principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=16)

# 'target' was appended as the last column of train_data_scaler, so it is
# sliced off before the projection is fitted.
train_components = pca.fit_transform(train_data_scaler.iloc[:, :-1])
new_train_pca_16 = pd.DataFrame(train_components)
new_train_pca_16['target'] = train_data_scaler['target']

# The test features are projected with the components fitted on the training set.
new_test_pca_16 = pd.DataFrame(pca.transform(test_data_scaler))

In [5]:
# Split Data: hold out 20% of the PCA-transformed training rows for validation
target = new_train_pca_16['target']
# Selecting by the test frame's columns keeps the component columns and
# leaves out 'target'.
train = new_train_pca_16[new_test_pca_16.columns]

# NOTE: train_data / test_data are rebound here; from this point on they hold
# the hold-out split, not the raw tables read from disk.
train_data, test_data, train_target, test_target = train_test_split(
    train,
    target,
    random_state=0,
    test_size=0.2,
)

In [6]:
# L2 Regularization
from sklearn.preprocessing import PolynomialFeatures

# Expand the inputs with polynomial terms up to degree 3.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Apply the expansion fitted on the training split; the validation split
# must never be used to (re)fit a preprocessing step.
test_data_poly = poly.transform(test_data)

# The penalty name must be lower case ('l2'): recent scikit-learn releases
# validate the string and reject 'L2'. random_state pins the sample shuffling
# done by SGD so the reported scores are reproducible across runs.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001, random_state=0)
clf.fit(train_data_poly, train_target)

# Compare the error on the fitting split with the error on the held-out split.
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGD Regression train MSE:", score_train)
print("SGD Regression test MSE:", score_test)

SGD Regression train MSE: 0.1343463751860323
SGD Regression test MSE: 0.14275454169114657


In [7]:
# L1 Regularization
from sklearn.preprocessing import PolynomialFeatures

# Expand the inputs with polynomial terms up to degree 3.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Apply the expansion fitted on the training split; the validation split
# must never be used to (re)fit a preprocessing step.
test_data_poly = poly.transform(test_data)

# The penalty name must be lower case ('l1'): recent scikit-learn releases
# validate the string and reject 'L1'. random_state pins the sample shuffling
# done by SGD so the reported scores are reproducible across runs.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='l1', alpha=0.00001, random_state=0)
clf.fit(train_data_poly, train_target)

# Compare the error on the fitting split with the error on the held-out split.
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGD Regression train MSE:", score_train)
print("SGD Regression test MSE:", score_test)

SGD Regression train MSE: 0.1342177604009307
SGD Regression test MSE: 0.14258693826199498


In [8]:
# ElasticNet L1 and L2
# Expand the inputs with polynomial terms up to degree 3.
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
# Apply the expansion fitted on the training split; the validation split
# must never be used to (re)fit a preprocessing step.
test_data_poly = poly.transform(test_data)

# Elastic net mixes both penalties; l1_ratio=0.9 puts most of the weight on L1.
# random_state pins the sample shuffling done by SGD so the reported scores
# are reproducible across runs.
clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='elasticnet', l1_ratio=0.9, alpha=0.00001, random_state=0)
clf.fit(train_data_poly, train_target)

# Compare the error on the fitting split with the error on the held-out split.
score_train = mean_squared_error(train_target, clf.predict(train_data_poly))
score_test = mean_squared_error(test_target, clf.predict(test_data_poly))
print("SGDRegressor train MSE:   ", score_train)
print("SGDRegressor test MSE:   ", score_test)

SGDRegressor train MSE:    0.13415067586854265
SGDRegressor test MSE:    0.14260574057392084


In [9]:
# K-fold Cross Validation
from sklearn.model_selection import KFold

# Five consecutive (unshuffled) folds over the PCA-transformed training rows.
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional row indices, so features and labels are both
    # selected by position (.values / .iloc) rather than by index label.
    train_data, test_data = train.values[train_index], train.values[test_index]
    train_target, test_target = target.iloc[train_index], target.iloc[test_index]

    # A fresh model is fitted per fold; random_state pins the sample
    # shuffling done by SGD so the per-fold scores are reproducible.
    clf = SGDRegressor(max_iter=1000, tol=1e-3, penalty='elasticnet', l1_ratio=0.9, alpha=0.00001, random_state=0)
    clf.fit(train_data, train_target)

    score_train = mean_squared_error(train_target, clf.predict(train_data))
    score_test = mean_squared_error(test_target, clf.predict(test_data))
    print(k, " fold", "SGDRegressor train MSE:   ", score_train)
    print(k, " fold", "SGDRegressor test MSE:   ", score_test, '\n')

0  fold SGDRegressor train MSE:    0.15009384590318076
0  fold SGDRegressor test MSE:    0.10663147818020594 

1  fold SGDRegressor train MSE:    0.13353413554386162
1  fold SGDRegressor test MSE:    0.18187494094104797 

2  fold SGDRegressor train MSE:    0.14715729594990137
2  fold SGDRegressor test MSE:    0.13289857051633358 

3  fold SGDRegressor train MSE:    0.1405984582954384
3  fold SGDRegressor test MSE:    0.1621114738344185 

4  fold SGDRegressor train MSE:    0.13813393804556995
4  fold SGDRegressor test MSE:    0.16466435947630104 

