In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error as MSE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
import pickle

# Import the train data

In [2]:
X_train = pd.read_csv("A_Normalised_train_data.csv",index_col=0)
Y_train = pd.read_csv("A_Log_sales_train.csv",index_col=0)

# Setup Pipeline

In [3]:
poly = PolynomialFeatures()
pca=PCA()
SLR = LinearRegression()

In [4]:
# Step 1: Polynomial transform
# Step 2: PCA
# Step 3: Model
pipe = Pipeline(steps=[("poly",poly),("pca",pca),("SLR",SLR)])

In [5]:
param_grid = {
    "pca__n_components": [5, 10, 30, 45, 60, 100 ,150 ,170],
    "poly__degree":[1,2]
}

# Setup GridSearch

In [6]:
score_metric = 'neg_mean_squared_error'
search = GridSearchCV(pipe,param_grid,
                      n_jobs=-1,
                      cv=10,
                      scoring=score_metric)

In [7]:
search.fit(X_train,Y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('pca', PCA()),
                                       ('SLR', LinearRegression())]),
             n_jobs=-1,
             param_grid={'pca__n_components': [5, 10, 30, 45, 60, 100, 150,
                                               170],
                         'poly__degree': [1, 2]},
             scoring='neg_mean_squared_error')

In [8]:
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=-0.788):
{'pca__n_components': 100, 'poly__degree': 1}


In [9]:
df = pd.DataFrame(search.cv_results_)
df = df.sort_values(by=['rank_test_score'])
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_poly__degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
10,0.225918,0.0362,0.01057,0.009892,100,1,"{'pca__n_components': 100, 'poly__degree': 1}",-0.729526,-0.69735,-0.9006267,-0.815396,-0.9327854,-0.7305027,-0.661766,-0.7791937,-0.835691,-0.7998152,-0.7882653,0.08232052,1
8,0.163107,0.025236,0.017836,0.007223,60,1,"{'pca__n_components': 60, 'poly__degree': 1}",-0.795261,-0.777727,-0.9072554,-0.849808,-0.9695119,-0.7225244,-0.711688,-0.8200532,-0.834522,-0.8469723,-0.8235323,0.07436494,2
6,0.148209,0.029756,0.014943,0.007104,45,1,"{'pca__n_components': 45, 'poly__degree': 1}",-0.777245,-0.805396,-0.9771198,-0.838812,-1.000214,-0.7201219,-0.727422,-0.8712747,-0.904705,-0.9637198,-0.8586031,0.09677952,3
13,13.746228,0.427282,0.216366,0.049057,150,2,"{'pca__n_components': 150, 'poly__degree': 2}",-0.69898,-0.700032,-1.91843,-0.842423,-0.9682042,-0.7423126,-0.65944,-0.7071209,-0.848698,-0.8387807,-0.8924421,0.3537385,4
15,12.368141,1.768432,0.109383,0.031487,170,2,"{'pca__n_components': 170, 'poly__degree': 2}",-0.667494,-0.657039,-2.325401,-0.745082,-0.9383048,-0.709072,-0.653069,-0.6795051,-0.843483,-0.8014074,-0.9019858,0.4826225,5
4,0.121827,0.016889,0.01416,0.004729,30,1,"{'pca__n_components': 30, 'poly__degree': 1}",-0.879726,-0.919351,-1.044104,-0.984815,-1.095151,-0.8146579,-0.820627,-0.943265,-0.968395,-1.003846,-0.9473938,0.08676641,6
11,10.662464,0.319814,0.19309,0.029731,100,2,"{'pca__n_components': 100, 'poly__degree': 2}",-0.901719,-0.874478,-1.014357,-1.000214,-1.163222,-0.872643,-0.835464,-0.9040029,-1.001529,-1.026402,-0.9594031,0.09433365,7
2,0.077698,0.00965,0.008162,0.009066,10,1,"{'pca__n_components': 10, 'poly__degree': 1}",-0.952411,-0.984485,-1.114687,-1.051754,-1.155512,-0.7941115,-0.82812,-0.9412005,-0.97644,-0.9983916,-0.9797113,0.1069357,8
9,8.008291,0.306334,0.149911,0.02034,60,2,"{'pca__n_components': 60, 'poly__degree': 2}",-1.085701,-1.043954,-1.24172,-1.228472,-1.339021,-0.9439355,-1.062267,-1.132183,-1.136997,-1.26701,-1.148126,0.1140094,9
0,0.080007,0.00474,0.00963,0.004875,5,1,"{'pca__n_components': 5, 'poly__degree': 1}",-1.16339,-1.145859,-1.24788,-1.463938,-1.468539,-0.9638166,-1.176606,-1.112895,-1.196551,-1.095377,-1.203485,0.1493633,10


# Best Hyperparameter

In [10]:
parms = pd.DataFrame(df['params'])
for i in parms.iloc[0]:
    print(i)

{'pca__n_components': 100, 'poly__degree': 1}


# # Evaluate Model $log(Sales)$

In [11]:
best_pca =100
best_degree=1

In [12]:
X_train1,X_test1,Y_train1,Y_test1 = train_test_split(X_train,Y_train,test_size=0.2,random_state=132)

In [13]:
poly1 = PolynomialFeatures(degree=1)

In [14]:
pca1 = PCA(n_components=100)
X_train1 = poly1.fit_transform(X_train1)
pca1.fit(X_train1)
PX_train1 = pca1.transform(X_train1)
SLR1 = LinearRegression()
SLR1.fit(PX_train1,Y_train1)

LinearRegression()

In [15]:
pca2 = PCA(n_components=100)
X_test1=poly1.fit_transform(X_test1)
pca2.fit(X_test1)
PX_test1 = pca2.transform(X_test1)

In [16]:
MSE(SLR1.predict(PX_test1),Y_test1)

2.1048112303464

In [27]:
Y_test1['sales']

2085    4.143135
222     3.332205
236     3.401197
15      0.000000
2069    3.433987
          ...   
1409    1.098612
397     1.609438
2009    3.332205
233     3.367296
1408    2.302585
Name: sales, Length: 463, dtype: float64

In [32]:
result = pd.DataFrame(np.exp(Y_test1))

In [34]:
result["Predicted_sales"] = np.exp(SLR1.predict(PX_test1))

In [35]:
MSE(result["sales"],result["Predicted_sales"])

1242.7797407888436