In [64]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

### 1. Read data

In [65]:
df = pd.read_csv('../../data/processed/canomical_data.csv')
df.head()

Unnamed: 0,p_categories,p_brand,p_day_created,p_sold_quantity,p_original_price,p_discount_rate
0,7,155,659.0,702,528000,49
1,84,198,974.0,12844,799000,46
2,84,192,1372.0,938,209000,0
3,7,155,593.0,10359,473000,50
4,15,145,529.0,2338,106000,16


### 2. Train test split
&#9889; This cell we choose columns X and y to split train test. Test set with size 20% and Train set with size 80% 

In [66]:
X = df.drop(['p_sold_quantity', 'p_brand', 'p_categories', 'p_original_price'], axis='columns')
y = testLabels = df.p_sold_quantity

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

#### Make polynomial regression by pipeline 

In [67]:
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(StandardScaler(), PolynomialFeatures(degree), LinearRegression(**kwargs))

#### Create paramater grid to use GridSearchCV

In [68]:
param_grid = {
   'polynomialfeatures__degree': [3,4,5], 
   'linearregression__fit_intercept': [True, False], 
   'linearregression__normalize': [True, False]
}

In [69]:
gr_search = GridSearchCV(PolynomialRegression(), 
                         param_grid, 
                         cv=20, 
                         scoring='neg_mean_squared_error', 
                         verbose=3) 

#### Fit grid search to find best score and best param 

In [70]:
gr_search.fit(X_train, y_train)

Fitting 20 folds for each of 12 candidates, totalling 240 fits
[CV 1/20] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=3;, score=-1708423.523 total time=   0.0s
[CV 2/20] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=3;, score=-1333553.509 total time=   0.0s
[CV 3/20] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=3;, score=-1643157.396 total time=   0.0s
[CV 4/20] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=3;, score=-8756005.515 total time=   0.0s
[CV 5/20] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=3;, score=-1347332.540 total time=   0.0s
[CV 6/20] END linearregression__fit_intercept=True, linearregression__normalize=True, polynomialfeatures__degree=3;, score=-528035.647 total time=   0.0s
[CV 7/20

GridSearchCV(cv=20,
             estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('polynomialfeatures',
                                        PolynomialFeatures()),
                                       ('linearregression',
                                        LinearRegression())]),
             param_grid={'linearregression__fit_intercept': [True, False],
                         'linearregression__normalize': [True, False],
                         'polynomialfeatures__degree': [3, 4, 5]},
             scoring='neg_mean_squared_error', verbose=3)

In [71]:
print('Best Score: %s' % gr_search.best_score_)
print('Best Hyperparameters: %s' % gr_search.best_params_)

Best Score: -2.7752241460480483e+19
Best Hyperparameters: {'linearregression__fit_intercept': True, 'linearregression__normalize': True, 'polynomialfeatures__degree': 3}


In [76]:
poly_grid = PolynomialRegression(degree=3, fit_intercept=True, normalize=True)

### 3. Train model

In [77]:
poly_grid.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('linearregression', LinearRegression(normalize=True))])

### 4. Predict with model

In [78]:
y_pred = poly_grid.predict(X_test)

### 5. Evaluate Linear Model with Poly nominal features

In [79]:
print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

MSE:  982155.1543818987
RMSE:  991.0374132099649
