In [17]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

### 1. Read data

In [18]:
df = pd.read_csv('../../data/processed/canomical_data.csv')
df.head()

Unnamed: 0,p_categories,p_brand,p_day_created,p_sold_quantity,p_original_price,p_discount_rate
0,7,155,659.0,702,528000,49
1,84,198,974.0,12844,799000,46
2,84,192,1372.0,938,209000,0
3,7,155,593.0,10359,473000,50
4,15,145,529.0,2338,106000,16


### 2. Train test split
&#9889; This cell we choose columns X and y to split train test. Test set with size 20% and Train set with size 80% 

In [19]:
X = df.drop('p_sold_quantity', axis='columns')
y = testLabels = df.p_sold_quantity

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

#### Make polynomial regression by pipeline 

In [20]:
def PolynomialRegression(degree=2, **kwargs):
    return make_pipeline(StandardScaler(), PolynomialFeatures(degree), LinearRegression(**kwargs))

#### Create paramater grid to use GridSearchCV

In [21]:
param_grid = {
   'polynomialfeatures__degree': [3,4,5], 
   'linearregression__fit_intercept': [True, False], 
   'linearregression__normalize': [True, False]
}

In [22]:
gr_search = GridSearchCV(PolynomialRegression(), 
                         param_grid, 
                         cv=10, 
                         scoring='neg_mean_squared_error', 
                         verbose=3) 

#### Fit grid search to find best score and best param 

In [None]:
gr_search.fit(X_train, y_train)

In [24]:
print('Best Score: %s' % gr_search.best_score_)
print('Best Hyperparameters: %s' % gr_search.best_params_)

Best Score: -3.080750700421297e+19
Best Hyperparameters: {'linearregression__fit_intercept': True, 'linearregression__normalize': False, 'polynomialfeatures__degree': 3}


In [25]:
poly_grid = PolynomialRegression(degree=3, fit_intercept=True, normalize=False)

### 3. Train model

In [26]:
poly_grid.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('linearregression', LinearRegression())])

### 4. Predict with model

In [27]:
y_pred = poly_grid.predict(X_test)

### 5. Evaluate Linear Model with Poly nominal features

In [28]:
print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

MSE:  2014186.2213632818
RMSE:  1419.2202864119727
