In [1]:
import pandas as pd

# Relative path: the CSV is looked up in the notebook's working directory.
cleaned_csv_path = "train_model_cleaned.csv"
train_model = pd.read_csv(cleaned_csv_path)

In [8]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
#imports
import numpy as np
import pandas as pd
import math

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline


In [4]:
# Remove the 'Unnamed: 0' column from the loaded frame.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
train_model = train_model.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Target is the 'Sales' column; every other column is used as a predictor.
y = train_model['Sales']
X = train_model.drop(columns=['Sales'])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Baseline: ordinary least squares on the unscaled features.
lr_model = LinearRegression().fit(X_train, y_train)

# R^2 on the fitting data and on the held-out split.
print(f'Training score: {lr_model.score(X_train, y_train)}')
print(f'Test score: {lr_model.score(X_test, y_test)}')

# Root-mean-squared error of the held-out predictions.
y_pred = lr_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)

print(f'RMSE: {rmse}')

Training score: 0.5489182057137239
Test score: 0.5482421567042175
RMSE: 2567.6737275373325


In [9]:
# Standardise the features, expand them to degree-2 polynomial terms,
# then fit an unregularised linear regression on the expanded set.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', LinearRegression()),
]

pipeline = Pipeline(steps).fit(X_train, y_train)

# R^2 on the fitting data and on the held-out split.
print(f'Training score: {pipeline.score(X_train, y_train)}')
print(f'Test score: {pipeline.score(X_test, y_test)}')


Training score: 0.5877354654434662
Test score: 0.5873613345118975


## **L2 Regularization or Ridge Regression**

*L2 regularization (ridge) shrinks the coefficients of all variables toward zero, but it does not remove any variable from the model*

*L2 regularization is used when all the variables should be kept in the model*

In [10]:
# Same preprocessing as the polynomial pipeline, with an L2-penalised
# (ridge) linear model as the final estimator.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Ridge(fit_intercept=True, alpha=10)),
]

ridge_pipe = Pipeline(steps).fit(X_train, y_train)

# R^2 on the fitting data and on the held-out split.
print(f'Training Score: {ridge_pipe.score(X_train, y_train)}')
print(f'Test Score: {ridge_pipe.score(X_test, y_test)}')


Training Score: 0.587735448530829
Test Score: 0.5873613220942593


## **L1 Regularization or Lasso Regression**

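*L1 regularization (lasso) can shrink the coefficients of the least significant variables exactly to zero, which effectively removes them from the model*

*L1 regularization is used when not all the variables need to be kept, because it performs feature selection as part of the fit*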

In [11]:
# Same preprocessing as the polynomial pipeline, with an L1-penalised
# (lasso) linear model as the final estimator.
steps = [
    ('scalar', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2)),
    ('model', Lasso(fit_intercept=True, alpha=0.3)),
]

lasso_pipe = Pipeline(steps).fit(X_train, y_train)

# R^2 on the fitting data and on the held-out split.
print(f'Training score: {lasso_pipe.score(X_train, y_train)}')
print(f'Test score: {lasso_pipe.score(X_test, y_test)}')

Training score: 0.5877274240517565
Test score: 0.5873380284194831


**Non-regularized regression**

*Dropping all the least significant variables from the train_model dataset*

In [15]:
# Columns removed before refitting the plain linear model.
dropped_columns = [
    'Assortment',
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2',
    'Promo2SinceYear',
    'Promo2SinceWeek',
    'PromoInterval',
]
train_model = train_model.drop(columns=dropped_columns)

In [16]:
from sklearn.model_selection import train_test_split

# Rebuild predictors and target from the reduced frame.
target = train_model['Sales']
features = train_model.drop(columns=['Sales'])

# Same 70/30 split and seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    random_state=42,
    test_size=0.3,
)

In [17]:
# Report the dimensions of each split, in the order: X_train, X_test, y_train, y_test.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(704493, 9)
(301926, 9)
(704493,)
(301926,)


In [20]:
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


# Scaling pipeline: standardise the features, then fit ordinary least squares.
pipeline_order = [('scaler', StandardScaler()), ('linear_reg', linear_model.LinearRegression())]

Model_Pipeline = Pipeline(pipeline_order)

# Evaluate the pipeline with 3-fold cross-validation on the training split.
# KFold only consumes random_state when shuffle=True; passing a seed together
# with the default shuffle=False raises ValueError in newer scikit-learn
# releases. The original call never shuffled, so dropping the unused seed keeps
# the same contiguous folds and therefore the same scores.
kfold = KFold(n_splits=3)
results = cross_val_score(Model_Pipeline, X_train, y_train, cv=kfold, scoring='r2')

# Refit on the full training split before predicting on both splits.
Model_Pipeline.fit(X_train, y_train)
preds_train = Model_Pipeline.predict(X_train)
preds = Model_Pipeline.predict(X_test)

# The "Train R^2" line reports the mean and standard deviation of the
# cross-validated R^2 scores, not the R^2 of the refitted model.
print("Train R^2:",round(results.mean(),3), round(results.std(),3))
# NOTE(review): calAIC / calBIC are not defined in the visible notebook, so the
# AIC/BIC lines below stay disabled.
#print("Train AIC, BIC :",round(calAIC(y_train,preds_train,len(X_train.columns)),3),",", round(calBIC(y_train,preds_train,len(X_train.columns))))
print("-----------------------------------------------")
print("Test R^2:",round(r2_score(y_test, preds),3))
#print("Test AIC, BIC :",round(calAIC(y_test,preds,len(X_test.columns)),3),",", round(calBIC(y_test,preds,len(X_test.columns))))

Train R^2: 0.529 0.0
-----------------------------------------------
Test R^2: 0.528


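*The train and test scores are nearly the same with and without regularization: the unregularized polynomial pipeline, the Ridge pipeline and the Lasso pipeline all reach R² ≈ 0.587. The linear model fitted after dropping the least significant variables scores slightly lower (R² ≈ 0.53) than the linear model fitted on all variables (R² ≈ 0.55).*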

## Conclusion

 **The linear regression model reaches an R² of about 0.55 (54.8% on the test set). To increase this score I used the random forest algorithm. After applying random forest, the score increased to (value missing — TODO: fill in). The accuracy of the logistic regression model is 74%. We found multicollinearity in one variable, i.e., "Promo2". There is no meaningful difference between the regularized and non-regularized regression results.** 

## Contributions Statement

*By own : 80%*

*External Source : 20%*

## Citations

https://www.kaggle.com/elenapetrova/time-series-analysis-and-forecasts-with-prophet

https://www.statisticssolutions.com/assumptions-of-logistic-regression/

https://www.statisticssolutions.com/assumptions-of-linear-regression/
    
https://www.python-course.eu/confusion_matrix.php
    
https://towardsdatascience.com/l1-and-l2-regularization-methods-ce25e7fc831c
    
https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
    
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
    
https://towardsdatascience.com/linear-regression-using-python-ce21aa90ade6
    
https://www.statisticssolutions.com/what-is-logistic-regression/
    
https://www.medcalc.org/manual/logistic_regression.php
    
https://www.statisticssolutions.com/multicollinearity/

https://newonlinecourses.science.psu.edu/stat501/node/343/

## License

Copyright 2019 Harshitha Sanikommu

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.