In [1]:
import pandas as pd
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading Data set
sales = pd.read_csv('./sales.csv')
sales = sales.drop(['Unnamed: 0'],axis=1)
sales

Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,366,4,2013-04-18,517,1,0,0,0,4422
1,394,6,2015-04-11,694,1,0,0,0,8297
2,807,4,2013-08-29,970,1,1,0,0,9729
3,802,2,2013-05-28,473,1,1,0,0,6513
4,726,4,2013-10-10,1068,1,1,0,0,10882
...,...,...,...,...,...,...,...,...,...
640835,409,6,2013-10-26,483,1,0,0,0,4553
640836,97,1,2014-04-14,987,1,1,0,0,12307
640837,987,1,2014-07-07,925,1,0,0,0,6800
640838,1084,4,2014-06-12,725,1,0,0,0,5344


In [4]:
# Checking data types : only two objects can convert them into numericals
sales.dtypes

store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object

In [5]:
#  checking for null values : seems no null values 
sales.isnull().sum()

store_ID               0
day_of_week            0
date                   0
nb_customers_on_day    0
open                   0
promotion              0
state_holiday          0
school_holiday         0
sales                  0
dtype: int64

## Data cleaning

In [6]:
# converting object i.e date column
def to_datetime(dataframe, column):

    dataframe[column] = pd.to_datetime(dataframe[column])

    dataframe['year'] = dataframe[column].dt.year
    dataframe['month'] = dataframe[column].dt.month
    dataframe['day'] = dataframe[column].dt.day
    
    dataframe = dataframe.drop(column, axis = 1)

    return dataframe

sales = to_datetime(sales, 'date')

In [7]:
# combining the values to 0 and 1 completely to numeric
def reduce_state_holiday(x):

    if x == 'a':
        return 1

    elif x == 'b':
        return 1

    elif x == 'c':
        return 1

    else:
        return int(x)

sales['state_holiday'] = sales['state_holiday'].apply(reduce_state_holiday)

In [19]:
sales.head()

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,year,month,day
0,366,4,517,1,0,0,0,4422,2013,4,18
1,394,6,694,1,0,0,0,8297,2015,4,11
2,807,4,970,1,1,0,0,9729,2013,8,29
3,802,2,473,1,1,0,0,6513,2013,5,28
4,726,4,1068,1,1,0,0,10882,2013,10,10


In [9]:
sales['state_holiday'].value_counts()

0    621160
1     19680
Name: state_holiday, dtype: int64

In [20]:
# checking how the sales were in holidays 
avg_sales = sales[(sales['state_holiday'] == 1) & (sales['sales'] > 0)]

In [22]:
avg_sales1 = sales[(sales['state_holiday'] == 0) & (sales['sales'] > 0)]

## X-y Split

In [12]:
y = sales['sales']
X = sales.drop(['sales'], axis = 1)

In [13]:
# Creating the train and the test.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [14]:
# Minmax scaler
from sklearn.preprocessing import MinMaxScaler
transformer = MinMaxScaler().fit(X_train)
X_normalized_tr = transformer.transform(X_train)
X_train_normalized = pd.DataFrame(X_normalized_tr, columns=X_train.columns)

transformer = MinMaxScaler().fit(X_test)
X_normalized_te = transformer.transform(X_test)
X_test_normalized = pd.DataFrame(X_normalized_te, columns=X_test.columns)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

## Applying models

In [17]:
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import mean_squared_error
# import math

# from sklearn.tree import DecisionTreeRegressor
# model1 = DecisionTreeRegressor(max_depth = None,
#                                 criterion = 'mse',
#                                 min_samples_split = 10,
#                                 min_samples_leaf = 10)

# from sklearn.linear_model import LinearRegression
# model2 = LinearRegression()
# from sklearn.neighbors import KNeighborsRegressor
# model3 = KNeighborsRegressor()
# from sklearn.ensemble import RandomForestRegressor
# model4 = RandomForestRegressor(max_depth = None,
#                                 criterion = 'mse',
#                                 min_samples_split = 10,
#                                 min_samples_leaf = 10)


# model_pipeline = [model1, model2,model3,model4]

# train_score = []
# test_score = []
# cross_scores = []
# mse = []
# rmse = []

# for model in model_pipeline:
#     model.fit(X_train, y_train)
#     train_score.append(model.score(X_train, y_train))
#     test_score.append(model.score(X_test, y_test))
#     cross_scores.append(np.mean(cross_val_score(model, X_train, y_train, cv=5)))
#     mse_calculated = mean_squared_error(y_test,model.predict(X_test))
#     mse.append(mse_calculated)
#     rmse.append(math.sqrt(mse_calculated))
# summary = {'Train Score':train_score,
#            'Test Score':test_score,
#            'Cross Score':cross_scores,
#             'mse':mse,
#           'rmse':rmse,}
# summary = pd.DataFrame(summary).T
# summary.columns = 'DecisionTree', 'LinearRegression', 'KNN', 'RandomForestRegressor'

In [18]:
# summary

Unnamed: 0,DecisionTree,LinearRegression,KNN,RandomForestRegressor
Train Score,0.9511315,0.8546515,0.9216,0.951987
Test Score,0.9217799,0.8538765,0.8805834,0.933985
Cross Score,0.9142628,0.8546433,0.8760087,0.928309
mse,1155207.0,2158049.0,1763624.0,974956.159662
rmse,1074.805,1469.03,1328.015,987.398683


In [64]:

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import math

from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor(max_depth = None,
                                criterion = 'mse',
                                min_samples_split = 10,
                                min_samples_leaf = 10)

from sklearn.linear_model import LinearRegression
model2 = LinearRegression()
from sklearn.neighbors import KNeighborsRegressor
model3 = KNeighborsRegressor()
from sklearn.ensemble import RandomForestRegressor
model4 = RandomForestRegressor(max_depth = None,
                                criterion = 'mse',
                                min_samples_split = 10,
                                min_samples_leaf = 10)


model_pipeline = [model1, model2,model3,model4]

train_score = []
test_score = []
cross_scores = []
mse = []
rmse = []

for model in model_pipeline:
    model.fit(X_train_normalized, y_train)
    train_score.append(model.score(X_train_normalized, y_train))
    test_score.append(model.score(X_test_normalized, y_test))
    cross_scores.append(np.mean(cross_val_score(model, X_train_normalized, y_train, cv=5)))
    mse_calculated = mean_squared_error(y_test,model.predict(X_test))
    mse.append(mse_calculated)
    rmse.append(math.sqrt(mse_calculated))
summary = {'Train Score':train_score,
           'Test Score':test_score,
           'Cross Score':cross_scores,
            'mse':mse,
          'rmse':rmse,}
summary = pd.DataFrame(summary).T
summary.columns = 'DecisionTree', 'LinearRegression', 'KNN', 'RandomForestRegressor'

In [66]:
summary

Unnamed: 0,DecisionTree,LinearRegression,KNN,RandomForestRegressor
Train Score,0.9511315,0.8546515,0.902529,0.9519631
Test Score,0.9057183,0.8474445,0.8511175,0.9219493
Cross Score,0.9142577,0.8546433,0.8465275,0.9283022
mse,663002100.0,703453200000000.0,452131300.0,584296800.0
rmse,25748.83,26522690.0,21263.38,24172.23


## optimization

In [26]:
# Looking for the best parameters for dession tree and random forrest regression 
max_depth_check= [3,10,None]
criterion_check = ['mse']
min_samples_split_check = [2,10]
min_samples_leaf_check = [2,10]   

grid = {'max_depth': max_depth_check,
        'criterion': criterion_check,
        'min_samples_split': min_samples_split_check,
        'min_samples_leaf': min_samples_leaf_check}
# https://www.projectpro.io/recipes/create-and-optimize-baseline-decision-tree-model-for-regression

In [27]:
# Decision Tree Regressor
from sklearn.model_selection import GridSearchCV
model = DecisionTreeRegressor()
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = 5)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse'], 'max_depth': [3, 10, None],
                         'min_samples_leaf': [2, 10],
                         'min_samples_split': [2, 10]})

In [28]:
grid_search.best_params_
# These were the best parameters for the decision tree

{'criterion': 'mse',
 'max_depth': None,
 'min_samples_leaf': 10,
 'min_samples_split': 10}

## Validations

In [67]:
# Loading data sets
sales_validation = pd.read_csv('./validation_for_students.csv')
sales_validation.head()

Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday
0,7,764,4,2013-12-26,0,0,0,c,1
1,19,22,3,2013-05-22,449,1,0,0,1
2,31,1087,6,2013-06-29,622,1,0,0,0
3,45,139,6,2013-08-17,314,1,0,0,0
4,56,568,1,2014-04-07,356,1,0,0,0


In [68]:
X = sales_validation.drop(['True_index'],axis=1)

In [69]:
# X['date'] = list(map(lambda x: x.replace('/','-'),X['date']))
def to_datetime(dataframe, column):

    dataframe[column] = pd.to_datetime(dataframe[column])

    dataframe['year'] = dataframe[column].dt.year
    dataframe['month'] = dataframe[column].dt.month
    dataframe['day'] = dataframe[column].dt.day
    
    dataframe = dataframe.drop(column, axis = 1)

    return dataframe

X = to_datetime(X, 'Date')

In [70]:
def reduce_state_holiday(x):

    if x == 'a':
        return 1

    elif x == 'b':
        return 1

    elif x == 'c':
        return 1

    else:
        return int(x)

X['State_holiday'] = X['State_holiday'].apply(reduce_state_holiday)

In [71]:
# DecisionTreeRegressor
predicted_sales1 = pd.Series(model1.predict(X),name='predicted_sales')

In [72]:
# LinearRegression
predicted_sales2 = pd.Series(model2.predict(X),name='predicted_sales')

In [73]:
# KNN
predicted_sales3 = pd.Series(model3.predict(X),name='predicted_sales')

In [74]:
#Random forest regressor
predicted_sales4 = pd.Series(model4.predict(X),name='predicted_sales')

In [75]:
# Predicting the sales
results_DTR = pd.concat([sales_validation, predicted_sales1],axis=1)
results_LR = pd.concat([sales_validation, predicted_sales2],axis=1)
results_KNN = pd.concat([sales_validation, predicted_sales3],axis=1)
results_RFR = pd.concat([sales_validation, predicted_sales4],axis=1)

In [76]:
final_results_DTR = results_DTR[['True_index','predicted_sales']]
final_results_LR = results_LR[['True_index','predicted_sales']]
final_results_KNN = results_KNN[['True_index','predicted_sales']]
final_results_RFR = results_RFR[['True_index','predicted_sales']]

In [77]:
final_results_DTR 

Unnamed: 0,True_index,predicted_sales
0,7,0.000000
1,19,35045.642857
2,31,35045.642857
3,45,35045.642857
4,56,35045.642857
...,...,...
71200,712004,35045.642857
71201,712018,35045.642857
71202,712020,35045.642857
71203,712023,0.000000


In [63]:
# Saving data in file formats
final_results_DTR.to_csv('DTR.csv', index=False)
final_results_LR.to_csv('LR.csv', index=False)