In [4]:
import pandas as pd
import numpy  as np
import seaborn as sns

In [105]:

# LinearRegression()  model can be used from linear_model module
from sklearn import linear_model

# OSL() model can be used from statmodels package
import statsmodels.api as sm

# we will perform sampling using train_test_split module for training and testin set
# and we will use GridSearchCV to find the best parameters for SGDRegressor()
from sklearn.model_selection import train_test_split

from sklearn import preprocessing

# we will evaluate our models using RMSE(mean_squared_error) and Determination coefficient(r2_score)
from sklearn.metrics import mean_squared_error, r2_score

# in case you will use polinominal features
from sklearn.preprocessing import PolynomialFeatures
from sklearn import tree
from sklearn import metrics

In [37]:
test_data = pd.read_csv('test_data.csv')
train_data = pd.read_csv('train_data.csv')

In [5]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500 entries, 0 to 15499
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     15500 non-null  int64 
 1   date                   15500 non-null  object
 2   product_identifier     15500 non-null  int64 
 3   department_identifier  15500 non-null  int64 
 4   category_of_product    15500 non-null  object
 5   outlet                 15500 non-null  int64 
 6   state                  15500 non-null  object
dtypes: int64(4), object(3)
memory usage: 666.1+ KB


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395000 entries, 0 to 394999
Data columns (total 7 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   date                   395000 non-null  object
 1   product_identifier     395000 non-null  int64 
 2   department_identifier  395000 non-null  int64 
 3   category_of_product    395000 non-null  object
 4   outlet                 395000 non-null  int64 
 5   state                  395000 non-null  object
 6   sales                  395000 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 16.6+ MB


In [41]:
test_data

Unnamed: 0,id,date,product_identifier,department_identifier,category_of_product,outlet,state
0,1,2014-03-01,74,11,others,111,Maharashtra
1,2,2014-03-01,337,11,others,111,Maharashtra
2,3,2014-03-01,423,12,others,111,Maharashtra
3,4,2014-03-01,432,12,others,111,Maharashtra
4,5,2014-03-01,581,21,fast_moving_consumer_goods,111,Maharashtra
...,...,...,...,...,...,...,...
15495,15496,2014-03-31,2932,33,drinks_and_food,333,Kerala
15496,15497,2014-03-31,2935,33,drinks_and_food,333,Kerala
15497,15498,2014-03-31,3004,33,drinks_and_food,333,Kerala
15498,15499,2014-03-31,3008,33,drinks_and_food,333,Kerala


In [52]:
train_data

Unnamed: 0,date,product_identifier,department_identifier,category_of_product,outlet,state,sales
0,2012-01-01,74,11,others,111,Maharashtra,0
1,2012-01-01,337,11,others,111,Maharashtra,1
2,2012-01-01,423,12,others,111,Maharashtra,0
3,2012-01-01,432,12,others,111,Maharashtra,0
4,2012-01-01,581,21,fast_moving_consumer_goods,111,Maharashtra,0
...,...,...,...,...,...,...,...
394995,2014-02-28,2932,33,drinks_and_food,333,Kerala,2
394996,2014-02-28,2935,33,drinks_and_food,333,Kerala,8
394997,2014-02-28,3004,33,drinks_and_food,333,Kerala,0
394998,2014-02-28,3008,33,drinks_and_food,333,Kerala,0


In [53]:
LabelEncoder_ = preprocessing.LabelEncoder()

In [54]:
train_data.sales.unique()

array([  0,   1,   3,   2,   9,   5,   8,  18,  12,  28,   4,   6,  27,
         7,  10,  47,  13,  11,  32,  23,  14,  16,  19,  33,  17,  26,
        20,  87,  35,  54,  22,  15,  36,  31,  21,  25,  34,  43,  53,
        38,  30,  85,  57, 100,  52,  24,  29,  58,  63,  37,  42,  45,
        39,  72,  40,  80,  98,  56,  70,  50,  86,  46,  51,  66,  41,
        64,  81,  79,  82,  69,  48,  94,  44,  60,  68,  49,  77,  61,
        76, 119,  92,  93,  62, 124,  59,  74,  73, 174,  96,  71, 132,
       101,  55,  78,  75,  65, 105, 120, 150, 173,  67,  83,  95, 170,
       116,  90, 108, 139,  84, 293,  88, 241, 102,  97, 156, 114, 126,
       109, 118, 111,  99, 171, 242, 121, 215,  89], dtype=int64)

In [91]:
train_data.sales = LabelEncoder_.fit_transform(train_data.sales)
train_data.category_of_product = LabelEncoder_.fit_transform(train_data.category_of_product)
train_data.state = LabelEncoder_.fit_transform(train_data.state)


In [92]:
train_data.head()

Unnamed: 0,date,product_identifier,department_identifier,category_of_product,outlet,state,sales
0,2012-01-01,74,11,2,111,1,0
1,2012-01-01,337,11,2,111,1,1
2,2012-01-01,423,12,2,111,1,0
3,2012-01-01,432,12,2,111,1,0
4,2012-01-01,581,21,1,111,1,0


In [93]:
train_data.shape

(395000, 7)

In [94]:
test_data.shape

(15500, 7)

In [95]:
X = train_data[['product_identifier', 'department_identifier', 'category_of_product', 'outlet', 'state']].copy()

In [96]:
Y = train_data.sales

In [97]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state= 30)

In [98]:
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((276500, 5), (276500,), (118500, 5), (118500,))

In [99]:
model = tree.DecisionTreeClassifier()

In [100]:
model.fit(x_train, y_train)

DecisionTreeClassifier()

In [107]:
y_predicted = model.predict(x_test)

In [108]:
print("Accuracy of the model:", metrics.accuracy_score(y_test, y_predicted))

Accuracy of the model: 0.6253164556962025


In [None]:
tree.plot_tree(model)

[Text(132.29033376206183, 211.04470588235293, 'X[2] <= 0.5\ngini = 0.575\nsamples = 276500\nvalue = [172629, 44211, 21917, 11992, 7303, 4595, 3174, 2083\n1648, 1151, 981, 731, 645, 428, 388, 313, 261, 225\n176, 159, 154, 116, 107, 100, 90, 63, 75, 52, 55\n48, 46, 49, 45, 39, 22, 26, 24, 24, 21, 20, 22\n9, 19, 13, 9, 13, 12, 13, 9, 13, 7, 4, 9, 4\n11, 6, 5, 4, 6, 3, 7, 4, 3, 3, 3, 3, 7, 3\n6, 4, 4, 1, 5, 4, 3, 3, 2, 4, 1, 2, 5, 1\n4, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1, 2, 1\n1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1\n1, 2, 1, 1, 1, 1, 1]'),
 Text(43.064323737199686, 198.25411764705882, 'X[0] <= 1650.5\ngini = 0.711\nsamples = 93875\nvalue = [46833, 13827, 9280, 6059, 4159, 2895, 2138, 1497, 1227\n868, 795, 593, 527, 369, 331, 280, 241, 211, 165\n150, 146, 106, 102, 95, 85, 59, 70, 49, 54, 46\n45, 48, 44, 38, 22, 25, 23, 22, 19, 20, 22, 8\n19, 13, 9, 13, 12, 13, 8, 13, 7, 4, 9, 4, 11\n6, 5, 4, 6, 3, 7, 4, 3, 3, 3, 3, 7, 3, 6\n4, 4, 1, 5, 4, 3, 3, 2, 4, 1, 2, 5, 1, 4\n2, 1, 1, 1, 1, 1, 1, 1, 

In [102]:
x_train_ols =  sm.add_constant(x_train)

In [None]:
ols_model = sm.OLS(y_train, x_train, missing="drop").fit()

In [None]:
y_predicted_ols=ols_model.predict(x_test)

In [None]:
y_predicted_ols.shape, y_test.shape

In [None]:
lr_model = linear_model.LinearRegression()

In [None]:
lr_model.fit(x_train, y_train)

In [None]:
y_predicted_ls = lr_model.predict(x_test)

In [None]:
print("MSE of OLS() vs LinearRegression():\n")
mean_squared_error(y_test, y_predicted_ols), mean_squared_error(y_test, y_predicted_ls)

In [None]:
print("R-square of OLS() vs LinearRegression():\n")
r2_score(y_test, y_predicted_ols), r2_score(y_test, y_predicted_ls)

In [None]:
ols_model.summary()