## Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error

## Importing and preprocessing the datasets

#### Importing the datasets and viewing them to understand their structures

The TRAIN.csv file contain dataset for training the model

In [2]:
train = pd.read_csv('TRAIN.csv')

In [3]:
train.head(10)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52
5,T1000006,249,S1,L3,R2,2018-01-01,1,Yes,39,34211.22
6,T1000007,248,S1,L1,R2,2018-01-01,1,Yes,40,35352.66
7,T1000008,247,S1,L1,R3,2018-01-01,1,Yes,64,52650.0
8,T1000009,246,S3,L1,R3,2018-01-01,1,Yes,62,42633.78
9,T1000010,254,S4,L1,R1,2018-01-01,1,Yes,87,62572.8


In [4]:
train.shape

(188340, 10)

This dataset contain 188340 rows and 10 columns

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188340 entries, 0 to 188339
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   ID             188340 non-null  object 
 1   Store_id       188340 non-null  int64  
 2   Store_Type     188340 non-null  object 
 3   Location_Type  188340 non-null  object 
 4   Region_Code    188340 non-null  object 
 5   Date           188340 non-null  object 
 6   Holiday        188340 non-null  int64  
 7   Discount       188340 non-null  object 
 8   #Order         188340 non-null  int64  
 9   Sales          188340 non-null  float64
dtypes: float64(1), int64(3), object(6)
memory usage: 14.4+ MB


This dataset contain 3 integer, 1 float and 6 object data type columns.

TEST.csv contain dataset of the inputs for the model to make predictions

In [6]:
test = pd.read_csv('TEST.csv')

In [7]:
test.head(10)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount
0,T1188341,171,S4,L2,R3,2019-06-01,0,No
1,T1188342,172,S1,L1,R1,2019-06-01,0,No
2,T1188343,173,S4,L2,R1,2019-06-01,0,No
3,T1188344,174,S1,L1,R4,2019-06-01,0,No
4,T1188345,170,S1,L1,R2,2019-06-01,0,No
5,T1188346,175,S4,L2,R1,2019-06-01,0,No
6,T1188347,176,S4,L2,R3,2019-06-01,0,No
7,T1188348,169,S1,L2,R4,2019-06-01,0,No
8,T1188349,14,S3,L2,R4,2019-06-01,0,No
9,T1188350,177,S2,L1,R4,2019-06-01,0,No


In [8]:
test.shape

(22265, 8)

This dataset contain 22265 rows and 8 columns

In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22265 entries, 0 to 22264
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             22265 non-null  object
 1   Store_id       22265 non-null  int64 
 2   Store_Type     22265 non-null  object
 3   Location_Type  22265 non-null  object
 4   Region_Code    22265 non-null  object
 5   Date           22265 non-null  object
 6   Holiday        22265 non-null  int64 
 7   Discount       22265 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.4+ MB


There are 2 integers and 6 object data type columns

SAMPLE.csv contains the outcomes expected from the model

In [10]:
sample = pd.read_csv('SAMPLE.csv')

In [11]:
sample.head(10)

Unnamed: 0,ID,Sales
0,T1188341,42275
1,T1188342,42275
2,T1188343,42275
3,T1188344,42275
4,T1188345,42275
5,T1188346,42275
6,T1188347,42275
7,T1188348,42275
8,T1188349,42275
9,T1188350,42275


In [12]:
sample.shape

(22265, 2)

This dataset contains 22265 rows and 2 columns

In [13]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22265 entries, 0 to 22264
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      22265 non-null  object
 1   Sales   22265 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 348.0+ KB


It contains one column of integer data type and one of object data type

#### Preprocessing the datasets

In [14]:
train.isnull().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
#Order           0
Sales            0
dtype: int64

In [15]:
train.duplicated().sum()

0

Train detaset does not have null or duplicate values

In [16]:
test.isnull().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
dtype: int64

In [17]:
test.duplicated().sum()

0

Test dataset is free of null and duplicated values

In [18]:
sample.isnull().sum()

ID       0
Sales    0
dtype: int64

In [19]:
sample.duplicated().sum()

0

Sample dataset does not contain any null or duplicated value

## Model training and Evaluation

#### Model training

The model used for prediction is linear regression

In [20]:
model = LinearRegression()

Preparing the training and testing datasets for the model

In [21]:
X_train = train.drop(['ID','Date','#Order','Sales'], axis = 1)
Y_train = train[['Sales']]
X_test = test.drop(['ID','Date'], axis = 1)
Y_test = sample[['Sales']]

The datasets contains some columns with object type data. This data is converted into integer type using label encoder

In [22]:
def objcol(df):
    object_columns = df.select_dtypes(include='object').columns
    return object_columns
def encode(df):
    columns = objcol(df)
    label_encoder = LabelEncoder()
    for col in columns:
        df[col] = label_encoder.fit_transform(df[col])

encode(X_train)
encode(Y_train)
encode(X_test)
encode(Y_test)

Training the model

In [23]:
model.fit(X_train, Y_train)

#### Model Evaluation

Making predictions for the test inputs

In [24]:
Y_pred = model.predict(X_test)

Analysing the performance of model using metrics like mean absolute error, mean sqaured error and root mean squared error

In [25]:
print('Mean abolute error is:', mean_absolute_error(Y_test, Y_pred))
print('Mean squared error is:', mean_squared_error(Y_test, Y_pred))
print('Root mean squared error is:', np.sqrt(mean_squared_error(Y_test, Y_pred)))

Mean abolute error is: 9954.982019559262
Mean squared error is: 139585977.43402377
Root mean squared error is: 11814.650965391393
