In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Now we load the CSV file.

In [None]:
# Read the Big Mart training CSV into the working DataFrame `h`.
h = pd.read_csv("../input/big-mart-sale-forecast/train.csv")

Let's inspect how many rows and columns the dataset has. 

In [None]:
# (number of rows, number of columns) of the loaded frame.
h.shape

Now we inspect the first few entries of the dataset. 

In [None]:
# Preview the first rows of the raw data.
h.head()

Let's examine the data types of variables in the dataset. 

In [None]:
# Column-by-column data types as inferred by read_csv.
h.dtypes

In [None]:
# Frequency of each raw Item_Fat_Content label (before any clean-up).
h["Item_Fat_Content"].value_counts()

We observe that LF and low fat both refer to Low Fat, while reg refers to Regular. So we normalise these labels.

In [None]:
# Collapse the inconsistent spellings onto the two canonical labels.
# Series.replace with a dict matches whole cell values only, so (unlike
# str.replace, which rewrites substrings and whose `regex` default differs
# between pandas versions) it cannot mangle a label that merely contains
# "LF" or "reg", and it does the work in a single pass.
h["Item_Fat_Content"] = h["Item_Fat_Content"].replace(
    {"LF": "Low Fat", "low fat": "Low Fat", "reg": "Regular"}
)

In [None]:
# Re-count the labels to confirm only the canonical values remain.
h["Item_Fat_Content"].value_counts()

In [None]:
# Bar chart of how many rows carry each fat-content label.
# The column is passed as x= by keyword: in current seaborn the only
# positional parameter of countplot is `data`, so a positional column name
# collides with data=h and raises a TypeError.
sns.countplot(x='Item_Fat_Content', data=h, palette='deep')

plt.show()

We observe that Low Fat is the more common value: most items are low fat.

In [None]:
# Number of rows per item type, most frequent first.
h["Item_Type"].value_counts()

In [None]:
# Count plot of item types. sns.factorplot was deprecated in favour of
# sns.catplot and is no longer available in current seaborn releases; catplot
# with kind="count" draws the same figure. The column must be passed as x=.
g = sns.catplot(x="Item_Type", data=h, aspect=1.5, kind="count", color="r")
# The type names are long, so rotate the tick labels to keep them readable.
g.set_xticklabels(rotation=90)

We observe that 'Fruits and Vegetables' and 'Snack Foods' are the most commonly occurring item types. 'Seafood' and 'Breakfast' are the least commonly occurring item types.

Let's find out the number of outlets that are involved in this dataset.

In [None]:
# Number of distinct outlet identifiers (a missing value, if any, counts as one).
h["Outlet_Identifier"].nunique(dropna=False)

We find out the numerical distribution of the size of the outlets.

In [None]:
# Number of rows per outlet size (value_counts skips missing values by default).
h["Outlet_Size"].value_counts()

In [None]:
# Bar chart of how many rows fall into each outlet size.
# The column is passed as x= by keyword: in current seaborn the only
# positional parameter of countplot is `data`, so a positional column name
# collides with data=h and raises a TypeError.
sns.countplot(x='Outlet_Size', data=h, palette='rocket')

plt.show()

It's evident from the countplot above that most of the outlets are small or medium sized. Only a minority of the outlets are high sized.

Let us find out the numerical proportion of the categorical types in Outlet locations.

In [None]:
# Number of rows per outlet location tier.
h["Outlet_Location_Type"].value_counts()

In [None]:
# Bar chart of how many rows fall into each outlet location tier.
# The column is passed as x= by keyword: in current seaborn the only
# positional parameter of countplot is `data`, so a positional column name
# collides with data=h and raises a TypeError.
sns.countplot(x='Outlet_Location_Type', data=h, palette='Set3')

plt.show()

From the above we conclude that the fewest stores are in Tier 1 regions, followed by Tier 2 regions. Most of the stores are in Tier 3 regions.

In [None]:
# Names of the float/int columns, minus the establishment year, which is a
# calendar year rather than a measured quantity.
numeric_frame = h.select_dtypes(include=['float64', 'int64'])
numbers = numeric_frame.columns.tolist()
numbers.remove('Outlet_Establishment_Year')

In [None]:
# Histogram of every selected numeric column, followed by its summary statistics.
numeric_view = h[numbers]
numeric_view.hist(figsize=(20, 10), color='green', edgecolor='white')

plt.show()

display(numeric_view.describe())

In [None]:
# Box plot of the item weights.
sns.boxplot(data=h, x="Item_Weight")

We observe that the lightest item weighs 4.55 and the heaviest item weighs 21.35. The middle 50% of the items weigh between 8.77 and 16.85, and the median weight of an item is 12.60.

In [None]:
# Box plot of the item visibility values.
sns.boxplot(data=h, x="Item_Visibility")

Above is the boxplot of *Item_Visibility*, the percentage of a store's total display area that is allocated to the particular product. The least visible products have no visibility at all, and the most visible product has a visibility of 0.33. The middle 50% of the products lie between 0.03 and 0.09, and the median visibility of an item is 0.05. Most products don't have much visibility.

In [None]:
# Box plot of the item prices (Item_MRP).
sns.boxplot(data=h, x="Item_MRP")

Above is the boxplot of *Item_MRP*. MRP stands for the maximum retail price of an item. The cheapest item is worth 31.29 and the most expensive item is worth 266.89. The median price of an item is 143.01, and the middle 50% of the items are priced between 93.84 and 185.65.

In [None]:
# Recall the Outlet_Size labels before encoding them numerically.
h["Outlet_Size"].value_counts()

We observe that the outlet size is an ordinal variable. We replace Small with 1, Medium with 2 and High with 3. 

In [None]:
# Ordinal encoding of the outlet size: Small < Medium < High.
# Values not listed in the mapping (including missing ones) are left untouched.
outlet_size_codes = {"Small": 1, "Medium": 2, "High": 3}
h["Outlet_Size"] = h["Outlet_Size"].replace(outlet_size_codes)

In [None]:
# Check that the counts are unchanged and only the labels became numbers.
h["Outlet_Size"].value_counts()

We observe that our attempt has been successful. 

In [None]:
# Recall the Outlet_Location_Type labels before encoding them numerically.
h["Outlet_Location_Type"].value_counts()

We observe that outlet location type too is an ordinal variable. So we replace Tier 1, Tier 2 and Tier 3 with 1, 2 and 3 respectively. 

In [None]:
# Ordinal encoding of the location tier: "Tier N" becomes the integer N.
location_tier_codes = {"Tier 1": 1, "Tier 2": 2, "Tier 3": 3}
h["Outlet_Location_Type"] = h["Outlet_Location_Type"].replace(location_tier_codes)

In [None]:
# Preview the frame after recoding Outlet_Size and Outlet_Location_Type.
h.head()

In [None]:
# Recall the Item_Fat_Content labels before encoding them numerically.
h["Item_Fat_Content"].value_counts()

We observe that this variable too can be considered an ordinal variable. 

In [None]:
# Ordinal encoding of the fat content: Low Fat -> 1, Regular -> 2.
fat_content_codes = {"Low Fat": 1, "Regular": 2}
h["Item_Fat_Content"] = h["Item_Fat_Content"].replace(fat_content_codes)

In [None]:
# Preview the frame after recoding Item_Fat_Content.
h.head()

In [None]:
# Recall the Item_Type categories before one-hot encoding them.
h["Item_Type"].value_counts()

We observe that this variable is a categorical variable with many values. So we perform one hot encoding. 

In [None]:
# One indicator column per item type, appended to the right of the frame.
# The original Item_Type column is still present after this cell.
item_type_dummies = pd.get_dummies(h["Item_Type"], prefix="Item_Type")
h = pd.concat([h, item_type_dummies], axis=1)
h.head()

Having applied one-hot encoding to the item type variable, we drop the original column.

In [None]:
# The indicator columns now carry this information, so remove the source column.
h = h.drop(columns=['Item_Type'])


In [None]:
# Preview the frame after dropping Item_Type.
h.head()

Now we apply one hot encoding to the outlet type variable. 

In [None]:
# One indicator column per outlet type, appended to the right of the frame.
# The original Outlet_Type column is still present after this cell.
outlet_type_dummies = pd.get_dummies(h["Outlet_Type"], prefix="Outlet_Type")
h = pd.concat([h, outlet_type_dummies], axis=1)
h.head()

In [None]:
# The indicator columns now carry this information, so remove the source column.
h = h.drop(columns=['Outlet_Type'])

Now we drop the Item Identifier and Outlet Identifier variables, as they are of no use to us. They are identifiers that won't help us in prediction.

In [None]:
# Preview the frame while the identifier columns are still present.
h.head()

In [None]:
# Remove the two identifier columns; every remaining column is then a feature or the target.
identifier_columns = ['Item_Identifier', 'Outlet_Identifier']
h = h.drop(columns=identifier_columns)

In [None]:
# Preview the frame after dropping the identifier columns.
h.head()

We finally look at the missing values.

In [None]:
# Number of missing values in each column.
h.isnull().sum()

We observe that two variables, *Item_Weight* and *Outlet_Size*, have missing values. We use the K-nearest neighbours algorithm to impute them: each missing entry is filled in from the values of the 5 most similar rows.

In [None]:
from sklearn.impute import KNNImputer

# Fill every missing entry from the 5 nearest rows.
# The imputer is fitted and applied exactly once; the result is wrapped back
# into a DataFrame with the original column names and row index, because
# fit_transform returns a bare NumPy array.
# NOTE(review): `h` still contains the target Item_Outlet_Sales and the
# imputation runs before the train/test split, so target and test-row
# information can influence the imputed values - confirm this is acceptable.
# NOTE(review): the features are not scaled, so large-valued columns presumably
# dominate the neighbour distances - verify.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(h)
DF = pd.DataFrame(imputed_values, columns=h.columns, index=h.index)  # dataset with no missing values

In [None]:
# Number of missing values per column in the imputed frame.
DF.isnull().sum()

We observe there are no missing values in the new dataset. 

In [None]:
# Separate the target (y) from the predictor columns (X).
target_col = "Item_Outlet_Sales"
y = DF[target_col]
X = DF.drop(columns=[target_col])

We split the dataset into X and y. X consists of the predictor variables and y is the value we are trying to predict. In the code below, we split the rows into a training set and a test set for the purpose of training and evaluating the model.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Our dataset has a lot of features(columns). We would like to narrow down the number of features so as to make our findings more intuitive and easily understandable. 

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
# pearsonr is imported from the public scipy.stats namespace; the private
# scipy.stats.stats module used previously is deprecated.
from scipy.stats import pearsonr

# Pearson correlation (and its p-value) between each predictor and the target,
# computed on the training rows only.
out_list = []
for column in X_train.columns:
    correlation, p_value = pearsonr(X_train[column], y_train)
    out_list.append([column, correlation, p_value])

# Smallest p-value first; sort_values returns a new frame, so no in-place mutation.
corr_df = pd.DataFrame(
    out_list, columns=["Features", "Correlation", "P-Value"]
).sort_values(by=['P-Value'])
corr_df.head()

Above are the features that are most strongly related to the target, i.e. those with the smallest p-values. Now we keep only these predictor variables to train and test the model.

In [None]:
# Keep only the five selected predictors in the training set.
selected_features = [
    "Item_MRP",
    "Outlet_Type_Grocery Store",
    "Outlet_Type_Supermarket Type3",
    "Item_Visibility",
    "Outlet_Type_Supermarket Type1",
]
X_train = X_train[selected_features]
X_train.head()

In [None]:
# Keep the same five predictors, in the same order, in the test set.
selected_features = [
    "Item_MRP",
    "Outlet_Type_Grocery Store",
    "Outlet_Type_Supermarket Type3",
    "Item_Visibility",
    "Outlet_Type_Supermarket Type1",
]
X_test = X_test[selected_features]
X_test.head()

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares fit on the selected training features
# (fit returns the estimator itself, so it can be chained).
regressor = LinearRegression().fit(X_train, y_train)

# Intercept of the fitted model:
print(regressor.intercept_)

# One slope per predictor, in the column order of X_train:
print(regressor.coef_)

Above are the intercept and the coefficients of the different predictor variables.

In [None]:
# Predicted sales for the held-out test rows.
y_pred = regressor.predict(X_test)

In [None]:
# Tabulate the fitted slopes, one row per predictor.
coeffecients = pd.DataFrame(
    regressor.coef_,
    index=X_test.columns,
    columns=['Coeffecient'],
)
coeffecients

Above are the co-efficients of the different predictor variables in a tabular format. 

From the above table we can conclude that when the MRP of an item increases by 1 unit, the sales of that product in a particular store increase by 15.67 units, provided all other variables are kept constant. Also, if the product is sold in a grocery store outlet, its sales go down by 1644.72, provided all other variables are kept constant. However, if the outlet type is supermarket type 3, the sales of a particular product go up by 1681.63, provided all other variables are kept constant. If the outlet type is supermarket type 1, the sales of a particular product go up by 310.79, provided all other variables are kept constant. If the item visibility increases by one unit, the sales of a particular product in a store go down by 340.03, provided all other variables remain constant.

In [None]:
from sklearn import metrics
import numpy as np
# RMSE on the test set: square root of the mean squared prediction error.
test_mse = metrics.mean_squared_error(y_test, y_pred)
print('Root Mean Squared Error:', np.sqrt(test_mse))

From the above we observe that the RMSE of the prediction conducted by the model is 1137.11.  