<a href="https://colab.research.google.com/github/PawarSneha12/BigMart-Outlet-Sales-Analysis-and-Predictions/blob/main/Big_Mart_Sales_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

### Importing The Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score as CVS
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score as R2
from xgboost import XGBRegressor
from sklearn import metrics

## Preprocessing and Data Analysis

Training dataset

In [None]:
# loading the training dataset from its csv file (under the mounted Drive path) into a pandas DataFrame
data_train = pd.read_csv('/content/drive/MyDrive/9961_14084_bundle_archive/Train.csv')


In [None]:
# preview the first rows of the training data
data_train.head()

In [None]:
# (rows, columns) of the training data
data_train.shape

In [None]:
# some information about the dataset: column names, dtypes and non-null counts
data_train.info()

Testing dataset

In [None]:
# loading the test dataset from its csv file (same Drive folder as Train.csv) into a pandas DataFrame
data_test = pd.read_csv('/content/drive/MyDrive/9961_14084_bundle_archive/Test.csv')

In [None]:
# preview the first rows of the test data
data_test.head()

In [None]:
# (rows, columns) of the test data
data_test.shape

In [None]:
# column names, dtypes and non-null counts of the test data
data_test.info()

In [None]:
# summary statistics of the training data
data_train.describe()


In [None]:
# summary statistics of the test data
data_test.describe()

Missing Values

In [None]:
# number of missing values per column in the training data
data_train.isna().sum()

check the value counts for Outlet_Size and Item_Weight:

In [None]:
# frequency of each Outlet_Size category in the training data
data_train.Outlet_Size.value_counts()

In [None]:
# frequency of each distinct Item_Weight value in the training data
data_train.Item_Weight.value_counts()

In [None]:
# Replace the missing "Outlet_Size" entries with the most frequent category (the mode).

#train
data_train['Outlet_Size'] = data_train['Outlet_Size'].fillna(
    data_train['Outlet_Size'].mode().iloc[0])



In [None]:
#test: replace the missing "Outlet_Size" entries with the test frame's own mode
data_test['Outlet_Size'] = data_test['Outlet_Size'].fillna(
    data_test['Outlet_Size'].mode().iloc[0])

In [None]:
#checking filled missing values: remaining nulls in (train, test) -- both should be 0
# (the second element previously re-checked data_train, so data_test was never verified)
data_train['Outlet_Size'].isnull().sum(),data_test['Outlet_Size'].isnull().sum()

In [None]:
# filling the missing values in "Item_weight column" with "Mean" value
# fillna returns a new Series, so the result must be assigned back to the column;
# Series.mean() skips NaN by default, so no explicit dropna() is needed.

#train
data_train['Item_Weight'] = data_train['Item_Weight'].fillna(data_train['Item_Weight'].mean())

#test
data_test['Item_Weight'] = data_test['Item_Weight'].fillna(data_test['Item_Weight'].mean())

#checking filled missing values: remaining nulls in (train, test) -- both should be 0
data_train['Item_Weight'].isnull().sum(),data_test['Item_Weight'].isnull().sum()

### Data Visualization 📊
Univariate Plots

countplots for the categorical columns:
##### Categorical columns:
['Item_Identifier',
'Item_Fat_Content',
'Item_Type',
'Outlet_Identifier',
'Outlet_Size',
'Outlet_Location_Type',
'Outlet_Type']

In [None]:
# Number of training rows per outlet.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=data_train, x='Outlet_Identifier', ax=ax)
plt.show()

In [None]:
# Number of training rows per fat-content label.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(data=data_train, x='Item_Fat_Content', ax=ax)
plt.show()

In [None]:
# Number of training rows per item type (wide figure to fit the many categories).
fig, ax = plt.subplots(figsize=(20, 10))
sns.countplot(data=data_train, x='Item_Type', ax=ax)
plt.show()

In [None]:
# Number of training rows per outlet size.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(data=data_train, x='Outlet_Size', ax=ax)
plt.show()

In [None]:
# Number of training rows per outlet location type.
fig, ax = plt.subplots(figsize=(4, 4))
sns.countplot(data=data_train, x='Outlet_Location_Type', ax=ax)
plt.show()

In [None]:
# Number of training rows per outlet type.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=data_train, x='Outlet_Type', ax=ax)
plt.show()

In [None]:
# Item_Weight distribution
plt.figure(figsize=(4,4))
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its documented replacement
sns.histplot(data_train['Item_Weight'], kde=True, stat='density')
plt.show()

In [None]:
# Item_Visibility distribution
plt.figure(figsize=(6,6))
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its documented replacement
sns.histplot(data_train['Item_Visibility'], kde=True, stat='density')
plt.show()

In [None]:
# Item MRP distribution
plt.figure(figsize=(6,6))
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its documented replacement
sns.histplot(data_train['Item_MRP'], kde=True, stat='density')
plt.show()

In [None]:
# Item_Outlet_Sales distribution
plt.figure(figsize=(6,6))
# sns.distplot is deprecated; a density-scaled histogram with a KDE overlay is its documented replacement
sns.histplot(data_train['Item_Outlet_Sales'], kde=True, stat='density')
plt.show()

Item_Weight and Item_Outlet_Sales Analysis

In [None]:

# Scatter of item weight against outlet sales, coloured by item type and sized by weight.
plt.figure(figsize=(13,9))
plt.xlabel('Item_Weight')
plt.ylabel('Item_Outlet_Sales')
plt.title('Item_Weight and Item_Outlet_Sales Analysis')
sns.scatterplot(x='Item_Weight', y='Item_Outlet_Sales',hue='Item_Type', size='Item_Weight',data=data_train)
# render the figure explicitly so the cell output is the plot, not the Axes repr
plt.show()

Item_Visibility and Maximum Retail Price

In [None]:
# Item visibility against maximum retail price, drawn as semi-transparent points.
plt.figure(figsize=(7,7))
plt.xlabel('Item_Visibility')
plt.ylabel('Maximum Retail Price')
plt.title('Item_Visibility and Maximum Retail Price')
plt.plot(data_train.Item_Visibility, data_train.Item_MRP, ".",alpha = 0.3)
# render the figure explicitly so the cell output is the plot, not the list of Line2D objects
plt.show()

In [None]:
# Collapse the alternative spellings of the fat-content labels into 'Low Fat' / 'Regular'.
fat_content_aliases = {'LF': 'Low Fat', 'reg': 'Regular', 'low fat': 'Low Fat'}
data_train['Item_Fat_Content'] = data_train['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# frequency of each Item_Fat_Content label in the training data
data_train['Item_Fat_Content'].value_counts()

In [None]:
# Apply the same label normalisation to the test frame. This cell previously repeated the
# training-frame replacement verbatim, leaving the test labels ('LF', 'reg', 'low fat') untouched.
data_test['Item_Fat_Content'] = data_test['Item_Fat_Content'].replace({'LF':'Low Fat','reg':'Regular','low fat':'Low Fat',})

In [None]:
# Median outlet sales per fat-content label, shown as a bar chart.
# The string alias 'median' is used instead of np.median: recent pandas versions emit a
# FutureWarning when a NumPy callable is passed as aggfunc.
Item_Fat_Content_pivot = data_train.pivot_table(
    index='Item_Fat_Content', values='Item_Outlet_Sales', aggfunc='median')

Item_Fat_Content_pivot.plot(kind='bar',color='blue',figsize=(12,7))
plt.xlabel('Item_Fat_Content')
plt.ylabel("Item_Outlet_Sales")
plt.title("Impact of Item_Fat_Content on Item_Outlet_Sales")
plt.xticks(rotation=0)
plt.show()


In [None]:
# Pairwise correlation of the numeric columns only: DataFrame.corr() raises on string
# columns in pandas >= 2.0, and select_dtypes works on every pandas version.
data_train.select_dtypes(include='number').corr()

Correlation between different attributes

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
plt.figure(figsize=(35,15))
# restrict to numeric columns: DataFrame.corr() raises on string columns in pandas >= 2.0
sns.heatmap(data_train.select_dtypes(include='number').corr(),vmax=1, square=True,annot=True, cmap='viridis')
plt.title('Correlation between different attributes')
plt.show()

Feature Engineering , Selection and Transformation
Treating The Missing Values
Item_Weight

From the Item_Weight distribution plot above, we noticed that the column is approximately normal, so it is reasonable to replace the missing values with the mean of the column.

In [None]:
# mean of Item_Weight in the training data; this is the value used to replace the NaN entries
data_train['Item_Weight'].mean()


In [None]:
# Replace the missing Item_Weight values with the column mean.
# The result is assigned back instead of calling fillna(..., inplace=True) on the column
# selection: that form is chained assignment and does not update the frame under
# pandas Copy-on-Write.
data_train['Item_Weight'] = data_train['Item_Weight'].fillna(data_train['Item_Weight'].mean())

In [None]:
# frequency of each Outlet_Size category in the training data
data_train['Outlet_Size'].value_counts()

In [None]:
# Fill any remaining missing Outlet_Size values with 'Medium'.
# Assigned back rather than using inplace=True on the column selection, which is chained
# assignment and does not update the frame under pandas Copy-on-Write.
data_train['Outlet_Size'] = data_train['Outlet_Size'].fillna('Medium')

In [None]:
# per-column count of missing values in the training data after the fills above
data_train.isnull().sum() #now we do not have any null values in Outlet_Size

In [None]:
# (rows, columns) of the training data
data_train.shape

In [None]:
# (rows, columns) of the test data
data_test.shape

#### Dealing with our Categorical Variables
Label Encoder: we will convert the categorical variables into integer codes (0 to n_classes − 1) using scikit-learn's LabelEncoder, since the models cannot be fitted on string values.

In [None]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

# Each encoder is fitted on the labels of BOTH frames and then applied to each frame.
# Fitting separately on train and test (as before) can assign different integer codes
# to the same category whenever the two frames do not contain identical label sets.

#New variable for outlet
label.fit(pd.concat([data_train['Outlet_Identifier'], data_test['Outlet_Identifier']]))
data_train['Outlet'] = label.transform(data_train['Outlet_Identifier'])
data_test['Outlet'] = label.transform(data_test['Outlet_Identifier'])

# 'Item_Type' is included so that no string-valued feature column reaches the models.
varib = ['Item_Fat_Content','Item_Type','Outlet_Location_Type','Outlet_Size','Outlet_Type']
for i in varib:
  label.fit(pd.concat([data_train[i], data_test[i]]))
  data_train[i] = label.transform(data_train[i])
  data_test[i] = label.transform(data_test[i])

In [None]:
# preview the training data after label encoding
data_train.head()

In [None]:
# Build reduced copies of both frames without the two identifier columns and 'Outlet_Type'.

tr_fe = data_train.drop(columns=['Item_Identifier', 'Outlet_Identifier', 'Outlet_Type'])
te_fe = data_test.drop(columns=['Item_Identifier', 'Outlet_Identifier', 'Outlet_Type'])

#### Machine learning models
Linear Regression

Random Forest Regressor

Lasso Regressor

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso


In [None]:
# Target and feature matrix. The raw identifier columns are strings and are dropped here so
# that X can be passed directly to the estimators and to cross-validation.
y = data_train['Item_Outlet_Sales']
X = data_train.drop(['Item_Outlet_Sales', 'Item_Identifier', 'Outlet_Identifier'], axis = 1)
# Hold out 20% of the rows for evaluation (test_size was 0.8, which trained on only 20%).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
# Remove the identifier columns from the hold-out features produced by train_test_split.
# This cell previously rebuilt X_test from the whole of data_train, which discarded the
# split and kept the Item_Outlet_Sales target among the features.
# errors='ignore' keeps the cell re-runnable when the columns are already absent.
X_test = X_test.drop(['Outlet_Identifier','Item_Identifier'], axis=1, errors='ignore')

In [None]:
def cross_val(model_name,model,X,y,cv):
    """Print the per-fold scores and their mean for `model`.

    Calls `CVS(model, X, y, cv=cv)`; `CVS` must be bound elsewhere in the notebook
    (presumably sklearn's cross_val_score -- confirm).

    model_name: label interpolated into the printed header and summary lines.
    model:      estimator passed through to `CVS`.
    X, y:       features and target passed through to `CVS`.
    cv:         passed through as the `cv` keyword of `CVS`.

    Returns None; output is printed only.
    """
    fold_scores = CVS(model, X, y, cv=cv)
    print(f'{model_name} Scores:')
    for fold_score in fold_scores:
        # each fold score is shown rounded to two decimals
        print(round(fold_score, 2))
    mean_score = round(fold_scores.mean(), 4)
    print(f'Average {model_name} score: {mean_score}')

In [None]:
# NOTE(review): this cell re-defines cross_val with a body identical to the definition in
# the preceding cell; the later definition silently replaces the earlier one.
def cross_val(model_name,model,X,y,cv):
    """Print each score returned by CVS(model, X, y, cv=cv), then their rounded mean.

    `CVS` is not defined in this cell -- presumably sklearn's cross_val_score; confirm.
    """

    scores = CVS(model, X, y, cv=cv)
    print(f'{model_name} Scores:')
    for i in scores:
        print(round(i,2))
    print(f'Average {model_name} score: {round(scores.mean(),4)}')

In [None]:
# Keep the training rows chosen by train_test_split and only strip non-feature columns.
# Rebuilding X_train / y_train from the whole of data_train (as before) put the hold-out
# rows back into the training set, so any score computed against y_test was leaked.
# errors='ignore' keeps the cell re-runnable when the columns are already absent.
X_train = X_train.drop(['Item_Outlet_Sales', 'Outlet_Identifier','Item_Identifier'], axis=1, errors='ignore')
# select the target by X_train's index so features and target stay aligned row-for-row
y_train = data_train.loc[X_train.index, 'Item_Outlet_Sales']

In [None]:
# Feature matrix built from data_test with the two identifier columns removed.
# NOTE(review): this rebinds X_test, which train_test_split assigned earlier; y_test still
# refers to that split, so X_test and y_test are no longer aligned row-for-row.
X_test = data_test.drop(['Outlet_Identifier','Item_Identifier'], axis=1)

In [None]:
from sklearn.linear_model import  LinearRegression
# Create an unfitted linear-regression estimator; the fit call below is left commented out.
regressor = LinearRegression()
#regressor.fit(X_train,y_train)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree limited to depth 15 with at least 100 samples per leaf.
tree = DecisionTreeRegressor(max_depth=15, min_samples_leaf=100)

# Fit on the current X_train / y_train; the prediction step below is left commented out.
tree.fit(X_train,y_train)
#tree_pred = tree.predict(X_test)

In [None]:
# Random forest: fit, score on the labelled hold-out rows, then cross-validate.
RFR= RandomForestRegressor(n_estimators=200,max_depth=5, min_samples_leaf=100,n_jobs=4,random_state=101)
#fit
RFR.fit(X_train, y_train)

# The metrics below need predictions for exactly the rows in y_test. X_test may have been
# rebound to the data_test features by an earlier cell, so the hold-out features are
# rebuilt here from data_train using y_test's index and the fitted feature columns.
feature_cols = X_train.columns
X_holdout = data_train.loc[y_test.index, feature_cols]
#predict
y_predict = RFR.predict(X_holdout)

#score variables
RFR_MAE = round(MAE(y_test, y_predict),2)
RFR_MSE = round(MSE(y_test, y_predict),2)
RFR_R_2 = round(R2(y_test, y_predict),4)
# cross-validate on the same feature columns the model was fitted on
RFR_CS  = round(CVS(RFR, X[feature_cols], y, cv=5).mean(),4)



print(f" Mean Absolute Error: {RFR_MAE}\n")
print(f" Mean Squared Error: {RFR_MSE}\n")
print(f" R^2 Score: {RFR_R_2}\n")
# Pass a display name and the configured estimator: previously the estimator object was
# used as the name and a default RandomForestRegressor() was scored instead of RFR.
cross_val('Random Forest Regressor',RFR,X[feature_cols],y,5)


In [None]:
# Lasso regression: fit, score on the labelled hold-out rows, then cross-validate.
#model
LS = Lasso(alpha = 0.05)
#fit
LS.fit(X_train,y_train)

# The metrics below need predictions for exactly the rows in y_test. X_test may have been
# rebound to the data_test features by an earlier cell, so the hold-out features are
# rebuilt here from data_train using y_test's index and the fitted feature columns.
feature_cols = X_train.columns
X_holdout = data_train.loc[y_test.index, feature_cols]
#predict
y_predict = LS.predict(X_holdout)

#score variables
LS_MAE = round(MAE(y_test, y_predict),2)
LS_MSE = round(MSE(y_test, y_predict),2)
LS_R_2 = round(R2(y_test, y_predict),4)
# cross-validate on the same feature columns the model was fitted on
LS_CS  = round(CVS(LS, X[feature_cols], y, cv=5).mean(),4)

print(f" Mean Absolute Error: {LS_MAE}\n")
print(f" Mean Squared Error: {LS_MSE}\n")
print(f" R^2 Score: {LS_R_2}\n")
# Pass a display name as the first argument: previously the estimator object itself was
# interpolated into the printed header.
cross_val('Lasso',LS,X[feature_cols],y,5)