In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import dtale
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor  # For regression tasks
from xgboost import XGBRegressor  # For regression tasks
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


In [2]:
train_df=pd.read_csv("Train.csv")
test_df=pd.read_csv("Test.csv")


train_df.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# data cleansing

In [3]:
cols = ['Item_Visibility','Item_MRP','Item_Weight']
x = train_df[cols]

impute_it = IterativeImputer()
x = impute_it.fit_transform(x)### treating null weight values with iterative imputer

In [4]:
train_df['Item_Weight'] = x[:,2]


In [5]:
nan_counts = train_df.isna().sum()

# Display the counts
print(nan_counts)

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [6]:
train_df.loc[train_df['Item_Visibility']==0, 'Item_Visibility'] = train_df['Item_Visibility'].mean() #trating zeros of item visibilty

In [7]:
train_df = train_df[train_df['Item_Fat_Content'] != "LF"]
train_df = train_df[train_df['Item_Fat_Content'] != "reg"]
train_df = train_df[train_df['Item_Fat_Content'] != "fat"]
train_df = train_df[train_df['Item_Fat_Content'] != "low"]


#####treating wrong vals

In [8]:
train_df['Outlet_Establishment_Year']=train_df['Outlet_Establishment_Year']-1985 ##1985 oldest market as reference point

# Treating outlet size 

In [9]:
# inspired by https://www.kaggle.com/code/abelwahabbahaa/bigmartsalesprediction-using-regressionmodels/notebook

In [10]:
train_df[train_df["Outlet_Size"].isnull() ==True]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDX07,19.200,Regular,0.066132,Fruits and Vegetables,182.0950,OUT010,13,,Tier 3,Grocery Store,732.3800
8,FDH17,16.200,Regular,0.016687,Frozen Foods,96.9726,OUT045,17,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.200,Regular,0.094450,Frozen Foods,187.8214,OUT017,22,,Tier 2,Supermarket Type1,4710.5350
25,NCD06,13.000,Low Fat,0.099887,Household,45.9060,OUT017,22,,Tier 2,Supermarket Type1,838.9080
28,FDE51,5.925,Regular,0.161467,Dairy,45.5086,OUT010,13,,Tier 3,Grocery Store,178.4344
...,...,...,...,...,...,...,...,...,...,...,...,...
8502,NCH43,8.420,Low Fat,0.070712,Household,216.4192,OUT045,17,,Tier 2,Supermarket Type1,3020.0688
8508,FDW31,11.350,Regular,0.043246,Fruits and Vegetables,199.4742,OUT045,17,,Tier 2,Supermarket Type1,2587.9646
8509,FDG45,8.100,Low Fat,0.214306,Fruits and Vegetables,213.9902,OUT010,13,,Tier 3,Grocery Store,424.7804
8514,FDA01,15.000,Regular,0.054489,Canned,57.5904,OUT045,17,,Tier 2,Supermarket Type1,468.7232


In [11]:
train_df[train_df["Outlet_Size"].isnull() ==True]['Outlet_Location_Type'].value_counts()


Outlet_Location_Type
Tier 2    1772
Tier 3     520
Name: count, dtype: int64

In [12]:
train_df[train_df["Outlet_Size"].isnull() ==True]['Outlet_Identifier'].value_counts()


Outlet_Identifier
OUT045    892
OUT017    880
OUT010    520
Name: count, dtype: int64

In [13]:
train_df[train_df["Outlet_Size"].isnull() ==True]['Outlet_Type'].value_counts()


Outlet_Type
Supermarket Type1    1772
Grocery Store         520
Name: count, dtype: int64

In [14]:
#type 1 supermarket and tier 2 same number of shops

In [15]:
## for both ['OUT045', 'OUT017']
train_df[(train_df["Outlet_Location_Type"] =="Tier 2" ) & (train_df["Outlet_Type"] =="Supermarket Type1")]['Outlet_Size'].value_counts()



Outlet_Size
Small    882
Name: count, dtype: int64

In [16]:
## so for ['OUT010']
train_df[(train_df["Outlet_Location_Type"] =="Tier 3" ) & (train_df["Outlet_Type"] =="Grocery Store")]['Outlet_Size'].value_counts()

Series([], Name: count, dtype: int64)

In [17]:
## no common vals
train_df[train_df["Outlet_Type"] == "Grocery Store"]['Outlet_Size'].value_counts()


Outlet_Size
Small    504
Name: count, dtype: int64

In [18]:
train_df[(train_df['Outlet_Location_Type'] == "Tier 3") ]['Outlet_Size'].value_counts()


Outlet_Size
Medium    1760
High       887
Name: count, dtype: int64

In [19]:
train_df.loc[(train_df['Outlet_Size'].isnull() ==True) &(train_df['Outlet_Identifier'] != 'OUT010'),"Outlet_Size"] = "Small"

In [20]:
train_df.loc[(train_df['Outlet_Size'].isnull() ==True) &(train_df['Outlet_Identifier'] == 'OUT010'),"Outlet_Size"] = "Medium"

In [21]:
train_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,14,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,24,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,14,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.066132,Fruits and Vegetables,182.0950,OUT010,13,Medium,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.066132,Household,53.8614,OUT013,2,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,2,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,17,Small,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,19,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,24,Medium,Tier 3,Supermarket Type2,1845.5976


# Selection

In [22]:
train_df['Item_ID_Type']=train_df['Item_Identifier'].apply(lambda x: "Food" if x[:2] =='FD' 
                                               else( "Drink" if x[:2]=='DR' else "Non-Consumable") )
train_df['Item_ID_Type'].value_counts()

Item_ID_Type
Food              5826
Non-Consumable    1505
Drink              759
Name: count, dtype: int64

In [23]:
train_df.drop(columns=['Item_Type'], inplace=True)


In [24]:
lencod = LabelEncoder()
ids = ['Item_Identifier','Outlet_Identifier','Item_Fat_Content','Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_ID_Type']
for i in ids:
    train_df[i]=lencod.fit_transform(train_df[i])

In [25]:
train_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_ID_Type
0,156,9.300,0,0.016047,249.8092,9,14,1,0,1,3735.1380,1
1,8,5.920,1,0.019278,48.2692,3,24,1,2,2,443.4228,0
2,662,17.500,0,0.016760,141.6180,9,14,1,0,1,2097.2700,1
3,1121,19.200,1,0.066132,182.0950,0,13,1,2,0,732.3800,1
4,1297,8.930,0,0.066132,53.8614,1,2,0,2,1,994.7052,2
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,370,6.865,0,0.056783,214.5218,1,2,0,2,1,2778.3834,1
8519,897,8.380,1,0.046982,108.1570,7,17,2,1,1,549.2850,1
8520,1357,10.600,0,0.035186,85.1224,6,19,2,1,1,1193.1136,2
8521,681,7.210,1,0.145221,103.1332,3,24,1,2,2,1845.5976,1


In [26]:
y = train_df['Item_Outlet_Sales'].copy()
X = train_df.drop(columns='Item_Outlet_Sales')

### first will split train to 4 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=20)


In [27]:
lg = LinearRegression()

lg.fit(X_train, y_train)
LinearRegression()

In [28]:
print(f"Linear Regression Score: {lg.score(X_train, y_train)}")


Linear Regression Score: 0.4986224807980545


In [31]:
y_pred = lg.predict(X_test)
r_squared = r2_score(y_test, y_pred)

print("R-squared:", r_squared)


R-squared: 0.5106323632604176


In [32]:
forest_reg = RandomForestRegressor()

forest_reg.fit(X_train, y_train)

#checking score
print(f"score of Random Forest Regressor model: {forest_reg.score(X_train, y_train)}")

score of Random Forest Regressor model: 0.9373442940309943


In [33]:
y_pred = forest_reg.predict(X_test)
r_squared = r2_score(y_test, y_pred)
print("R-squared:", r_squared)


R-squared: 0.5490698199017022


In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = [
{'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
grid_search.best_params_


{'max_features': 2, 'n_estimators': 30}