In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dtale
from sklearn.ensemble import RandomForestRegressor  # For regression tasks
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor  # For regression tasks


In [2]:
# Read the train and test splits from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv"))

# Preview the first rows of the training data.
train_df.head()


Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


# data cleansing

In [3]:
# Treat the null Item_Weight values with IterativeImputer, fitted on
# Item_Visibility, Item_MRP and Item_Weight together.
cols = ['Item_Visibility','Item_MRP','Item_Weight']
impute_it = IterativeImputer()
x = impute_it.fit_transform(train_df[cols])

In [4]:
# Write the imputed weights back; the column is located by name in `cols`
# (position 2) rather than by a hard-coded index.
train_df['Item_Weight'] = x[:, cols.index('Item_Weight')]


In [5]:
# Count the missing values left in every column and print them.
nan_counts = train_df.isnull().sum()
print(nan_counts)

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [6]:
# Treat zero Item_Visibility values: overwrite them with the column mean.
# The mean is evaluated before the assignment, so it still includes the zeros.
zero_visibility = train_df['Item_Visibility'] == 0
train_df.loc[zero_visibility, 'Item_Visibility'] = train_df['Item_Visibility'].mean()

In [7]:
# Collapse the fat-content labels to two values: 'reg' and 'Regular' become
# "Regular"; every other value becomes 'Low Fat'.
is_regular = train_df['Item_Fat_Content'].isin(['reg', 'Regular'])
train_df['Item_Fat_Content'] = is_regular.map({True: "Regular", False: 'Low Fat'})


##### Treating wrong values

In [8]:
# Replace the establishment year with 2023 minus that year.
# Not idempotent: running this cell twice transforms the column twice.
train_df['Outlet_Establishment_Year'] = train_df['Outlet_Establishment_Year'].rsub(2023)

# Treating outlet size 

In [9]:
# inspired by https://www.kaggle.com/code/abelwahabbahaa/bigmartsalesprediction-using-regressionmodels/notebook

In [10]:
# Rows whose Outlet_Size is missing.
train_df[train_df["Outlet_Size"].isnull()]

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
3,FDX07,19.200,Regular,0.066132,Fruits and Vegetables,182.0950,OUT010,25,,Tier 3,Grocery Store,732.3800
8,FDH17,16.200,Regular,0.016687,Frozen Foods,96.9726,OUT045,21,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.200,Regular,0.094450,Frozen Foods,187.8214,OUT017,16,,Tier 2,Supermarket Type1,4710.5350
25,NCD06,13.000,Low Fat,0.099887,Household,45.9060,OUT017,16,,Tier 2,Supermarket Type1,838.9080
28,FDE51,5.925,Regular,0.161467,Dairy,45.5086,OUT010,25,,Tier 3,Grocery Store,178.4344
...,...,...,...,...,...,...,...,...,...,...,...,...
8502,NCH43,8.420,Low Fat,0.070712,Household,216.4192,OUT045,21,,Tier 2,Supermarket Type1,3020.0688
8508,FDW31,11.350,Regular,0.043246,Fruits and Vegetables,199.4742,OUT045,21,,Tier 2,Supermarket Type1,2587.9646
8509,FDG45,8.100,Low Fat,0.214306,Fruits and Vegetables,213.9902,OUT010,25,,Tier 3,Grocery Store,424.7804
8514,FDA01,15.000,Regular,0.054489,Canned,57.5904,OUT045,21,,Tier 2,Supermarket Type1,468.7232


In [11]:
# Location tiers of the rows with a missing Outlet_Size.
train_df.loc[train_df["Outlet_Size"].isnull(), 'Outlet_Location_Type'].value_counts()


Outlet_Location_Type
Tier 2    1855
Tier 3     555
Name: count, dtype: int64

In [12]:
# Outlets that account for the rows with a missing Outlet_Size.
train_df.loc[train_df["Outlet_Size"].isnull(), 'Outlet_Identifier'].value_counts()


Outlet_Identifier
OUT045    929
OUT017    926
OUT010    555
Name: count, dtype: int64

In [13]:
# Outlet types of the rows with a missing Outlet_Size.
train_df.loc[train_df["Outlet_Size"].isnull(), 'Outlet_Type'].value_counts()


Outlet_Type
Supermarket Type1    1855
Grocery Store         555
Name: count, dtype: int64

In [14]:
#type 1 supermarket and tier 2 same number of shops

In [15]:
## for both ['OUT045', 'OUT017']
# Known Outlet_Size values among Tier 2 / Supermarket Type1 rows.
tier2_type1 = (train_df["Outlet_Location_Type"] == "Tier 2") & (train_df["Outlet_Type"] == "Supermarket Type1")
train_df.loc[tier2_type1, 'Outlet_Size'].value_counts()



Outlet_Size
Small    930
Name: count, dtype: int64

In [16]:
## so for ['OUT010']
# Known Outlet_Size values among Tier 3 / Grocery Store rows.
tier3_grocery = (train_df["Outlet_Location_Type"] == "Tier 3") & (train_df["Outlet_Type"] == "Grocery Store")
train_df.loc[tier3_grocery, 'Outlet_Size'].value_counts()

Series([], Name: count, dtype: int64)

In [17]:
## no common values for that combination, so look at each attribute separately
# Known Outlet_Size values among all Grocery Store rows.
train_df.loc[train_df["Outlet_Type"] == "Grocery Store", 'Outlet_Size'].value_counts()


Outlet_Size
Small    528
Name: count, dtype: int64

In [18]:
# Known Outlet_Size values among all Tier 3 rows.
train_df.loc[train_df['Outlet_Location_Type'] == "Tier 3", 'Outlet_Size'].value_counts()


Outlet_Size
Medium    1863
High       932
Name: count, dtype: int64

In [19]:
# Missing Outlet_Size for every outlet other than OUT010 is set to "Small".
size_missing = train_df['Outlet_Size'].isnull()
train_df.loc[size_missing & (train_df['Outlet_Identifier'] != 'OUT010'), "Outlet_Size"] = "Small"

In [20]:
# Missing Outlet_Size for outlet OUT010 is set to "Medium".
size_missing = train_df['Outlet_Size'].isnull()
train_df.loc[size_missing & (train_df['Outlet_Identifier'] == 'OUT010'), "Outlet_Size"] = "Medium"

In [21]:
# Show the training frame after the Outlet_Size fill.
train_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,24,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,14,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,24,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.066132,Fruits and Vegetables,182.0950,OUT010,25,Medium,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.066132,Household,53.8614,OUT013,36,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,36,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,21,Small,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,19,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,14,Medium,Tier 3,Supermarket Type2,1845.5976


# Selection

In [22]:
# Derive a coarse item category from the first two characters of
# Item_Identifier: 'FD' -> Food, 'DR' -> Drink, anything else -> Non-Consumable.
id_prefix_to_type = {'FD': "Food", 'DR': "Drink"}
train_df['Item_ID_Type'] = train_df['Item_Identifier'].apply(
    lambda x: id_prefix_to_type.get(x[:2], "Non-Consumable")
)
train_df['Item_ID_Type'].value_counts()

Item_ID_Type
Food              6125
Non-Consumable    1599
Drink              799
Name: count, dtype: int64

In [23]:
# Non-consumable items get the fat-content label "non edible".
non_consumable = train_df['Item_ID_Type'] == "Non-Consumable"
train_df.loc[non_consumable, 'Item_Fat_Content'] = "non edible"


In [24]:
# Drop Item_Type and Item_Identifier in a single in-place call.
train_df.drop(columns=['Item_Type', 'Item_Identifier'], inplace=True)


In [25]:
# Label-encode the categorical columns to integer codes.
# A single encoder is re-fitted for each column, so after the loop `lencod`
# only remembers the classes of the last column in `ids`.
lencod = LabelEncoder()
ids = [
    'Outlet_Identifier',
    'Item_Fat_Content',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_ID_Type',
]
for column in ids:
    train_df[column] = lencod.fit_transform(train_df[column])

In [26]:
# Show the training frame after label encoding.
train_df

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_ID_Type
0,9.300,0,0.016047,249.8092,9,24,1,0,1,3735.1380,1
1,5.920,1,0.019278,48.2692,3,14,1,2,2,443.4228,0
2,17.500,0,0.016760,141.6180,9,24,1,0,1,2097.2700,1
3,19.200,1,0.066132,182.0950,0,25,1,2,0,732.3800,1
4,8.930,2,0.066132,53.8614,1,36,0,2,1,994.7052,2
...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0,0.056783,214.5218,1,36,0,2,1,2778.3834,1
8519,8.380,1,0.046982,108.1570,7,21,2,1,1,549.2850,1
8520,10.600,2,0.035186,85.1224,6,19,2,1,1,1193.1136,2
8521,7.210,1,0.145221,103.1332,3,14,1,2,2,1845.5976,1


In [27]:
# One-hot encode the (already label-encoded) categorical columns with pandas.
# Outlet_Identifier is not in this list, so it stays as an integer code.
# The OneHotEncoder instance that used to be created here was never used, and
# its import now lives in the imports cell at the top of the notebook.
cat_cols = ['Item_Fat_Content','Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_ID_Type']

train_df = pd.get_dummies(train_df, columns=cat_cols)


In [28]:
# Separate the target (Item_Outlet_Sales) from the features.
X = train_df.drop(columns='Item_Outlet_Sales')
y = train_df['Item_Outlet_Sales'].copy()

# Split into four pieces: train/test features and train/test targets.
# 33% of the rows are held out; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=20
)


In [29]:
# Show the training frame after one-hot encoding.
train_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Outlet_Size_0,...,Outlet_Location_Type_0,Outlet_Location_Type_1,Outlet_Location_Type_2,Outlet_Type_0,Outlet_Type_1,Outlet_Type_2,Outlet_Type_3,Item_ID_Type_0,Item_ID_Type_1,Item_ID_Type_2
0,9.300,0.016047,249.8092,9,24,3735.1380,True,False,False,False,...,True,False,False,False,True,False,False,False,True,False
1,5.920,0.019278,48.2692,3,14,443.4228,False,True,False,False,...,False,False,True,False,False,True,False,True,False,False
2,17.500,0.016760,141.6180,9,24,2097.2700,True,False,False,False,...,True,False,False,False,True,False,False,False,True,False
3,19.200,0.066132,182.0950,0,25,732.3800,False,True,False,False,...,False,False,True,True,False,False,False,False,True,False
4,8.930,0.066132,53.8614,1,36,994.7052,False,False,True,True,...,False,False,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,214.5218,1,36,2778.3834,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
8519,8.380,0.046982,108.1570,7,21,549.2850,False,True,False,False,...,False,True,False,False,True,False,False,False,True,False
8520,10.600,0.035186,85.1224,6,19,1193.1136,False,False,True,False,...,False,True,False,False,True,False,False,False,False,True
8521,7.210,0.145221,103.1332,3,14,1845.5976,False,True,False,False,...,False,False,True,False,False,True,False,False,True,False


# Linear Regression

In [30]:
# Fit a plain linear regression on the training split as a baseline.
lg = LinearRegression()

lg.fit(X=X_train, y=y_train)
LinearRegression()

In [31]:
# R^2 of the linear model on the data it was trained on.
train_score = lg.score(X_train, y_train)
print(f"Linear Regression Score: {train_score}")


Linear Regression Score: 0.5653400266659474


In [32]:
# R^2 of the linear model on the held-out split.
y_pred = lg.predict(X_test)
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r_squared)


R-squared: 0.5584900639072357


# Random Forest


In [33]:
# Seed the forest so its scores are reproducible across kernel restarts.
# Without random_state every run draws different bootstrap samples and
# feature subsets, so the printed numbers change on each run.
forest_reg = RandomForestRegressor(random_state=42)

forest_reg.fit(X_train, y_train)

# R^2 on the training split itself; compare it with the held-out R^2 in the
# next cell before drawing conclusions.
print(f"score of Random Forest Regressor model: {forest_reg.score(X_train, y_train)}")

score of Random Forest Regressor model: 0.9365903205658628


In [34]:
# R^2 of the random forest on the held-out split.
y_pred = forest_reg.predict(X_test)
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print("R-squared:", r_squared)


R-squared: 0.5484023770186618


# Ridge and Lasso

In [35]:
# Ridge regression with alpha=1.0, fitted on the training split.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

# Score it on the held-out split.
y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
coefficients = ridge.coef_

print(f"Mean Squared Error: {mse}")
print("Coefficients:", coefficients)
print("R-squared:", r_squared)

Mean Squared Error: 1273743.611227016
Coefficients: [   -3.14750864   142.76157777    15.73702723    22.72503984
   -49.0717286    -19.51829194    25.89056907    -6.37227713
   860.01734013  -431.54622533  -428.47111479   277.59999462
    41.05620369  -318.6561983  -1553.89735945  -333.6798246
  -472.07116536  2359.64834941   -16.03327129    22.40554842
    -6.37227713]
R-squared: 0.5584230412464379


In [36]:
# Lasso regression with alpha=1.0, fitted on the training split.
lasso = Lasso(alpha=1.0).fit(X_train, y_train)

# Predict the held-out split and report MSE and R^2.
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print("R-squared:", r_squared)

Mean Squared Error: 1273415.1286871515
R-squared: 0.5585369184189541


# XGBoost

In [37]:
# Gradient-boosted regressor with 100 estimators and a fixed seed.
reg = XGBRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split and report MSE and R^2.
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print("R-squared:", r_squared)

Mean Squared Error: 1328655.9233214527
R-squared: 0.539386233870871


# now for the test data

In [38]:
def pipline(in_df, reference_year=2023):
    """Apply the notebook's training-set preprocessing steps to a raw frame.

    Steps, in order: fill missing ``Outlet_Size``, impute ``Item_Weight``,
    replace zero ``Item_Visibility`` with the column mean, turn
    ``Outlet_Establishment_Year`` into ``reference_year - year``, normalise
    ``Item_Fat_Content``, derive ``Item_ID_Type``, drop ``Item_Type`` and
    ``Item_Identifier``, label-encode the categorical columns and one-hot
    encode all of them except ``Outlet_Identifier``.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Raw frame holding the columns used above. It is not modified: all
        work happens on a copy, so the function can be called repeatedly on
        the same frame and the caller keeps ``Item_Identifier``.
    reference_year : int, default 2023
        Year from which ``Outlet_Establishment_Year`` is subtracted.

    Returns
    -------
    pandas.DataFrame
        A new, fully encoded frame.

    Notes
    -----
    The imputer and the label encoders are fitted on ``in_df`` itself, not on
    the training data. The integer codes and dummy columns therefore match
    the training frame only if ``in_df`` contains the same category values.
    NOTE(review): confirm this for any new input, or refactor to reuse
    encoders fitted on the training set.
    """
    out_df = in_df.copy()

    # "Outlet_Size" of every outlet except OUT010 (['OUT045', 'OUT017'] in the
    # training data) becomes "Small"; OUT010 becomes "Medium".
    size_missing = out_df['Outlet_Size'].isnull()
    is_out010 = out_df['Outlet_Identifier'] == 'OUT010'
    out_df.loc[size_missing & ~is_out010, "Outlet_Size"] = "Small"
    out_df.loc[size_missing & is_out010, "Outlet_Size"] = "Medium"

    # Impute missing item weights from visibility, MRP and weight together.
    cols = ['Item_Visibility','Item_MRP','Item_Weight']
    imputed = IterativeImputer().fit_transform(out_df[cols])
    out_df['Item_Weight'] = imputed[:, cols.index('Item_Weight')]

    # Replace zero "Item_Visibility" values with the column mean. The mean is
    # evaluated before the assignment, so it still includes the zeros.
    out_df.loc[out_df['Item_Visibility']==0, 'Item_Visibility'] = out_df['Item_Visibility'].mean()

    # Number of years between the outlet's establishment and reference_year.
    out_df['Outlet_Establishment_Year'] = reference_year - out_df['Outlet_Establishment_Year']

    # 'reg'/'Regular' -> "Regular"; every other value -> 'Low Fat'.
    out_df['Item_Fat_Content'] = out_df['Item_Fat_Content'].apply(
        lambda x: "Regular" if x=='reg' or x == 'Regular' else 'Low Fat')

    # Item category from the first two characters of Item_Identifier.
    out_df['Item_ID_Type'] = out_df['Item_Identifier'].apply(
        lambda x: "Food" if x[:2] =='FD' else ("Drink" if x[:2]=='DR' else "Non-Consumable"))
    out_df.loc[out_df['Item_ID_Type'] == "Non-Consumable", 'Item_Fat_Content'] = "non edible"

    # Drop the 'Item_Type' and 'Item_Identifier' columns (on the copy only).
    out_df = out_df.drop(columns=['Item_Type', 'Item_Identifier'])

    # Label-encode the categorical columns with a local encoder, so the
    # function does not depend on a notebook-global encoder being defined.
    encoder = LabelEncoder()
    ids = ['Outlet_Identifier','Item_Fat_Content','Outlet_Size'
           ,'Outlet_Location_Type','Outlet_Type','Item_ID_Type']
    for column in ids:
        out_df[column] = encoder.fit_transform(out_df[column])

    # One-hot encode the categorical columns; Outlet_Identifier keeps its
    # integer code.
    cat_cols = ['Item_Fat_Content','Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_ID_Type']
    return pd.get_dummies(out_df, columns=cat_cols)

In [39]:
# Run the preprocessing function on the test split.
mod_test = pipline(test_df)


In [40]:
# Show the preprocessed test frame.
mod_test

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Outlet_Size_0,Outlet_Size_1,...,Outlet_Location_Type_0,Outlet_Location_Type_1,Outlet_Location_Type_2,Outlet_Type_0,Outlet_Type_1,Outlet_Type_2,Outlet_Type_3,Item_ID_Type_0,Item_ID_Type_1,Item_ID_Type_2
0,20.750000,0.007565,107.8622,9,24,True,False,False,False,True,...,True,False,False,False,True,False,False,False,True,False
1,8.300000,0.038428,87.3198,2,16,False,True,False,False,False,...,False,True,False,False,True,False,False,False,True,False
2,14.600000,0.099575,241.7538,0,25,False,False,True,False,True,...,False,False,True,True,False,False,False,False,False,True
3,7.315000,0.015388,155.0340,2,16,True,False,False,False,False,...,False,True,False,False,True,False,False,False,True,False
4,13.021946,0.118599,234.2300,5,38,False,True,False,False,True,...,False,False,True,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5676,10.500000,0.013496,141.3154,8,26,False,True,False,False,False,...,True,False,False,False,True,False,False,False,True,False
5677,7.600000,0.142991,169.1448,3,14,False,True,False,False,True,...,False,False,True,False,False,True,False,False,True,False
5678,10.000000,0.073529,118.7440,7,21,False,False,True,False,False,...,False,True,False,False,True,False,False,False,False,True
5679,15.300000,0.065684,214.6218,2,16,False,True,False,False,False,...,False,True,False,False,True,False,False,False,True,False


In [41]:
# Show the training frame again (presumably to compare its columns with mod_test).
train_df

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,Item_Fat_Content_0,Item_Fat_Content_1,Item_Fat_Content_2,Outlet_Size_0,...,Outlet_Location_Type_0,Outlet_Location_Type_1,Outlet_Location_Type_2,Outlet_Type_0,Outlet_Type_1,Outlet_Type_2,Outlet_Type_3,Item_ID_Type_0,Item_ID_Type_1,Item_ID_Type_2
0,9.300,0.016047,249.8092,9,24,3735.1380,True,False,False,False,...,True,False,False,False,True,False,False,False,True,False
1,5.920,0.019278,48.2692,3,14,443.4228,False,True,False,False,...,False,False,True,False,False,True,False,True,False,False
2,17.500,0.016760,141.6180,9,24,2097.2700,True,False,False,False,...,True,False,False,False,True,False,False,False,True,False
3,19.200,0.066132,182.0950,0,25,732.3800,False,True,False,False,...,False,False,True,True,False,False,False,False,True,False
4,8.930,0.066132,53.8614,1,36,994.7052,False,False,True,True,...,False,False,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,214.5218,1,36,2778.3834,True,False,False,True,...,False,False,True,False,True,False,False,False,True,False
8519,8.380,0.046982,108.1570,7,21,549.2850,False,True,False,False,...,False,True,False,False,True,False,False,False,True,False
8520,10.600,0.035186,85.1224,6,19,1193.1136,False,False,True,False,...,False,True,False,False,True,False,False,False,False,True
8521,7.210,0.145221,103.1332,3,14,1845.5976,False,True,False,False,...,False,False,True,False,False,True,False,False,True,False


In [43]:
# Predict on the preprocessed test frame with the fitted Lasso model.
# NOTE(review): assumes mod_test has the same columns, in the same order, as X_train.
predicted = lasso.predict(mod_test)


In [44]:
# Show the array of predictions.
predicted

array([1818.18621348, 1577.51150946, 1890.63159909, ..., 1927.03374175,
       3559.87357666, 1375.39414464])

In [46]:
# Wrap the predictions in a DataFrame. No column name or item/outlet
# identifiers are attached, so the single column gets pandas' default label 0.
df = pd.DataFrame(predicted)


In [47]:
# Write the predictions to result.csv without the row index.
csv_file_path = "result.csv"
df.to_csv(path_or_buf=csv_file_path, index=False)
