In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost 

from lightgbm import LGBMRegressor


In [2]:
# Load csv file
df = pd.read_csv('Train.csv')
Test  = pd.read_csv('Test.csv')

In [3]:
df.head(3)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27


In [4]:
Test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


### Find out the null values

In [5]:
df.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
Test.isnull().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

### Removing null values from the Item_Weight

In [7]:
#Determine the average weight per item:
item_avg_weight = df.pivot_table(values='Item_Weight', index='Item_Identifier') 

item_avg_weight_test = Test.pivot_table(values='Item_Weight', index='Item_Identifier') 

In [8]:
item_avg_weight.reset_index(inplace = True)

item_avg_weight_test.reset_index(inplace = True)

In [9]:
merged_df = df.merge(item_avg_weight, on='Item_Identifier', how='left' , suffixes = ('_x',''))
merged_df.fillna(item_avg_weight, inplace=True)

merged_df_test = Test.merge(item_avg_weight_test, on='Item_Identifier', how='left' , suffixes = ('_x',''))
merged_df_test.fillna(item_avg_weight, inplace=True)

In [10]:
merged_df.drop(columns = 'Item_Weight_x' , inplace = True)
df = merged_df

merged_df_test.drop(columns = 'Item_Weight_x' , inplace = True)
Test = merged_df_test

In [11]:
# To fill the remaining nan values we use the fillna with median strategy
df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())

Test['Item_Weight'] = Test['Item_Weight'].fillna(Test['Item_Weight'].median())

### Removing the null values from the Outlet_Size

In [12]:
# filling the missing values in "Outlet_Size" column with Mode
#Here we take Outlet_Size column & Outlet_Type column since they are correlated
mode_of_Outlet_size = df.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
mode_of_Outlet_size

mode_of_Outlet_size_t = Test.pivot_table(values='Outlet_Size', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
mode_of_Outlet_size_t

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [13]:
miss_values = df['Outlet_Size'].isnull()
df.loc[miss_values, 'Outlet_Size'] = df.loc[miss_values,'Outlet_Type'].apply(lambda x: mode_of_Outlet_size[x])

miss_values = Test['Outlet_Size'].isnull()
Test.loc[miss_values, 'Outlet_Size'] = Test.loc[miss_values,'Outlet_Type'].apply(lambda x: mode_of_Outlet_size_t[x])

In [14]:
# checking for missing values
df.isnull().sum()

Item_Identifier              0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
Item_Weight                  0
dtype: int64

In [15]:
Test.isnull().sum()

Item_Identifier              0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Weight                  0
dtype: int64

In [16]:
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'LF' : 'Low Fat' , 'reg' : 'Regular' , 'low fat' : 'Low Fat'})
df['Item_Fat_Content'].value_counts()

Test['Item_Fat_Content'] = Test['Item_Fat_Content'].replace({'LF' : 'Low Fat' , 'reg' : 'Regular' , 'low fat' : 'Low Fat'})
Test['Item_Fat_Content'].value_counts()

Low Fat    3668
Regular    2013
Name: Item_Fat_Content, dtype: int64

In [17]:
df['Outlet_Type'].value_counts()

Supermarket Type1    5577
Grocery Store        1083
Supermarket Type3     935
Supermarket Type2     928
Name: Outlet_Type, dtype: int64

In [18]:
df.groupby('Outlet_Type')['Item_Outlet_Sales'].sum()

Outlet_Type
Grocery Store        3.680343e+05
Supermarket Type1    1.291734e+07
Supermarket Type2    1.851823e+06
Supermarket Type3    3.453926e+06
Name: Item_Outlet_Sales, dtype: float64

In [19]:
x = df.drop(columns = ['Item_Outlet_Sales','Item_Fat_Content'])
x.head(3)

Unnamed: 0,Item_Identifier,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Weight
0,FDA15,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,9.3
1,DRC01,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,5.92
2,FDN15,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,17.5


In [20]:
y = df['Item_Outlet_Sales']
#y = np.log(1 + df['Item_Outlet_Sales'])
y.head(3)

0    3735.1380
1     443.4228
2    2097.2700
Name: Item_Outlet_Sales, dtype: float64

In [21]:
df['Outlet_Establishment_Year'] = 2013 - df['Outlet_Establishment_Year']
df['Outlet_Establishment_Year']

Test['Outlet_Establishment_Year'] = 2013 - Test['Outlet_Establishment_Year']
Test['Outlet_Establishment_Year']

0       14
1        6
2       15
3        6
4       28
        ..
5676    16
5677     4
5678    11
5679     6
5680    11
Name: Outlet_Establishment_Year, Length: 5681, dtype: int64

In [22]:
ohe = OneHotEncoder()
ohe.fit(x[['Item_Identifier','Item_Type','Outlet_Identifier']])

In [23]:
column_trans = make_column_transformer((OneHotEncoder(categories=ohe.categories_),['Item_Identifier','Item_Type','Outlet_Identifier']),
                                       (OrdinalEncoder(categories = [['High','Medium','Small']]),['Outlet_Size']),
                                       (OrdinalEncoder(categories = [['Tier 1','Tier 2','Tier 3']]), ['Outlet_Location_Type']),
                                       (OrdinalEncoder(categories = [['Supermarket Type1','Supermarket Type2','Supermarket Type3','Grocery Store']]),['Outlet_Type']),
                                      remainder = 'passthrough')

In [24]:
df

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Weight
0,FDA15,Low Fat,0.016047,Dairy,249.8092,OUT049,14,Medium,Tier 1,Supermarket Type1,3735.1380,9.300
1,DRC01,Regular,0.019278,Soft Drinks,48.2692,OUT018,4,Medium,Tier 3,Supermarket Type2,443.4228,5.920
2,FDN15,Low Fat,0.016760,Meat,141.6180,OUT049,14,Medium,Tier 1,Supermarket Type1,2097.2700,17.500
3,FDX07,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,15,Small,Tier 3,Grocery Store,732.3800,19.200
4,NCD19,Low Fat,0.000000,Household,53.8614,OUT013,26,High,Tier 3,Supermarket Type1,994.7052,8.930
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,Low Fat,0.056783,Snack Foods,214.5218,OUT013,26,High,Tier 3,Supermarket Type1,2778.3834,6.865
8519,FDS36,Regular,0.046982,Baking Goods,108.1570,OUT045,11,Small,Tier 2,Supermarket Type1,549.2850,8.380
8520,NCJ29,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,9,Small,Tier 2,Supermarket Type1,1193.1136,10.600
8521,FDN46,Regular,0.145221,Snack Foods,103.1332,OUT018,4,Medium,Tier 3,Supermarket Type2,1845.5976,7.210


In [25]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x , y , test_size = 0.2 )

In [26]:
lr = LinearRegression()

In [27]:
pipe = make_pipeline(column_trans , lr)

In [28]:
pipe.fit(x_train , y_train)

In [29]:
y_pred = pipe.predict(x_test)

In [30]:
r2_score(y_test , y_pred)

0.38693635326577247

In [31]:
import numpy as np
from sklearn.metrics import mean_squared_error

def regression_predictions(models, x_train, x_test, y_train, y_test):
    predictions = {}
    for model in models:
        model_name = model.__class__.__name__
        # fit the model to the training data
        pipe = make_pipeline(column_trans , model)
        pipe.fit(x_train, y_train)
        
        # make predictions on training and test set
        y_test_pred = pipe.predict(x_test)
        
        # calculate the mean squared error
        test_mse = mean_squared_error(y_test, y_test_pred)
        rmse = np.sqrt(test_mse)
        r2_score_ = r2_score(y_test, y_test_pred)
        predictions[model_name] = { 'test_mse': test_mse , 'r2_score' : r2_score_ , 'test_rmse':rmse}
    return predictions

In [32]:
models = [LinearRegression(),
         BaggingRegressor(estimator = LinearRegression()),
         GradientBoostingRegressor(loss = 'squared_error'),
         xgboost.XGBRegressor(objective='reg:squarederror'),
         LGBMRegressor()]
        
regression_predictions(models, x_train, x_test, y_train, y_test)

{'LinearRegression': {'test_mse': 1751062.8063041375,
  'r2_score': 0.38693635326577247,
  'test_rmse': 1323.2772975851046},
 'BaggingRegressor': {'test_mse': 1711179.6961892557,
  'r2_score': 0.4008998072561746,
  'test_rmse': 1308.1206734048872},
 'GradientBoostingRegressor': {'test_mse': 1219211.2700967335,
  'r2_score': 0.5731426053400226,
  'test_rmse': 1104.1790027421882},
 'XGBRegressor': {'test_mse': 1212456.6434087842,
  'r2_score': 0.5755074640159845,
  'test_rmse': 1101.1160898873397},
 'LGBMRegressor': {'test_mse': 1295334.1305412375,
  'r2_score': 0.5464912720720584,
  'test_rmse': 1138.1274667370249}}

In [33]:
XGB = xgboost.XGBRegressor(objective='reg:squarederror')

In [34]:
pipe = make_pipeline(column_trans , XGB)

In [35]:
pipe.fit(x_train , y_train)

In [36]:
y_pred = pipe.predict(x_test)

In [37]:
test_mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(test_mse)
r2_score_ = r2_score(y_test, y_pred)

In [38]:
predictions = { 'test_mse': test_mse , 'r2_score' : r2_score_ , 'test_rmse':rmse}
predictions

{'test_mse': 1212456.6434087842,
 'r2_score': 0.5755074640159845,
 'test_rmse': 1101.1160898873397}

In [39]:
Test.shape

(5681, 11)

In [40]:
adjusted_r2 = 1 - (1-r2_score_)*(len(Test)-1)/(len(Test)-11-1)
adjusted_r2

0.5746837882538001

In [41]:
x_test.shape

(1705, 10)

In [42]:
y_pred = pipe.predict(Test)

In [43]:
prediction = y_pred

In [44]:
prediction

array([1645.9194, 1372.8451,  597.5429, ..., 1854.1125, 3608.6353,
       1357.4684], dtype=float32)

In [45]:
import pandas as pd
df = pd.DataFrame(prediction , columns = ['Item_Outlet_Sales'])
df

Unnamed: 0,Item_Outlet_Sales
0,1645.919434
1,1372.845093
2,597.542908
3,2474.121826
4,5857.973145
...,...
5676,2066.043457
5677,2444.923340
5678,1854.112549
5679,3608.635254


In [46]:
new = Test[['Item_Identifier','Outlet_Identifier']]
new = new.join(df)

In [47]:
new.to_csv('Big_mart_price_prediction.csv' , index = False)

In [48]:
df[df['Item_Outlet_Sales'] < 0]

Unnamed: 0,Item_Outlet_Sales
104,-255.420486
