## Model Training


#### 1.1 Import Data and Required Packages


##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [24]:
! pip install scikit-learn



In [65]:
# Basic Import
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt  
import seaborn as sns 
# Modelling
from sklearn.preprocessing import LabelEncoder 
from sklearn.metrics import mean_squared_error, r2_score  
from sklearn.neighbors import KNeighborsRegressor  
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
import warnings
import os
import sys
# Add the src folder to the system path
sys.path.append(os.path.abspath('../src'))

from logger import logging             # type:ignore

# Log the start of the notebook
logging.info('importing necessary libraries for model training')

In [66]:
#importing data 
data=pd.read_csv('../data/train.csv')
logging.info(f'reading data {data.head()}')
data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [67]:
df=data.copy()

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [69]:
df.shape

(8523, 12)

In [70]:
#checking null values
df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [71]:
#handling missing value
def handle_missing_values(df):
    # Impute Item_Weight with median
    df['Item_Weight'] = df['Item_Weight'].fillna(df['Item_Weight'].median())
    
    # Impute Outlet_Size with the first mode safely
    if not df['Outlet_Size'].mode().empty:  # Check if mode is available
        df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Size'].value_counts().index[1])
    else:
        logging.warning("Outlet_Size column has no mode value.")
    
    logging.info("Missing values handled")
    return df
handle_missing_values(df)
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [72]:
# logging.info(f'null values after handling missing value \n {df.isna().sum()}')
df.isna().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [73]:
#splitting target feature and independent features
y=df['Item_Outlet_Sales']
df=df.drop(columns='Item_Outlet_Sales')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


In [74]:
#encoding new extracted features
def create_features(df):
    # Example: Age of outlet
    df['Outlet_Age'] = 2024 - df['Outlet_Establishment_Year']
    
    # # Example: Item_Category from Item_Identifier
    # df['Item_Category'] = df['Item_Identifier'].str[:2]
    
    df.drop(columns=['Item_Identifier','Outlet_Establishment_Year'],axis=1,inplace=True)
    
    logging.info("New features created")
    return df

create_features(df)


Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age
0,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,25
1,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,15
2,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,Medium,Tier 1,Supermarket Type1,25
3,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,Small,Tier 3,Grocery Store,26
4,8.930,Low Fat,0.000000,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,37
...,...,...,...,...,...,...,...,...,...,...
8518,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,High,Tier 3,Supermarket Type1,37
8519,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,Small,Tier 2,Supermarket Type1,22
8520,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,Small,Tier 2,Supermarket Type1,20
8521,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,Medium,Tier 3,Supermarket Type2,15


In [75]:
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age
0,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,25
1,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,15
2,17.5,Low Fat,0.01676,Meat,141.618,OUT049,Medium,Tier 1,Supermarket Type1,25
3,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,Small,Tier 3,Grocery Store,26
4,8.93,Low Fat,0.0,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,37


In [76]:
def encoding_techniques(df):
    # Create Column Transformer with 3 types of transformers
    num_features = df.select_dtypes(exclude="object").columns
    cat_features = df.select_dtypes(include="object").columns

    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from sklearn.compose import ColumnTransformer

    numeric_transformer = StandardScaler()
    oh_transformer = OneHotEncoder()
    logging.info(f'encoding categorical columns \n {cat_features} ')
    logging.info(f'standardizing numerical columns \n {num_features} ')

    preprocessor = ColumnTransformer(
        [
            ("OneHotEncoder", oh_transformer, cat_features),
            ("StandardScaler", numeric_transformer, num_features),        
     ]
    )
    return preprocessor
scaler=encoding_techniques(df)
scaler

In [77]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
def build_preprocessing_pipeline(df):
    num_features = df.select_dtypes(exclude="object").columns
    cat_features = df.select_dtypes(include="object").columns

    numeric_transformer = StandardScaler()
    oh_transformer = OneHotEncoder()

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', oh_transformer, cat_features)
        ])
    
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
    # logging.info("Preprocessing pipeline created")
    
    return pipeline

In [78]:
scaler=build_preprocessing_pipeline(df)
scaler


In [79]:
df_array= scaler.fit_transform(df).toarray()
df_one=pd.DataFrame(df_array)

In [80]:
df_one

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,-0.831187,-0.970732,1.747454,-0.139541,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.630810,-0.908111,-1.489023,-1.334103,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.108727,-0.956917,0.010040,-0.139541,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.510904,-1.281758,0.660050,-0.020085,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,-0.918719,-1.281758,-1.399220,1.293934,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,-1.407246,-0.181193,1.180783,1.293934,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8519,-1.048835,-0.371154,-0.527301,-0.497909,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8520,-0.523639,-0.599784,-0.897208,-0.736822,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8521,-1.325628,1.532880,-0.607977,-1.334103,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [81]:
def split(X,y):
    # separate dataset into train and test
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=25)
    logging.info('splitting data into train_test_split')
    return X_train,X_test,y_train,y_test

X_train,X_test,y_train,y_test=split(df_one,y)
    

In [82]:
X_train.shape

(6818, 45)

#### Create an Evaluate Function to give all metrics after model Training

In [83]:
#model evalution 
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [84]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model
    logging.info(f'building and fitting {list(models.keys())[i]} model')

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    logging.info(f'Predicting {list(models.keys())[i]} model')
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)
    logging.info(f'defining evalution metrics function for {list(models.keys())[i]}')

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')


Linear Regression
Model performance for Training set
- Root Mean Squared Error: 1121.5691
- Mean Absolute Error: 830.8979
- R2 Score: 0.5651
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1151.2738
- Mean Absolute Error: 849.6159
- R2 Score: 0.5566


Lasso
Model performance for Training set
- Root Mean Squared Error: 1121.8490
- Mean Absolute Error: 830.7509
- R2 Score: 0.5648
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1149.2124
- Mean Absolute Error: 847.7349
- R2 Score: 0.5581


Ridge
Model performance for Training set
- Root Mean Squared Error: 1121.5694
- Mean Absolute Error: 830.8860
- R2 Score: 0.5651
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1151.2407
- Mean Absolute Error: 849.5804
- R2 Score: 0.5566


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 986.9321
- Mean Absolute Error: 700.8947
- R2 Sco

### Results

In [85]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
5,Random Forest Regressor,0.568349
1,Lasso,0.55815
2,Ridge,0.556589
0,Linear Regression,0.556563
3,K-Neighbors Regressor,0.489928
6,AdaBoost Regressor,0.458152
4,Decision Tree,0.231818


In [86]:
#importing data 
test_df=pd.read_csv('../data/test.csv')
logging.info('Reading test_df which is given by company for furthe rprediction')
test_df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.3,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.6,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.034,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [87]:
test_df.isna().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [88]:
handle_missing_values(test_df)
create_features(test_df)
logging.info('Handling missing values for test_df')
# test_df_one = pd.DataFrame(encoding_techniques(test_df).transform(test_df).toarray())
# test_df_one

In [89]:
test_df.isna().sum()
logging.info(f'count of nulls after handling missing values \n {test_df.isna().sum()}')

In [90]:
test_df.isna().sum()

Item_Weight             0
Item_Fat_Content        0
Item_Visibility         0
Item_Type               0
Item_MRP                0
Outlet_Identifier       0
Outlet_Size             0
Outlet_Location_Type    0
Outlet_Type             0
Outlet_Age              0
dtype: int64

In [91]:
test_df_array = scaler.transform(test_df).toarray()
test_df_one=pd.DataFrame(test_df_array)
logging.info(f'standardizing and encoding data \n {test_df_one}')

In [92]:
test_df_one

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,1.877595,-1.135138,-0.532035,-0.139541,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-1.067761,-0.536960,-0.861920,-1.095190,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.422660,0.648183,1.618094,-0.020085,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,-1.300788,-0.983503,0.225484,-1.095190,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.074147,1.016910,1.497272,1.532846,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5676,-0.547297,-1.020172,0.005181,0.099372,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
5677,-1.233364,1.489663,0.452086,-1.334103,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
5678,-0.665584,0.143358,-0.357287,-0.497909,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5679,0.588262,-1.281758,1.182389,-1.095190,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [93]:
#cross checking with a single row and columns weather our encoding codes or working or not
first_row=test_df.values[0]
columns=test_df.columns
dictionary = dict(zip(columns, first_row))
new_data=pd.DataFrame([dictionary])
new_data

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age
0,20.75,Low Fat,0.007565,Snack Foods,107.8622,OUT049,Medium,Tier 1,Supermarket Type1,25


In [94]:
#data after just transforming data into encoding model
pd.DataFrame(scaler.transform(new_data).toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,1.877595,-1.135138,-0.532035,-0.139541,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
