In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
import pickle

# Load the dataset
big_mart_data = pd.read_csv('Train.csv')

# Data preprocessing
# Fill missing values in 'Item_Weight' with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form is chained
# assignment, which emits a FutureWarning on pandas 2.x and leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
big_mart_data['Item_Weight'] = big_mart_data['Item_Weight'].fillna(
    big_mart_data['Item_Weight'].mean()
)

# Fill missing values in 'Outlet_Size' with the most frequent size observed
# for the same 'Outlet_Type'. The pivot table has a single row labelled
# 'Outlet_Size' and one column per outlet type.
mode_of_Outlet_size = big_mart_data.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=lambda x: x.mode()[0],
)
miss_values = big_mart_data['Outlet_Size'].isnull()
# Take that single row as an Outlet_Type -> size lookup and map it over the
# rows that need filling. This yields one scalar per row; indexing the pivot
# table inside apply() returned a one-element Series per row instead.
outlet_size_by_type = mode_of_Outlet_size.loc['Outlet_Size']
big_mart_data.loc[miss_values, 'Outlet_Size'] = (
    big_mart_data.loc[miss_values, 'Outlet_Type'].map(outlet_size_by_type)
)

# Standardize 'Item_Fat_Content' values so each category has one spelling.
# Assigned back for the same chained-assignment reason as above.
big_mart_data['Item_Fat_Content'] = big_mart_data['Item_Fat_Content'].replace(
    {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
)

# Encode categorical variables as integer labels.
# Each column gets its own LabelEncoder, kept in `label_encoders` keyed by
# column name. Refitting one shared encoder would overwrite the learned
# string -> integer mapping on every iteration, leaving no way to encode new
# raw data the same way or to decode labels back to their original values.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]
label_encoders = {}
for col in categorical_columns:
    # `encoder` remains bound to the last column's encoder after the loop.
    encoder = LabelEncoder()
    big_mart_data[col] = encoder.fit_transform(big_mart_data[col])
    label_encoders[col] = encoder

# Separate the target column from the predictor columns
Y = big_mart_data['Item_Outlet_Sales']
X = big_mart_data.drop(columns=['Item_Outlet_Sales'])

# Report the dimensions of both as a quick sanity check
print("Shape of features (X):", X.shape)
print("Shape of target (Y):", Y.shape)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

# Train an XGBoost regressor (default hyperparameters) on the training split
regressor = XGBRegressor()
regressor.fit(X_train, Y_train)

# Persist the fitted model to disk with pickle
model_filename = 'model.pkl'
with open(model_filename, mode='wb') as model_file:
    pickle.dump(regressor, model_file)

print(f"Model saved as {model_filename}")

# Predict on both splits with the fitted model
training_data_prediction = regressor.predict(X_train)
test_data_prediction = regressor.predict(X_test)

# Score each split with R squared (coefficient of determination)
r2_train = metrics.r2_score(Y_train, training_data_prediction)
r2_test = metrics.r2_score(Y_test, test_data_prediction)

# Report training first, then test, so the two scores can be compared
print('R Squared value on training data =', r2_train)
print('R Squared value on test data =', r2_test)

# List the feature columns of the full matrix and of the held-out split;
# the two lists are expected to match
print("Feature names used in the model:", list(X.columns))
print("Feature names in test data:", list(X_test.columns))

Shape of features (X): (8523, 11)
Shape of target (Y): (8523,)
Model saved as model.pkl
R Squared value on training data = 0.8762174618111388
R Squared value on test data = 0.5017253991620692
Feature names used in the model: ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
Feature names in test data: ['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
