In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

: 

Data Collection & Analysis

In [None]:
# Load the Big Mart training set from the CSV file into a pandas DataFrame.
# The path is a raw string so the Windows backslashes stay literal: in a normal
# string '\P', '\D' and '\T' are invalid escape sequences (newer Python versions
# warn about them) and a folder starting with e.g. 't' or 'n' would silently
# become a tab/newline character.
# NOTE(review): this is an absolute, machine-specific path - consider making the
# data directory configurable.
big_mart_data = pd.read_csv(r'C:\Python Project_2\Data\Train_UWu5bXk.csv')
big_mart_data.head()

In [None]:
# Number of data points (rows) and number of features (columns)
big_mart_data.shape

In [None]:
# Print pandas' summary of the DataFrame (columns, non-null counts, dtypes)
big_mart_data.info()

Categorical Features:
- Item_Identifier 
- Item_Fat_Content 
- Item_Type   
- Outlet_Identifier 
- Outlet_Size
- Outlet_Location_Type
- Outlet_Type

In [None]:
# Count the missing entries in every column
big_mart_data.isna().sum()

Handling Missing Values

Mean -> the average of the observed values,
Mode -> the most frequently occurring value

In [None]:
# Mean value of the 'Item_Weight' column
big_mart_data['Item_Weight'].mean()

In [None]:
# Fill the missing values in the 'Item_Weight' column with the column mean.
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on the selected column: that form is chained
# in-place assignment, which pandas 2.x flags with a FutureWarning and which
# does not modify the DataFrame once Copy-on-Write is enabled.
item_weight_mean = big_mart_data['Item_Weight'].mean()
big_mart_data['Item_Weight'] = big_mart_data['Item_Weight'].fillna(item_weight_mean)

In [None]:
# Re-count the missing entries per column
big_mart_data.isna().sum()

Replacing the missing values in 'Outlet_Size' with mode

In [None]:
# Most frequent 'Outlet_Size' for each 'Outlet_Type' (one column per outlet type).
# mode() returns a Series; [0] picks its first entry.
mode_of_outlet_size = big_mart_data.pivot_table(
    columns='Outlet_Type',
    values='Outlet_Size',
    aggfunc=lambda sizes: sizes.mode()[0],
)

In [None]:
# Fill the missing 'Outlet_Size' entries with the overall most frequent size.
# mode() returns a Series; [0] takes its first value.
# The filled Series is assigned back rather than using fillna(..., inplace=True)
# on the selected column, which is chained in-place assignment (FutureWarning in
# pandas 2.x, no effect on the DataFrame under Copy-on-Write).
mode_outlet_size = big_mart_data['Outlet_Size'].mode()[0]
big_mart_data['Outlet_Size'] = big_mart_data['Outlet_Size'].fillna(mode_outlet_size)

In [None]:
# Show the most frequent 'Outlet_Size' per 'Outlet_Type'.
# NOTE(review): this table is only printed; the fill in this notebook uses the
# overall mode of 'Outlet_Size'.
print(mode_of_outlet_size)

In [None]:
# Boolean mask that is True for rows whose 'Outlet_Size' is null.
# NOTE(review): this runs after the mode fill in an earlier cell, so the mask is
# expected to be all False - confirm whether it was meant to be taken before the fill.
missing_values = big_mart_data['Outlet_Size'].isnull()

In [None]:
# Print the 'Outlet_Size' null mask (one boolean per row)
print(missing_values)

In [None]:
# Final check of the missing-value count per column
big_mart_data.isna().sum()

Data Analysis

In [None]:
# Statistical measures about the data (pandas describe() summary)
big_mart_data.describe()

Numerical Features

In [None]:
# Apply seaborn's default theme to all following plots
sns.set_theme()

In [None]:
# Item_Weight distribution
# displot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an extra empty figure and its size is never
# applied. The 6x6 inch size is requested through height/aspect instead.
sns.displot(big_mart_data['Item_Weight'], height=6, aspect=1)
plt.show()

In [None]:
# Item_Visibility distribution
# displot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an extra empty figure and its size is never
# applied. The 6x6 inch size is requested through height/aspect instead.
sns.displot(big_mart_data['Item_Visibility'], height=6, aspect=1)
plt.show()

In [None]:
# Item_MRP distribution
# displot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an extra empty figure and its size is never
# applied. The 6x6 inch size is requested through height/aspect instead.
sns.displot(big_mart_data['Item_MRP'], height=6, aspect=1)
plt.show()

In [None]:
# Item_Outlet_Sales distribution
# displot is a figure-level function that creates its own figure, so a preceding
# plt.figure(figsize=...) only leaves an extra empty figure and its size is never
# applied. The 6x6 inch size is requested through height/aspect instead.
sns.displot(big_mart_data['Item_Outlet_Sales'], height=6, aspect=1)
plt.show()

In [None]:
# Number of records per 'Outlet_Establishment_Year'
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=big_mart_data, x='Outlet_Establishment_Year', ax=ax)
plt.show()

Categorical Features

In [None]:
# Number of records per 'Item_Fat_Content' label
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=big_mart_data, x='Item_Fat_Content', ax=ax)
plt.show()

In [None]:
# Number of records per 'Item_Type' (wide figure for the many category labels)
fig, ax = plt.subplots(figsize=(20, 6))
sns.countplot(data=big_mart_data, x='Item_Type', ax=ax)
plt.show()

In [None]:
# Outlet_Size column
# plt.figure (lowercase) creates a 6x6 figure and makes it pyplot's current
# figure, so the countplot below draws into it. plt.Figure(...) only constructs
# a detached Figure object that is immediately discarded, so the requested
# figsize never took effect.
plt.figure(figsize=(6, 6))
sns.countplot(x='Outlet_Size', data=big_mart_data)
plt.title('Outlet Size')
plt.show()

Data Pre-Processing


In [None]:
# Frequency of each 'Item_Fat_Content' label as it appears in the data
big_mart_data['Item_Fat_Content'].value_counts()

In [None]:
# Map the alternative spellings of the fat-content labels onto 'Low Fat' / 'Regular'
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
big_mart_data.replace({'Item_Fat_Content': fat_content_aliases}, inplace=True)

In [None]:
# Frequency of each 'Item_Fat_Content' label after the replacement above
big_mart_data['Item_Fat_Content'].value_counts()

Label Encoding

In [None]:
# LabelEncoder instance used to turn categorical labels into integer codes
encoder  = LabelEncoder()

In [None]:
# Label-encode every categorical column.
# A separate LabelEncoder is fitted per column and kept in `label_encoders`, so
# each column's category <-> integer mapping stays available afterwards (e.g.
# label_encoders['Outlet_Size'].classes_ or .inverse_transform(...)). Re-fitting
# one shared encoder overwrites its mapping on every call, leaving only the last
# column's classes and making the hand-written codes used for manual predictions
# impossible to look up.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    big_mart_data[column] = label_encoders[column].fit_transform(big_mart_data[column])

# Backward compatibility: `encoder` ends up fitted on the last encoded column
# ('Outlet_Type'), exactly as it did when a single encoder was reused.
encoder = label_encoders['Outlet_Type']


In [None]:
# Preview the first rows after label encoding
big_mart_data.head()

Splitting Features and Target

In [None]:
# Separate the target column (Y) from the remaining feature columns (X)
target_column = 'Item_Outlet_Sales'
Y = big_mart_data[target_column]
X = big_mart_data.drop(columns=[target_column])

In [None]:
# Inspect the feature matrix
print(X)

In [None]:
# Inspect the target values
print(Y)

Splitting the Data into Training and Testing Data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the full feature matrix, the training split and the test split
print(X.shape, X_train.shape, X_test.shape)

Machine Learning Model Training

In [None]:
# XGBoost regressor created with no arguments, i.e. the library's default hyperparameters
regressor = XGBRegressor()

In [None]:
# Fit the regressor on the training split
regressor.fit(X_train, Y_train)

Evaluation

In [None]:
# Predictions of the fitted model on the training features
training_data_prediction = regressor.predict(X_train)

In [None]:
# Coefficient of determination (R squared) on the training split
r2_train = metrics.r2_score(y_true=Y_train, y_pred=training_data_prediction)
print("R squared value for training data: ", r2_train)

In [None]:
# Predictions of the fitted model on the held-out test features
test_data_prediction = regressor.predict(X_test)

In [None]:
# Coefficient of determination (R squared) on the test split
r2_test = metrics.r2_score(y_true=Y_test, y_pred=test_data_prediction)
print("R squared value for test data: ", r2_test)

In [None]:
# Use the first row of the test features as a demo input
sample_input = X_test.iloc[0]

# predict() needs a 2-D input: one row, one column per feature
sample_input_reshaped = sample_input.to_numpy().reshape(1, -1)

# Score that single row with the fitted model
predicted_sales = regressor.predict(sample_input_reshaped)

print("Sample Input (from test set):")
print(sample_input)
print("\nPredicted Sales:", predicted_sales[0])


In [None]:
# Manually define a new sample input.
# The values are keyed by feature name (categorical entries are label-encoded
# integer codes) and then arranged in the training column order. A bare
# positional array depends silently on that order: one swapped or missing value
# would be scored as a different feature without any error.
manual_features = {
    'Item_Identifier': 157,
    'Item_Weight': 12.5,
    'Item_Fat_Content': 0,
    'Item_Visibility': 0.05,
    'Item_Type': 10,
    'Item_MRP': 200.0,
    'Outlet_Identifier': 2,
    'Outlet_Establishment_Year': 1999,
    'Outlet_Size': 1,
    'Outlet_Location_Type': 1,
    'Outlet_Type': 0,
}

# Selecting by the training columns raises a KeyError if a feature is missing
# or misnamed, and guarantees the same column order the model was fitted on.
manual_input = pd.DataFrame([manual_features])[list(X_train.columns)]

# Predict using the trained model
predicted_sales_manual = regressor.predict(manual_input)

print("Predicted Sales for Manual Input:", predicted_sales_manual[0])
