In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
train_df = pd.read_csv("../data/Train.csv")  # Change filename as needed

# Overview of the dataset
print(train_df.info())  # Check data types and missing values
print(train_df.describe())  # Summary statistics
print(train_df.head())  # Preview first few rows

# Check for missing values
print(train_df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
None
       Item_Weight  Item_Visibility     

In [8]:
# Fill missing numerical values with median
num_cols = train_df.select_dtypes(include=np.number).columns
train_df[num_cols] = train_df[num_cols].fillna(train_df[num_cols].median())

# Fill missing categorical values with mode
cat_cols = train_df.select_dtypes(include="object").columns
train_df[cat_cols] = train_df[cat_cols].fillna(train_df[cat_cols].mode().iloc[0])

# Confirm no missing values left
print(train_df.isnull().sum())


Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64


In [9]:
train_df['Price_per_Unit_Weight'] = train_df['Item_MRP'] / train_df['Item_Weight']
train_df['Item_Category'] = train_df['Item_Identifier'].str[:2]
train_df['Outlet_Age'] = 2025 - train_df['Outlet_Establishment_Year']
train_df['Item_Visibility_Bin'] = pd.qcut(train_df['Item_Visibility'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])
train_df['Combined_Item_Type'] = train_df['Item_Type'] + '_' + train_df['Item_Fat_Content']
train_df['Outlet_Location_Size'] = train_df['Outlet_Location_Type'] + '_' + train_df['Outlet_Size']


In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])


In [16]:
train_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,Price_per_Unit_Weight,Outlet_Age,Item_Fat_Content_1,Item_Fat_Content_2,...,Combined_Item_Type_Starchy Foods_LF,Combined_Item_Type_Starchy Foods_Low Fat,Combined_Item_Type_Starchy Foods_Regular,Combined_Item_Type_Starchy Foods_low fat,Combined_Item_Type_Starchy Foods_reg,Outlet_Location_Size_Tier 1_Small,Outlet_Location_Size_Tier 2_Medium,Outlet_Location_Size_Tier 2_Small,Outlet_Location_Size_Tier 3_High,Outlet_Location_Size_Tier 3_Medium
0,156,-0.831187,-0.970732,1.747454,0.139541,0.910601,26.861204,26,1,0,...,0,0,0,0,0,0,0,0,0,0
1,8,-1.630810,-0.908111,-1.489023,1.334103,-1.018440,8.153581,16,0,1,...,0,0,0,0,0,0,0,0,0,1
2,662,1.108727,-0.956917,0.010040,0.139541,-0.049238,8.092457,26,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1121,1.510904,-1.281758,0.660050,0.020085,-0.849103,9.484115,27,0,1,...,0,0,0,0,0,0,0,0,0,1
4,1297,-0.918719,-1.281758,-1.399220,-1.293934,-0.695373,6.031512,38,1,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,370,-1.407246,-0.181193,1.180783,-1.293934,0.349915,31.248623,38,1,0,...,0,0,0,0,0,0,0,0,1,0
8519,897,-1.048835,-0.371154,-0.527301,0.497909,-0.956402,12.906563,23,0,1,...,0,0,0,0,0,0,1,0,0,0
8520,1357,-0.523639,-0.599784,-0.897208,0.736822,-0.579100,8.030415,21,1,0,...,0,0,0,0,0,0,0,1,0,0
8521,681,-1.325628,1.532880,-0.607977,1.334103,-0.196725,14.304189,16,0,1,...,0,0,0,0,0,0,0,0,0,1


In [15]:
train_df = pd.get_dummies(train_df, columns=['Item_Fat_Content', 'Item_Category', 'Item_Type', 'Outlet_Identifier',
                                             'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type',
                                             'Item_Visibility_Bin', 'Combined_Item_Type', 'Outlet_Location_Size'],
                          drop_first=True)


In [18]:
X = train_df.drop('Item_Outlet_Sales', axis=1)
y = train_df['Item_Outlet_Sales']


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)


In [20]:
def evaluate_performance(y_true, y_pred, dataset_name):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"✅ Model Performance on {dataset_name} Set:")
    print(f"MAE: {mae:.4f}")
    print(f"MSE: {mse:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}\n")

# Evaluate
evaluate_performance(y_train, y_train_pred, "Training")
evaluate_performance(y_val, y_val_pred, "Validation")


✅ Model Performance on Training Set:
MAE: 0.1757
MSE: 0.0639
RMSE: 0.2528
R² Score: 0.9371

✅ Model Performance on Validation Set:
MAE: 0.4490
MSE: 0.4178
RMSE: 0.6464
R² Score: 0.5524



In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Define the model
rf = RandomForestRegressor(random_state=42)

# Define the hyperparameter grid
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],  # Number of trees
    'max_depth': [10, 20, 30, 40, None],  # Tree depth
    'min_samples_split': [2, 5, 10],  # Minimum samples to split
    'min_samples_leaf': [1, 2, 4],  # Minimum samples in leaf
    'max_features': ['auto', 'sqrt', 'log2']  # Number of features per split
}

# RandomizedSearchCV setup
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,  # Number of random settings to try
    scoring='neg_root_mean_squared_error',  # Optimize for lowest RMSE
    cv=5,  # 5-fold cross-validation
    verbose=2,
    n_jobs=-1  # Use all CPU cores
)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# Train the final model with best parameters
best_rf = RandomForestRegressor(**best_params, random_state=42)
best_rf.fit(X_train, y_train)

# Predict on validation set
y_pred = best_rf.predict(X_validate)

# Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_validate, y_pred)
mse = mean_squared_error(y_validate, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_validate, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'auto', 'max_depth': 10}


NameError: name 'X_validate' is not defined

[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   3.3s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  12.8s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   8.1s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   8.2s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=500; total time=   5.5s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.2s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   3.7s
[CV] END max_depth=None, max_fea

[CV] END max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=300; total time=   3.4s
[CV] END max_depth=40, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   1.4s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   2.0s
[CV] END max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=200; total time=   2.1s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  17.2s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=500; total time=  12.7s
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.2s
[CV] END max_depth=10, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=500; total time=  37.2s
[CV] END max_depth=None, max_fe

In [23]:
import seaborn as sns
import matplotlib.pyplot as plt

encoded_features = ["Outlet_Age", "Outlet_Size", "Outlet_Location_Type", "Outlet_Type", "Outlet_Identifier_Encoded"]

correlation_matrix = train_df[encoded_features + ["Item_Outlet_Sales"]].corr()
plt.figure(figsize=(10,6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()


KeyError: "['Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Outlet_Identifier_Encoded'] not in index"