In [50]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import category_encoders as ce

In [51]:
df= pd.read_csv(r'C:\Users\varsh\Documents\ineuron 1\Train.csv')

In [52]:
# Normalize the 'Item_Fat_Content' column
df['Item_Fat_Content'] = df['Item_Fat_Content'].str.lower()  # Convert to lowercase

# Replace inconsistent values
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'lf': 'low fat', 'reg': 'regular'})

# Check the unique values again to confirm the cleaning
print(df['Item_Fat_Content'].unique())

['low fat' 'regular']


In [53]:
# Fill missing values in 'Outlet_Size' with a placeholder value, e.g., 'Unknown'
df['Outlet_Size'].fillna('Unknown', inplace=True)

# Check if missing values are handled
print(df['Outlet_Size'].isnull().sum())


0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Outlet_Size'].fillna('Unknown', inplace=True)


In [54]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,low fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,low fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,Unknown,Tier 3,Grocery Store,732.38
4,NCD19,8.93,low fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [55]:
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Assuming you have the label_encoders and scaler objects already defined
# Save Label Encoders and StandardScaler to a pickle file

# Initialize your label encoders and scaler (if not already done)
label_encoders = {}
scaler = StandardScaler()
knn_imputer = KNNImputer(n_neighbors=5)

# Columns to apply label encoding
columns_to_encode = ['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 
                     'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# Apply Label Encoding and save encoders
for col in columns_to_encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])  # Transform the column in place
    label_encoders[col] = le  # Save the encoder for future use

# Apply KNN Imputation and StandardScaler (if not already done)
columns_to_impute = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
df[columns_to_impute] = knn_imputer.fit_transform(df[columns_to_impute])

columns_to_scale = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Save encoders and scalers to a pickle file
with open('encoders_scalers.pkl', 'wb') as file:
    pickle.dump({
        'label_encoders': label_encoders,
        'scaler': scaler,
        'knn_imputer': knn_imputer
    }, file)

print("Encoders and Scalers saved to pickle file.")


Encoders and Scalers saved to pickle file.


In [60]:
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Outlet_Sales_log
0,156,-0.82489,0,-0.970732,4,1.747454,9,0.139541,1,0,1,3735.138,8.225808
1,8,-1.608127,1,-0.908111,14,-1.489023,3,1.334103,1,2,2,443.4228,6.096776
2,662,1.075271,0,-0.956917,10,0.01004,9,0.139541,1,0,1,2097.27,7.648868
3,1121,1.469207,1,-1.281758,6,0.66005,0,0.020085,3,2,0,732.38,6.597664
4,1297,-0.910629,0,-1.281758,9,-1.39922,1,-1.293934,0,2,1,994.7052,6.903451


In [56]:
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

# Step 1: Log Transform the target variable (assuming 'Item_Outlet_Sales' is your target)
df['Item_Outlet_Sales_log'] = np.log1p(df['Item_Outlet_Sales'])  # log(1 + x) to avoid issues with 0 values

# Step 2: Prepare data for XGBoost (assuming 'Item_Outlet_Sales_log' as the target)
X = df.drop(['Item_Outlet_Sales', 'Item_Outlet_Sales_log'], axis=1)  # Features
y = df['Item_Outlet_Sales_log']  # Target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize XGBoost model
xgb_model = XGBRegressor(objective='reg:squarederror')

# Step 4: Hyperparameter tuning using RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3]
}

random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Best parameters and model
print(f"Best Parameters: {random_search.best_params_}")
best_model = random_search.best_estimator_

# Step 5: Evaluate the model
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best Parameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.3, 'colsample_bytree': 0.9}
Mean Squared Error: 0.2684724001052167


In [57]:
from sklearn.metrics import mean_squared_error, r2_score

# Step 5: Evaluate the model
y_pred = best_model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate R² (R-squared)
r2 = r2_score(y_test, y_pred)
print(f"R² (R-squared): {r2}")


Mean Squared Error: 0.2684724001052167
R² (R-squared): 0.7446388849260286


In [58]:
import pickle

# Step 1: Save the trained model to a pickle file
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Model saved to model.pkl")


Model saved to model.pkl


In [59]:
df.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Outlet_Sales_log
0,156,-0.82489,0,-0.970732,4,1.747454,9,0.139541,1,0,1,3735.138,8.225808
1,8,-1.608127,1,-0.908111,14,-1.489023,3,1.334103,1,2,2,443.4228,6.096776
2,662,1.075271,0,-0.956917,10,0.01004,9,0.139541,1,0,1,2097.27,7.648868
3,1121,1.469207,1,-1.281758,6,0.66005,0,0.020085,3,2,0,732.38,6.597664
4,1297,-0.910629,0,-1.281758,9,-1.39922,1,-1.293934,0,2,1,994.7052,6.903451
5,758,-0.571149,1,-1.281758,0,-1.438734,3,1.334103,1,2,2,556.6088,6.323658
6,696,0.183123,1,-1.034813,13,-1.338238,1,-1.293934,0,2,1,343.5528,5.842247
7,738,0.96636,0,1.188838,13,-0.533641,5,-1.532846,1,2,3,4022.7636,8.299973
8,440,0.774026,1,-0.958331,5,-0.706908,7,0.497909,3,1,1,1076.5986,6.98249
9,990,1.469207,1,0.548845,5,0.752008,2,1.09519,3,1,1,4710.535,8.457769
