In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [2]:
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

In [4]:
test

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Age,Item_Visibility_Bin,Item_MRP_By_Outlet_Type,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Category_Food,Item_Category_Non-Consumable,Outlet_Age_Bin_6-10,Outlet_Age_Bin_11-15,Outlet_Age_Bin_16-20,Outlet_Age_Bin_20+,Item_MRP_Bin_50-100,Item_MRP_Bin_100-150,Item_MRP_Bin_150-200,Item_MRP_Bin_200+
0,20.750,0.007565,107.8622,14,Low,215.7244,False,False,False,False,...,True,False,False,True,False,False,False,True,False,False
1,8.300,0.038428,87.3198,6,Medium,174.6396,True,False,False,False,...,True,False,True,False,False,False,True,False,False,False
2,14.600,0.099575,241.7538,15,Very High,241.7538,False,False,False,False,...,False,True,False,True,False,False,False,False,False,True
3,7.315,0.015388,155.0340,6,Low,310.0680,False,False,False,False,...,True,False,True,False,False,False,False,False,True,False
4,13.600,0.118599,234.2300,28,Very High,936.9200,True,False,False,False,...,True,False,False,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5676,10.500,0.013496,141.3154,16,Low,282.6308,True,False,False,False,...,True,False,False,False,True,False,False,True,False,False
5677,7.600,0.142991,169.1448,4,Very High,507.4344,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
5678,10.000,0.073529,118.7440,11,High,237.4880,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
5679,15.300,0.067836,214.6218,6,High,429.2436,True,False,False,True,...,True,False,True,False,False,False,False,False,False,True


In [3]:
# For Item_Visibility_Bin
visibility_bin_mapping = {
    'Low': 0,
    'Medium': 1,
    'High': 2,
    'Very High': 3
}
train['Item_Visibility_Bin'] = train['Item_Visibility_Bin'].map(visibility_bin_mapping)

# Make sure all boolean columns are converted to integers (0/1)
boolean_columns = [col for col in train.columns if train[col].dtype == bool]
for col in boolean_columns:
    train[col] = train[col].astype(int)

# Check for any remaining object/string columns
object_columns = train.select_dtypes(include=['object']).columns
print("Remaining object columns:", object_columns)

# If there are any other object columns, convert them using appropriate methods
# Example: using Label Encoding
from sklearn.preprocessing import LabelEncoder

for col in object_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])

Remaining object columns: Index([], dtype='object')


In [5]:
# For Item_Visibility_Bin
visibility_bin_mapping = {
    'Low': 0,
    'Medium': 1,
    'High': 2,
    'Very High': 3
}
test['Item_Visibility_Bin'] = test['Item_Visibility_Bin'].map(visibility_bin_mapping)

# Make sure all boolean columns are converted to integers (0/1)
boolean_columns = [col for col in test.columns if test[col].dtype == bool]
for col in boolean_columns:
    test[col] = test[col].astype(int)

# Check for any remaining object/string columns
object_columns = test.select_dtypes(include=['object']).columns
print("Remaining object columns:", object_columns)

# If there are any other object columns, convert them using appropriate methods
# Example: using Label Encoding
from sklearn.preprocessing import LabelEncoder

for col in object_columns:
    le = LabelEncoder()
    test[col] = le.fit_transform(test[col])

Remaining object columns: Index([], dtype='object')


In [4]:
train

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Age,Item_Visibility_Bin,Item_MRP_By_Outlet_Type,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Category_Non-Consumable,Outlet_Age_Bin_6-10,Outlet_Age_Bin_11-15,Outlet_Age_Bin_16-20,Outlet_Age_Bin_20+,Item_MRP_Bin_50-100,Item_MRP_Bin_100-150,Item_MRP_Bin_150-200,Item_MRP_Bin_200+,Item_Outlet_Sales
0,9.300,0.016047,249.8092,14,0,499.6184,0,0,0,0,...,0,0,1,0,0,0,0,0,1,3735.1380
1,5.920,0.019278,48.2692,4,0,144.8076,1,0,0,0,...,0,0,0,0,0,0,0,0,0,443.4228
2,17.500,0.016760,141.6180,14,0,283.2360,0,0,0,0,...,0,0,1,0,0,0,1,0,0,2097.2700
3,19.200,0.068571,182.0950,15,2,182.0950,1,0,0,0,...,0,0,1,0,0,0,0,1,0,732.3800
4,8.930,0.059716,53.8614,26,1,107.7228,0,0,0,0,...,1,0,0,0,1,1,0,0,0,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0.056783,214.5218,26,1,429.0436,0,0,0,0,...,0,0,0,0,1,0,0,0,1,2778.3834
8519,8.380,0.046982,108.1570,11,1,216.3140,1,0,0,0,...,0,0,1,0,0,0,1,0,0,549.2850
8520,10.600,0.035186,85.1224,9,1,170.2448,0,0,0,0,...,1,1,0,0,0,1,0,0,0,1193.1136
8521,7.210,0.145221,103.1332,4,3,309.3996,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1845.5976


In [6]:
X = train.drop('Item_Outlet_Sales', axis=1)
y = train[['Item_Outlet_Sales']]

In [7]:
y

Unnamed: 0,Item_Outlet_Sales
0,3735.1380
1,443.4228
2,2097.2700
3,732.3800
4,994.7052
...,...
8518,2778.3834
8519,549.2850
8520,1193.1136
8521,1845.5976


In [8]:
# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Age,Item_Visibility_Bin,Item_MRP_By_Outlet_Type,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Category_Food,Item_Category_Non-Consumable,Outlet_Age_Bin_6-10,Outlet_Age_Bin_11-15,Outlet_Age_Bin_16-20,Outlet_Age_Bin_20+,Item_MRP_Bin_50-100,Item_MRP_Bin_100-150,Item_MRP_Bin_150-200,Item_MRP_Bin_200+
549,9.500,0.035206,171.3448,14,1,342.6896,1,0,0,0,...,1,0,0,1,0,0,0,0,1,0
7757,18.000,0.047473,170.5422,11,1,341.0844,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
764,17.600,0.076122,111.7202,16,2,223.4404,1,0,0,0,...,1,0,0,0,1,0,0,1,0,0
6867,8.325,0.029845,41.6138,11,0,83.2276,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2716,12.850,0.137228,155.5630,16,3,311.1260,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,9.395,0.286345,139.1838,15,3,139.1838,1,0,0,0,...,1,0,0,1,0,0,0,1,0,0
5191,15.600,0.117575,75.6670,6,3,151.3340,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
5390,17.600,0.018944,237.3590,11,0,474.7180,0,0,0,0,...,0,1,0,1,0,0,0,0,0,1
860,20.350,0.054363,117.9466,6,1,235.8932,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0


In [7]:
X_val.shape

(1705, 50)

evalute model performance using rmse and r2 score

In [11]:
def evaluate_model(model, X_train, X_val, y_train, y_val):
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    
    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    
    train_r2 = r2_score(y_train, train_preds)
    val_r2 = r2_score(y_val, val_preds)
    
    print(f"Training RMSE: {train_rmse:.2f}")
    print(f"Validation RMSE: {val_rmse:.2f}")
    print(f"Training R²: {train_r2:.4f}")
    print(f"Validation R²: {val_r2:.4f}")
    
    return val_rmse, model

Exp with Diff Models

In [18]:
X_train

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Age,Item_Visibility_Bin,Item_MRP_By_Outlet_Type,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Category_Food,Item_Category_Non-Consumable,Outlet_Age_Bin_6-10,Outlet_Age_Bin_11-15,Outlet_Age_Bin_16-20,Outlet_Age_Bin_20+,Item_MRP_Bin_50-100,Item_MRP_Bin_100-150,Item_MRP_Bin_150-200,Item_MRP_Bin_200+
549,9.500,0.035206,171.3448,14,Medium,342.6896,True,False,False,False,...,True,False,False,True,False,False,False,False,True,False
7757,18.000,0.047473,170.5422,11,Medium,341.0844,False,False,False,False,...,False,True,False,True,False,False,False,False,True,False
764,17.600,0.076122,111.7202,16,High,223.4404,True,False,False,False,...,True,False,False,False,True,False,False,True,False,False
6867,8.325,0.029845,41.6138,11,Low,83.2276,False,False,False,False,...,True,False,False,True,False,False,False,False,False,False
2716,12.850,0.137228,155.5630,16,Very High,311.1260,False,False,False,False,...,True,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,9.395,0.286345,139.1838,15,Very High,139.1838,True,False,False,False,...,True,False,False,True,False,False,False,True,False,False
5191,15.600,0.117575,75.6670,6,Very High,151.3340,False,False,False,False,...,True,False,True,False,False,False,True,False,False,False
5390,17.600,0.018944,237.3590,11,Low,474.7180,False,False,False,False,...,False,True,False,True,False,False,False,False,False,True
860,20.350,0.054363,117.9466,6,Medium,235.8932,False,False,False,False,...,True,False,True,False,False,False,False,True,False,False


In [16]:
y_train

549     2386.2272
7757    3103.9596
764     1125.2020
6867     284.2966
2716    4224.5010
          ...    
5734     280.9676
5191    1301.6390
5390    6145.3340
860     1649.8524
7270     965.4100
Name: Item_Outlet_Sales, Length: 6818, dtype: float64

In [12]:
lr = LinearRegression()
lr_rmse, lr_model = evaluate_model(lr, X_train, X_val, y_train, y_val)

Training RMSE: 1104.07
Validation RMSE: 1033.84
Training R²: 0.5879
Validation R²: 0.6068


In [14]:
# Linear Regression

lr = LinearRegression()
lr_rmse, lr_model = evaluate_model(lr, X_train, X_val, y_train, y_val)

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf_rmse, rf_model = evaluate_model(rf, X_train, X_val, y_train, y_val)

# XGBoost 
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_rmse, xgb_model = evaluate_model(xgb_model, X_train, X_val, y_train, y_val)


Training RMSE: 1104.07
Validation RMSE: 1033.84
Training R²: 0.5879
Validation R²: 0.6068
Training RMSE: 432.42
Validation RMSE: 1076.66
Training R²: 0.9368
Validation R²: 0.5735
Training RMSE: 923.98
Validation RMSE: 1042.03
Training R²: 0.7114
Validation R²: 0.6005


In [15]:
#Decision Tree 
dt = DecisionTreeRegressor(random_state=42)
dt_rmse, dt_model = evaluate_model(dt, X_train, X_val, y_train, y_val)

Training RMSE: 0.00
Validation RMSE: 1489.71
Training R²: 1.0000
Validation R²: 0.1835


In [17]:
models = {
    'Linear Regression': lr_rmse,
    'Decision Tree': dt_rmse,
    'Random Forest': rf_rmse,
    'XGBoost': xgb_rmse
}

In [18]:
for model, rmse in sorted(models.items(), key=lambda x: x[1]):
    print(f"{model}: RMSE = {rmse:.2f}")

best_model_name = min(models, key=models.get)
print(f"\nBest model: {best_model_name} with RMSE = {models[best_model_name]:.2f}")

Linear Regression: RMSE = 1033.84
XGBoost: RMSE = 1042.03
Random Forest: RMSE = 1076.66
Decision Tree: RMSE = 1489.71

Best model: Linear Regression with RMSE = 1033.84


In [24]:
test_original = pd.read_csv('test.csv')

# Load or generate your predictions from the Linear Regression model
# If you saved your predictions:
# predictions = pd.read_csv('your_predictions.csv')['Item_Outlet_Sales']
# Or if you need to generate predictions now:
predictions = lr.predict(test)

In [26]:
print("Type of predictions:", type(predictions))
print("Shape of predictions:", predictions.shape)

Type of predictions: <class 'numpy.ndarray'>
Shape of predictions: (5681, 1)


In [27]:
if hasattr(predictions, 'ndim') and predictions.ndim > 1:
    predictions = predictions.flatten()
    print("Flattened predictions shape:", predictions.shape)

Flattened predictions shape: (5681,)


In [28]:
print("Length of test set:", len(test_original))
print("Length of predictions:", len(predictions))



Length of test set: 5681
Length of predictions: 5681


In [30]:
submission = pd.DataFrame({
    'Item_Identifier': test_original['Item_Identifier'].values,  # Explicitly get values
    'Outlet_Identifier': test_original['Outlet_Identifier'].values,  # Explicitly get values
    'Item_Outlet_Sales': np.round(predictions).astype(int)  # Round and convert to integer
})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1870
1,FDW14,OUT017,1523
2,NCN55,OUT010,1188
3,FDQ58,OUT017,2579
4,FDY38,OUT027,6099
