In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data['source'] = 'train'
test_data['source'] = 'test'

In [4]:
data = pd.concat([train_data, test_data], ignore_index=True)

Handling Missing Values

In [5]:
item_avg_weight = data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
data['Item_Weight'].fillna(item_avg_weight, inplace=True)
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train
3,FDX07,19.20,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800,train
4,NCD19,8.93,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1,,test
14202,FDJ26,15.30,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1,,test


In [6]:
data.isnull().sum() # handled item weight missing values, ignore sales as it is in test data

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

In [7]:
outlet_size_mode = data.pivot_table(values='Outlet_Size', columns='Outlet_Type', 
                                    aggfunc=lambda x: x.mode()[0] if not x.mode().empty else np.nan)
outlet_size_mode

Outlet_Type,Grocery Store,Supermarket Type1,Supermarket Type2,Supermarket Type3
Outlet_Size,Small,Small,Medium,Medium


In [8]:
def impute_size(row):
    if pd.isnull(row['Outlet_Size']):
        return outlet_size_mode[row['Outlet_Type']]
    else:
        return row['Outlet_Size']
    
data['Outlet_Size'] = data.apply(impute_size, axis=1)
data.isnull().sum() # handled outlet size missing values

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales            5681
source                          0
dtype: int64

Standardizing the values

In [9]:
data['Item_Fat_Content'] = data['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})

In [10]:
#zero visibility values
visibility_means = data.groupby('Item_Type')['Item_Visibility'].transform('mean')
# Replace 0 visibility values with the mean of their respective Item_Type
zero_indices = data['Item_Visibility'] == 0
data.loc[zero_indices, 'Item_Visibility'] = visibility_means[zero_indices]
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train
3,FDX07,19.20,Regular,0.068571,Fruits and Vegetables,182.0950,OUT010,1998,"Outlet_Size Small Name: Grocery Store, dtyp...",Tier 3,Grocery Store,732.3800,train
4,NCD19,8.93,Low Fat,0.059716,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test
14202,FDJ26,15.30,Regular,0.067836,Canned,214.6218,OUT017,2007,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test


In [11]:
(data['Item_Visibility'] == 0).sum()

0

In [12]:
data['Item_Category'] = data['Item_Identifier'].apply(lambda x: x[:2])
# Map to meaningful categories
data['Item_Category'] = data['Item_Category'].map({
    'FD': 'Food',
    'DR': 'Drinks',
    'NC': 'Non-Consumable'
})
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Category
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train,Food
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train,Drinks
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train,Food
3,FDX07,19.20,Regular,0.068571,Fruits and Vegetables,182.0950,OUT010,1998,"Outlet_Size Small Name: Grocery Store, dtyp...",Tier 3,Grocery Store,732.3800,train,Food
4,NCD19,8.93,Low Fat,0.059716,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train,Non-Consumable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test,Food
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test,Food
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test,Non-Consumable
14202,FDJ26,15.30,Regular,0.067836,Canned,214.6218,OUT017,2007,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test,Food


In [13]:
data['Outlet_Age'] = 2013 - data['Outlet_Establishment_Year']
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Category,Outlet_Age
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train,Food,14
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train,Drinks,4
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train,Food,14
3,FDX07,19.20,Regular,0.068571,Fruits and Vegetables,182.0950,OUT010,1998,"Outlet_Size Small Name: Grocery Store, dtyp...",Tier 3,Grocery Store,732.3800,train,Food,15
4,NCD19,8.93,Low Fat,0.059716,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train,Non-Consumable,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test,Food,16
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test,Food,4
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test,Non-Consumable,11
14202,FDJ26,15.30,Regular,0.067836,Canned,214.6218,OUT017,2007,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test,Food,6


In [22]:
data['Outlet_Age_Bin'] = pd.cut(
    data['Outlet_Age'], 
    bins=[0, 5, 10, 15, 20, 30], 
    labels=['0-5', '6-10', '11-15', '16-20', '20+']
)

In [14]:
data['Item_MRP_Bin'] = pd.cut(
    data['Item_MRP'], 
    bins=[0, 50, 100, 150, 200, 300], 
    labels=['0-50', '50-100', '100-150', '150-200', '200+']
)

In [15]:
data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,source,Item_Category,Outlet_Age,Item_MRP_Bin
0,FDA15,9.30,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,train,Food,14,200+
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,train,Drinks,4,0-50
2,FDN15,17.50,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,train,Food,14,100-150
3,FDX07,19.20,Regular,0.068571,Fruits and Vegetables,182.0950,OUT010,1998,"Outlet_Size Small Name: Grocery Store, dtyp...",Tier 3,Grocery Store,732.3800,train,Food,15,150-200
4,NCD19,8.93,Low Fat,0.059716,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,train,Non-Consumable,26,50-100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1,,test,Food,16,100-150
14200,FDD47,7.60,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2,,test,Food,4,150-200
14201,NCO17,10.00,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test,Non-Consumable,11,100-150
14202,FDJ26,15.30,Regular,0.067836,Canned,214.6218,OUT017,2007,"Outlet_Size Small Name: Supermarket Type1, ...",Tier 2,Supermarket Type1,,test,Food,6,200+


In [16]:
data['Item_Visibility_Bin'] = pd.qcut(
    data['Item_Visibility'], 
    q=4, 
    labels=['Low', 'Medium', 'High', 'Very High']
)

In [17]:
data['Item_MRP_By_Outlet_Type'] = data['Item_MRP'] * data['Outlet_Type'].map({
    'Grocery Store': 1,
    'Supermarket Type1': 2,
    'Supermarket Type2': 3,
    'Supermarket Type3': 4
})

ENCODING CATEGORICAL VARIABLES

In [20]:
categorical_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier',
                       'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type',
                       'Item_Category', 'Outlet_Age_Bin', 'Item_MRP_Bin']

numerical_features = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 
                     'Outlet_Age', 'Item_Visibility_Normalized']

In [23]:
data_encoded = pd.get_dummies(data, columns=categorical_features, drop_first=True)

FEATURE SELECTION

In [24]:
cols_to_drop = ['Item_Identifier', 'Outlet_Establishment_Year', 'source']

In [25]:
if 'Item_Outlet_Sales' in data_encoded.columns:
    cols_to_drop.append('Item_Outlet_Sales')

In [26]:
data_encoded

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales,source,Outlet_Age,Item_Visibility_Bin,Item_MRP_By_Outlet_Type,...,Item_Category_Food,Item_Category_Non-Consumable,Outlet_Age_Bin_6-10,Outlet_Age_Bin_11-15,Outlet_Age_Bin_16-20,Outlet_Age_Bin_20+,Item_MRP_Bin_50-100,Item_MRP_Bin_100-150,Item_MRP_Bin_150-200,Item_MRP_Bin_200+
0,FDA15,9.30,0.016047,249.8092,1999,3735.1380,train,14,Low,499.6184,...,True,False,False,True,False,False,False,False,False,True
1,DRC01,5.92,0.019278,48.2692,2009,443.4228,train,4,Low,144.8076,...,False,False,False,False,False,False,False,False,False,False
2,FDN15,17.50,0.016760,141.6180,1999,2097.2700,train,14,Low,283.2360,...,True,False,False,True,False,False,False,True,False,False
3,FDX07,19.20,0.068571,182.0950,1998,732.3800,train,15,High,182.0950,...,True,False,False,True,False,False,False,False,True,False
4,NCD19,8.93,0.059716,53.8614,1987,994.7052,train,26,Medium,107.7228,...,False,True,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,FDB58,10.50,0.013496,141.3154,1997,,test,16,Low,282.6308,...,True,False,False,False,True,False,False,True,False,False
14200,FDD47,7.60,0.142991,169.1448,2009,,test,4,Very High,507.4344,...,True,False,False,False,False,False,False,False,True,False
14201,NCO17,10.00,0.073529,118.7440,2002,,test,11,High,237.4880,...,False,True,False,True,False,False,False,True,False,False
14202,FDJ26,15.30,0.067836,214.6218,2007,,test,6,High,429.2436,...,True,False,True,False,False,False,False,False,False,True


In [27]:
data_processed = data_encoded.drop(columns=cols_to_drop)
data_processed

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Age,Item_Visibility_Bin,Item_MRP_By_Outlet_Type,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,...,Item_Category_Food,Item_Category_Non-Consumable,Outlet_Age_Bin_6-10,Outlet_Age_Bin_11-15,Outlet_Age_Bin_16-20,Outlet_Age_Bin_20+,Item_MRP_Bin_50-100,Item_MRP_Bin_100-150,Item_MRP_Bin_150-200,Item_MRP_Bin_200+
0,9.30,0.016047,249.8092,14,Low,499.6184,False,False,False,False,...,True,False,False,True,False,False,False,False,False,True
1,5.92,0.019278,48.2692,4,Low,144.8076,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,17.50,0.016760,141.6180,14,Low,283.2360,False,False,False,False,...,True,False,False,True,False,False,False,True,False,False
3,19.20,0.068571,182.0950,15,High,182.0950,True,False,False,False,...,True,False,False,True,False,False,False,False,True,False
4,8.93,0.059716,53.8614,26,Medium,107.7228,False,False,False,False,...,False,True,False,False,False,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14199,10.50,0.013496,141.3154,16,Low,282.6308,True,False,False,False,...,True,False,False,False,True,False,False,True,False,False
14200,7.60,0.142991,169.1448,4,Very High,507.4344,True,False,False,False,...,True,False,False,False,False,False,False,False,True,False
14201,10.00,0.073529,118.7440,11,High,237.4880,False,False,False,False,...,False,True,False,True,False,False,False,True,False,False
14202,15.30,0.067836,214.6218,6,High,429.2436,True,False,False,True,...,True,False,True,False,False,False,False,False,False,True


In [28]:
target = train_data['Item_Outlet_Sales'].copy()

In [28]:
train_idx = data[data['source'] == 'train'].index
test_idx = data[data['source'] == 'test'].index

# Split the processed data
X_train = data_processed.iloc[train_idx]
X_test = data_processed.iloc[test_idx]

# Reattach the target variable to the training set
X_train['Item_Outlet_Sales'] = target.values

print("\nProcessed train shape:", X_train.shape)
print("Processed test shape:", X_test.shape)

# 7. FINAL CHECK
# Check if there are any missing values left
print("\nMissing values in processed train data:")
print(X_train.isnull().sum().sum())

print("\nMissing values in processed test data:")
print(X_test.isnull().sum().sum())

# 8. SAVE PROCESSED DATA
X_train.to_csv('train_processed.csv', index=False)
X_test.to_csv('test_processed.csv', index=False)

NameError: name 'data_processed' is not defined