# BigMart Sales Prediction - Feature Engineering


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline



In [2]:
# Load data
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

print(f"Training data: {train_data.shape}")
print(f"Test data: {test_data.shape}")

Training data: (8523, 12)
Test data: (5681, 11)


In [3]:
# Combine datasets for consistent feature engineering
train_copy = train_data.copy()
test_copy = test_data.copy()

train_copy['source'] = 'train'
test_copy['source'] = 'test'
test_copy['Item_Outlet_Sales'] = 0

combined_data = pd.concat([train_copy, test_copy], ignore_index=True)
print(f"Combined data shape: {combined_data.shape}")

Combined data shape: (14204, 13)


In [4]:
# Handle missing values - Item_Weight
print("Handling Item_Weight missing values...")
print(f"Missing Item_Weight: {combined_data['Item_Weight'].isnull().sum()}")

# Use item identifier to fill missing weights
weight_by_item = combined_data.groupby('Item_Identifier')['Item_Weight'].mean()
missing_weight_mask = combined_data['Item_Weight'].isna()

for idx in combined_data[missing_weight_mask].index:
    item_id = combined_data.loc[idx, 'Item_Identifier']
    if item_id in weight_by_item and not pd.isna(weight_by_item[item_id]):
        combined_data.loc[idx, 'Item_Weight'] = weight_by_item[item_id]

# Use item type for remaining missing values
weight_by_type = combined_data.groupby('Item_Type')['Item_Weight'].mean()
still_missing = combined_data['Item_Weight'].isna()
for idx in combined_data[still_missing].index:
    item_type = combined_data.loc[idx, 'Item_Type']
    combined_data.loc[idx, 'Item_Weight'] = weight_by_type[item_type]

# Fill any remaining with overall mean
combined_data['Item_Weight'].fillna(combined_data['Item_Weight'].mean(), inplace=True)

print(f"Remaining missing Item_Weight: {combined_data['Item_Weight'].isnull().sum()}")

Handling Item_Weight missing values...
Missing Item_Weight: 2439
Remaining missing Item_Weight: 0


In [5]:
# Handle missing values - Outlet_Size
print("Handling Outlet_Size missing values...")
print(f"Missing Outlet_Size: {combined_data['Outlet_Size'].isnull().sum()}")

# Fill based on Outlet_Type and Location patterns
for outlet_type in combined_data['Outlet_Type'].unique():
    for location in combined_data['Outlet_Location_Type'].unique():
        mask = ((combined_data['Outlet_Type'] == outlet_type) & 
               (combined_data['Outlet_Location_Type'] == location))
        subset = combined_data[mask]
        
        if len(subset) > 0 and not subset['Outlet_Size'].mode().empty:
            mode_size = subset['Outlet_Size'].mode()[0]
            null_mask = mask & combined_data['Outlet_Size'].isna()
            combined_data.loc[null_mask, 'Outlet_Size'] = mode_size

# Fill remaining by outlet type
for outlet_type in combined_data['Outlet_Type'].unique():
    type_mask = combined_data['Outlet_Type'] == outlet_type
    if not combined_data[type_mask]['Outlet_Size'].mode().empty:
        mode_size = combined_data[type_mask]['Outlet_Size'].mode()[0]
        null_mask = type_mask & combined_data['Outlet_Size'].isna()
        combined_data.loc[null_mask, 'Outlet_Size'] = mode_size

print(f"Remaining missing Outlet_Size: {combined_data['Outlet_Size'].isnull().sum()}")

Handling Outlet_Size missing values...
Missing Outlet_Size: 4016
Remaining missing Outlet_Size: 0


In [6]:
# Standardize Item_Fat_Content
print("Standardizing Item_Fat_Content...")
print("Original values:", combined_data['Item_Fat_Content'].unique())

fat_content_mapping = {
    'low fat': 'Low Fat',
    'LF': 'Low Fat', 
    'reg': 'Regular',
    'LOW FAT': 'Low Fat',
    'REGULAR': 'Regular'
}

combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace(fat_content_mapping)
print("Standardized values:", combined_data['Item_Fat_Content'].unique())

Standardizing Item_Fat_Content...
Original values: ['Low Fat' 'Regular' 'low fat' 'LF' 'reg']
Standardized values: ['Low Fat' 'Regular']


In [7]:
# Create temporal features
print("Creating temporal features...")

combined_data['Outlet_Years_Operating'] = 2013 - combined_data['Outlet_Establishment_Year']

# Categorize outlet age
combined_data['Outlet_Age_Category'] = pd.cut(
    combined_data['Outlet_Years_Operating'], 
    bins=[0, 8, 15, 30], 
    labels=['New', 'Established', 'Mature']
)

print("Outlet age categories:")
print(combined_data['Outlet_Age_Category'].value_counts())

Creating temporal features...
Outlet age categories:
Outlet_Age_Category
Established    5573
Mature         5542
New            3089
Name: count, dtype: int64


In [8]:
# Extract item identifier features
print("Creating item identifier features...")

combined_data['Item_Category_Code'] = combined_data['Item_Identifier'].str[:2]
combined_data['Item_Numeric_ID'] = pd.to_numeric(
    combined_data['Item_Identifier'].str[2:], errors='coerce'
).fillna(0).astype(int)

print("Item category codes:")
print(combined_data['Item_Category_Code'].value_counts())

Creating item identifier features...
Item category codes:
Item_Category_Code
FD    10201
NC     2686
DR     1317
Name: count, dtype: int64


In [9]:
# Create economic features
print("Creating economic features...")

combined_data['Price_per_Weight_Ratio'] = combined_data['Item_MRP'] / combined_data['Item_Weight']
combined_data['High_Value_Item'] = (
    combined_data['Item_MRP'] > combined_data['Item_MRP'].quantile(0.75)
).astype(int)

print(f"High value items: {combined_data['High_Value_Item'].sum()}")
print(f"Price/Weight ratio range: {combined_data['Price_per_Weight_Ratio'].min():.2f} - {combined_data['Price_per_Weight_Ratio'].max():.2f}")

Creating economic features...
High value items: 3549
Price/Weight ratio range: 1.67 - 50.78


In [10]:
# Handle item visibility
print("Processing item visibility...")

combined_data['Has_Zero_Visibility'] = (combined_data['Item_Visibility'] == 0).astype(int)
print(f"Items with zero visibility: {combined_data['Has_Zero_Visibility'].sum()}")

# Correct zero visibility using item type averages
visibility_by_type = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Type')['Item_Visibility'].mean()
combined_data['Item_Visibility_Corrected'] = combined_data['Item_Visibility'].copy()

zero_visibility_mask = combined_data['Item_Visibility'] == 0
for idx in combined_data[zero_visibility_mask].index:
    item_type = combined_data.loc[idx, 'Item_Type']
    combined_data.loc[idx, 'Item_Visibility_Corrected'] = visibility_by_type[item_type]

print(f"Corrected visibility range: {combined_data['Item_Visibility_Corrected'].min():.6f} - {combined_data['Item_Visibility_Corrected'].max():.6f}")

Processing item visibility...
Items with zero visibility: 879
Corrected visibility range: 0.003575 - 0.328391


In [11]:
# Create categorical bins
print("Creating categorical bins...")

# MRP price segments
combined_data['MRP_Price_Segment'] = pd.qcut(
    combined_data['Item_MRP'], q=5, 
    labels=['Budget', 'Economy', 'Standard', 'Premium', 'Luxury']
)

# Weight categories
combined_data['Weight_Category'] = pd.qcut(
    combined_data['Item_Weight'], q=4, 
    labels=['Light', 'Medium', 'Heavy', 'VeryHeavy']
)

print("MRP segments:")
print(combined_data['MRP_Price_Segment'].value_counts())
print("\nWeight categories:")
print(combined_data['Weight_Category'].value_counts())

Creating categorical bins...
MRP segments:
MRP_Price_Segment
Economy     2842
Budget      2841
Premium     2841
Standard    2840
Luxury      2840
Name: count, dtype: int64

Weight categories:
Weight_Category
Medium       3636
Light        3575
VeryHeavy    3508
Heavy        3485
Name: count, dtype: int64


In [12]:
# Create store performance features
print("Creating store performance features...")

train_mask = combined_data['source'] == 'train'
if train_mask.sum() > 0:
    store_performance = combined_data[train_mask].groupby('Outlet_Identifier')['Item_Outlet_Sales'].agg([
        'mean', 'std', 'count', 'median', 'min', 'max'
    ])
    store_performance.columns = [
        'Store_Avg_Sales', 'Store_Sales_Std', 'Store_Product_Count', 
        'Store_Median_Sales', 'Store_Min_Sales', 'Store_Max_Sales'
    ]
    store_performance['Store_Sales_Range'] = (
        store_performance['Store_Max_Sales'] - store_performance['Store_Min_Sales']
    )
    
    combined_data = combined_data.merge(
        store_performance, left_on='Outlet_Identifier', right_index=True, how='left'
    )
    
    # Fill missing values for test data
    overall_stats = {
        'Store_Avg_Sales': combined_data[train_mask]['Item_Outlet_Sales'].mean(),
        'Store_Sales_Std': combined_data[train_mask]['Item_Outlet_Sales'].std(),
        'Store_Product_Count': combined_data[train_mask].groupby('Outlet_Identifier').size().mean(),
        'Store_Median_Sales': combined_data[train_mask]['Item_Outlet_Sales'].median(),
        'Store_Min_Sales': combined_data[train_mask]['Item_Outlet_Sales'].min(),
        'Store_Max_Sales': combined_data[train_mask]['Item_Outlet_Sales'].max(),
        'Store_Sales_Range': combined_data[train_mask]['Item_Outlet_Sales'].max() - combined_data[train_mask]['Item_Outlet_Sales'].min()
    }
    
    for col, value in overall_stats.items():
        combined_data[col].fillna(value, inplace=True)

print("Store performance features created")

Creating store performance features...
Store performance features created


In [13]:
# Create market positioning features
print("Creating market positioning features...")

combined_data['Market_Segment'] = 'Standard'

# Premium segment
premium_mask = (
    (combined_data['Item_MRP'] > combined_data['Item_MRP'].quantile(0.8)) & 
    (combined_data['Item_Fat_Content'] == 'Low Fat')
)
combined_data.loc[premium_mask, 'Market_Segment'] = 'Premium'

# Budget segment
budget_mask = combined_data['Item_MRP'] < combined_data['Item_MRP'].quantile(0.2)
combined_data.loc[budget_mask, 'Market_Segment'] = 'Budget'

print("Market segments:")
print(combined_data['Market_Segment'].value_counts())

Creating market positioning features...
Market segments:
Market_Segment
Standard    9539
Budget      2841
Premium     1824
Name: count, dtype: int64


In [14]:
# Create interaction features
print("Creating interaction features...")

combined_data['Outlet_Item_Interaction'] = (
    combined_data['Outlet_Type'] + '_' + combined_data['Item_Type']
)
combined_data['Size_Location_Interaction'] = (
    combined_data['Outlet_Size'].fillna('Unknown') + '_' + 
    combined_data['Outlet_Location_Type']
)

print(f"Outlet-Item interactions: {combined_data['Outlet_Item_Interaction'].nunique()}")
print(f"Size-Location interactions: {combined_data['Size_Location_Interaction'].nunique()}")

Creating interaction features...
Outlet-Item interactions: 64
Size-Location interactions: 6


In [15]:
# Create advanced ratio features
print("Creating advanced ratio features...")

# Visibility ratio compared to item type average
item_type_avg_visibility = combined_data.groupby('Item_Type')['Item_Visibility_Corrected'].mean()
combined_data['Visibility_Type_Ratio'] = combined_data.apply(
    lambda x: x['Item_Visibility_Corrected'] / item_type_avg_visibility[x['Item_Type']], axis=1
)

# MRP ratio compared to item type average
item_type_avg_mrp = combined_data.groupby('Item_Type')['Item_MRP'].mean()
combined_data['MRP_Type_Ratio'] = combined_data.apply(
    lambda x: x['Item_MRP'] / item_type_avg_mrp[x['Item_Type']], axis=1
)

print(f"Visibility ratio range: {combined_data['Visibility_Type_Ratio'].min():.2f} - {combined_data['Visibility_Type_Ratio'].max():.2f}")
print(f"MRP ratio range: {combined_data['MRP_Type_Ratio'].min():.2f} - {combined_data['MRP_Type_Ratio'].max():.2f}")

Creating advanced ratio features...
Visibility ratio range: 0.05 - 5.16
MRP ratio range: 0.22 - 2.08


In [16]:
# Encode categorical features
print("Encoding categorical features...")

categorical_features = [
    'Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 
    'Outlet_Type', 'Item_Category_Code', 'Outlet_Age_Category', 'MRP_Price_Segment', 
    'Weight_Category', 'Market_Segment', 'Outlet_Item_Interaction', 'Size_Location_Interaction'
]

label_encoders = {}
for feature in categorical_features:
    if feature in combined_data.columns:
        combined_data[feature] = combined_data[feature].astype(str).replace('nan', 'Unknown')
        le = LabelEncoder()
        combined_data[feature] = le.fit_transform(combined_data[feature])
        label_encoders[feature] = le
        print(f"Encoded {feature}: {len(le.classes_)} unique values")

print(f"\nTotal features encoded: {len(label_encoders)}")

Encoding categorical features...
Encoded Item_Fat_Content: 2 unique values
Encoded Item_Type: 16 unique values
Encoded Outlet_Size: 3 unique values
Encoded Outlet_Location_Type: 3 unique values
Encoded Outlet_Type: 4 unique values
Encoded Item_Category_Code: 3 unique values
Encoded Outlet_Age_Category: 3 unique values
Encoded MRP_Price_Segment: 5 unique values
Encoded Weight_Category: 4 unique values
Encoded Market_Segment: 3 unique values
Encoded Outlet_Item_Interaction: 64 unique values
Encoded Size_Location_Interaction: 6 unique values

Total features encoded: 12


In [17]:
# Feature summary
print("Feature Engineering Summary:")
print("=" * 40)
print(f"Original features: {len(train_data.columns)-1}")
print(f"Final features: {len(combined_data.columns)-3}")
print(f"Features added: {len(combined_data.columns)-3-(len(train_data.columns)-1)}")

print("\nNew features created:")
new_features = [
    'Outlet_Years_Operating', 'Outlet_Age_Category', 'Item_Category_Code', 'Item_Numeric_ID',
    'Price_per_Weight_Ratio', 'High_Value_Item', 'Has_Zero_Visibility', 'Item_Visibility_Corrected',
    'MRP_Price_Segment', 'Weight_Category', 'Store_Avg_Sales', 'Store_Sales_Std', 'Store_Product_Count',
    'Store_Median_Sales', 'Store_Min_Sales', 'Store_Max_Sales', 'Store_Sales_Range',
    'Market_Segment', 'Outlet_Item_Interaction', 'Size_Location_Interaction',
    'Visibility_Type_Ratio', 'MRP_Type_Ratio'
]

for feature in new_features:
    if feature in combined_data.columns:
        print(f"- {feature}")

print(f"\nFinal dataset shape: {combined_data.shape}")
print(f"Missing values: {combined_data.isnull().sum().sum()}")

Feature Engineering Summary:
Original features: 11
Final features: 32
Features added: 21

New features created:
- Outlet_Years_Operating
- Outlet_Age_Category
- Item_Category_Code
- Item_Numeric_ID
- Price_per_Weight_Ratio
- High_Value_Item
- Has_Zero_Visibility
- Item_Visibility_Corrected
- MRP_Price_Segment
- Weight_Category
- Store_Avg_Sales
- Store_Sales_Std
- Store_Product_Count
- Store_Median_Sales
- Store_Min_Sales
- Store_Max_Sales
- Store_Sales_Range
- Market_Segment
- Outlet_Item_Interaction
- Size_Location_Interaction
- Visibility_Type_Ratio
- MRP_Type_Ratio

Final dataset shape: (14204, 35)
Missing values: 0


In [24]:
# Split back to train and test
train_engineered = combined_data[combined_data['source'] == 'train'].copy()
test_engineered = combined_data[combined_data['source'] == 'test'].copy()

# Remove helper columns
train_engineered = train_engineered.drop('source', axis=1)
test_engineered = test_engineered.drop(['source', 'Item_Outlet_Sales'], axis=1)

print(f"Engineered training data: {train_engineered.shape}")
print(f"Engineered test data: {test_engineered.shape}")

# Save engineered datasets
print("\nSaving engineered datasets...")
train_engineered.to_csv('outputs/train_engineered.csv', index=False)
test_engineered.to_csv('outputs/test_engineered.csv', index=False)
print("Feature engineering completed successfully!")

Engineered training data: (8523, 34)
Engineered test data: (5681, 33)

Saving engineered datasets...
Feature engineering completed successfully!
