In [1]:
import pandas as pd

In [2]:
# Read the cleaned train and test CSV files into DataFrames.
train_data = pd.read_csv("train_data_cleaned.csv")
test_data = pd.read_csv("test_data_cleaned.csv")

### Encoding

In [4]:
# Binary encoding for Item_Fat_Content: 'low_fat' -> 0, 'regular' -> 1.
# Series.map yields NaN for any label that is not a key of the mapping.
fat_content_mapping = {'low_fat': 0, 'regular': 1}
for data in (train_data, test_data):
    encoded_fat_content = data['Item_Fat_Content'].map(fat_content_mapping)
    data['Item_Fat_Content'] = encoded_fat_content

In [5]:
# Ordinal encoding: replace each category label with its integer rank.
outlet_size_mapping = {
    'Small': 1,
    'Medium': 2,
    'High': 3,
}
outlet_location_type_mapping = {
    'Tier 3': 1,
    'Tier 2': 2,
    'Tier 1': 3,
}
# Column name -> mapping applied to that column, in the order they are encoded.
ordinal_mappings = {
    'Outlet_Size': outlet_size_mapping,
    'Outlet_Location_Type': outlet_location_type_mapping,
}
for df in (train_data, test_data):
    for column_name, mapping in ordinal_mappings.items():
        df[column_name] = df[column_name].map(mapping)

In [6]:
# Preview the first rows of the training data after the binary and ordinal encodings.
train_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,2,3,Supermarket Type1,3735.138
1,DRC01,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,2,1,Supermarket Type2,443.4228
2,FDN15,17.5,0,0.01676,Meat,141.618,OUT049,1999,2,3,Supermarket Type1,2097.27
3,FDX07,19.2,1,0.053931,Fruits and Vegetables,182.095,OUT010,1998,2,1,Grocery Store,732.38
4,NCD19,8.93,0,0.053931,Household,53.8614,OUT013,1987,3,1,Supermarket Type1,994.7052


In [7]:
# Preview the first rows of the test data after the binary and ordinal encodings.
test_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.75,0,0.007565,Snack Foods,107.8622,OUT049,1999,2,3,Supermarket Type1
1,FDW14,8.3,1,0.038428,Dairy,87.3198,OUT017,2007,1,2,Supermarket Type1
2,NCN55,14.6,0,0.099575,Others,241.7538,OUT010,1998,2,1,Grocery Store
3,FDQ58,7.315,0,0.015388,Snack Foods,155.034,OUT017,2007,1,2,Supermarket Type1
4,FDY38,12.5,1,0.118599,Dairy,234.23,OUT027,1985,2,1,Supermarket Type3


In [8]:
# One-hot encoding for 'Outlet_Type'.
train_data_encoded = pd.get_dummies(train_data, columns=['Outlet_Type'], prefix='Outlet_Type')
test_data_encoded = pd.get_dummies(test_data, columns=['Outlet_Type'], prefix='Outlet_Type')

# The two frames are encoded independently, so a category that occurs in only
# one of them would leave train and test with different indicator columns.
# The training indicators are the reference: ones absent from the test frame
# are added as all-zero columns and test-only ones are dropped. When both
# frames contain the same categories this step changes nothing.
outlet_type_columns = [col for col in train_data_encoded.columns if col.startswith('Outlet_Type_')]
test_other_columns = [col for col in test_data_encoded.columns if not col.startswith('Outlet_Type_')]
test_data_encoded = test_data_encoded.reindex(columns=test_other_columns + outlet_type_columns, fill_value=0)

# Store the indicator columns as integers (0/1).
train_data_encoded = train_data_encoded.astype({col: 'int' for col in outlet_type_columns})
test_data_encoded = test_data_encoded.astype({col: 'int' for col in outlet_type_columns})

In [9]:
# Preview the training data with the Outlet_Type indicator columns added.
train_data_encoded.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,FDA15,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,2,3,3735.138,0,1,0,0
1,DRC01,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,2,1,443.4228,0,0,1,0
2,FDN15,17.5,0,0.01676,Meat,141.618,OUT049,1999,2,3,2097.27,0,1,0,0
3,FDX07,19.2,1,0.053931,Fruits and Vegetables,182.095,OUT010,1998,2,1,732.38,1,0,0,0
4,NCD19,8.93,0,0.053931,Household,53.8614,OUT013,1987,3,1,994.7052,0,1,0,0


In [10]:
# Derive the outlet's age in years from its establishment year.
# NOTE(review): 2013 is presumably the year the sales data refers to - confirm.
OUTLET_AGE_REFERENCE_YEAR = 2013
for data in (train_data_encoded, test_data_encoded):
    establishment_year = data['Outlet_Establishment_Year']
    data['Outlet_Years'] = OUTLET_AGE_REFERENCE_YEAR - establishment_year

In [11]:
# The first two characters of Item_Identifier determine the item's category.
ITEM_CATEGORY_BY_PREFIX = {
    'FD': 'Food',
    'DR': 'Drink',
    'NC': 'Non_Consumable',
}


def map_item_category(item_identifier):
    """Return the item category encoded in the prefix of an item identifier.

    Parameters
    ----------
    item_identifier : str
        Identifier such as 'FDA15', 'DRC01' or 'NCD19'.

    Returns
    -------
    str
        'Food' for an 'FD' prefix, 'Drink' for 'DR', 'Non_Consumable' for
        'NC', and 'Unknown' for any other prefix or for a value that is not
        a string (for example NaN or None from a missing identifier).
    """
    # Missing identifiers reach this function as float NaN / None when it is
    # applied to a Series; they have no prefix to inspect.
    if not isinstance(item_identifier, str):
        return 'Unknown'
    return ITEM_CATEGORY_BY_PREFIX.get(item_identifier[:2], 'Unknown')
# Add the derived Item_Category column to both frames.
for data in (train_data_encoded, test_data_encoded):
    data['Item_Category'] = data['Item_Identifier'].apply(map_item_category)

In [12]:
# One-hot encoding for 'Item_Category'.
train_data_encoded = pd.get_dummies(train_data_encoded, columns=['Item_Category'], prefix='Item_Category')
test_data_encoded = pd.get_dummies(test_data_encoded, columns=['Item_Category'], prefix='Item_Category')

# The two frames are encoded independently, so a category that occurs in only
# one of them (for example the 'Unknown' fallback) would leave train and test
# with different indicator columns. The training indicators are the reference:
# ones absent from the test frame are added as all-zero columns and test-only
# ones are dropped. When both frames contain the same categories this step
# changes nothing.
item_category_columns = [col for col in train_data_encoded.columns if col.startswith('Item_Category_')]
test_remaining_columns = [col for col in test_data_encoded.columns if not col.startswith('Item_Category_')]
test_data_encoded = test_data_encoded.reindex(columns=test_remaining_columns + item_category_columns, fill_value=0)

# Store the indicator columns as integers (0/1).
train_data_encoded = train_data_encoded.astype({col: 'int' for col in item_category_columns})
test_data_encoded = test_data_encoded.astype({col: 'int' for col in item_category_columns})

In [13]:
# Preview the training data with Outlet_Years and the Item_Category indicator columns added.
train_data_encoded.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Years,Item_Category_Drink,Item_Category_Food,Item_Category_Non_Consumable
0,FDA15,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,2,3,3735.138,0,1,0,0,14,0,1,0
1,DRC01,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,2,1,443.4228,0,0,1,0,4,1,0,0
2,FDN15,17.5,0,0.01676,Meat,141.618,OUT049,1999,2,3,2097.27,0,1,0,0,14,0,1,0
3,FDX07,19.2,1,0.053931,Fruits and Vegetables,182.095,OUT010,1998,2,1,732.38,1,0,0,0,15,0,1,0
4,NCD19,8.93,0,0.053931,Household,53.8614,OUT013,1987,3,1,994.7052,0,1,0,0,26,0,0,1


In [14]:
# Drop the columns that are not carried into the final datasets.
# Outlet_Establishment_Year is already represented by the derived Outlet_Years column.
columns_to_remove = ['Item_Type', 'Outlet_Establishment_Year']
train_data_final, test_data_final = (
    encoded.drop(columns=columns_to_remove)
    for encoded in (train_data_encoded, test_data_encoded)
)

In [15]:
# (rows, columns) of the final training frame.
train_data_final.shape

(8523, 17)

In [16]:
# (rows, columns) of the final test frame.
test_data_final.shape

(5681, 16)

In [17]:
# Write the final frames to CSV without the DataFrame index.
final_outputs = (
    (train_data_final, 'train_data_final.csv'),
    (test_data_final, 'test_data_final.csv'),
)
for output_frame, output_path in final_outputs:
    output_frame.to_csv(output_path, index=False)