# Data Preprocessing

In [332]:
import numpy as np 
import pandas as pd

In [333]:
# Load the CSV that this notebook preprocesses.
source_csv = "dataset1_filtering_done.csv"
df = pd.read_csv(source_csv)

In [334]:
# Preview the first five rows to sanity-check the loaded columns.
df.head()

Unnamed: 0.1,Unnamed: 0,Category,Description,carbs,protein,fat,fiber,calories,goal_tag,diet_type
0,0,milk,"milk, nfs",4.87,3.34,1.99,0.0,50.75,weight_loss,veg
1,1,milk,"milk, whole",4.67,3.28,3.2,0.0,60.6,weight_loss,veg
2,2,milk,"milk, low sodium, whole",4.46,3.1,3.46,0.0,61.38,weight_loss,veg
3,3,milk,"milk, calcium fortified, whole",4.67,3.28,3.2,0.0,60.6,weight_loss,veg
4,4,milk,"milk, calcium fortified, low fat (1%)",5.19,3.38,0.95,0.0,42.83,weight_loss,veg


In [335]:
### Checking for null values

In [336]:
# Number of missing values in each column (isna is the canonical name of isnull).
df.isna().sum()

Unnamed: 0     0
Category       0
Description    0
carbs          0
protein        0
fat            0
fiber          0
calories       0
goal_tag       0
diet_type      0
dtype: int64

## Handling Category column 

In [338]:
# Remove the stray "Unnamed: 0" column (presumably a row index written out by an
# earlier to_csv call). It is dropped by name rather than by position, and `df`
# is rebound instead of mutated in place, so re-running this cell can never
# remove a real column such as "Category"; errors="ignore" turns a second run
# into a no-op.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [339]:
# Count rows per category to see how unevenly the categories are populated.
df["Category"].value_counts()

Category
rice                                                        142
bread                                                       119
potato                                                      105
cookie                                                      100
coffee                                                       91
                                                           ... 
veal fricassee                                                1
banana split                                                  1
veal stew with potatoes and vegetables including carrots      1
soft serve                                                    1
dark-green leafy vegetable soup with meat                     1
Name: count, Length: 2294, dtype: int64

In [340]:
# Re-inspect the frame after dropping the first column.
df.head()

Unnamed: 0,Category,Description,carbs,protein,fat,fiber,calories,goal_tag,diet_type
0,milk,"milk, nfs",4.87,3.34,1.99,0.0,50.75,weight_loss,veg
1,milk,"milk, whole",4.67,3.28,3.2,0.0,60.6,weight_loss,veg
2,milk,"milk, low sodium, whole",4.46,3.1,3.46,0.0,61.38,weight_loss,veg
3,milk,"milk, calcium fortified, whole",4.67,3.28,3.2,0.0,60.6,weight_loss,veg
4,milk,"milk, calcium fortified, low fat (1%)",5.19,3.38,0.95,0.0,42.83,weight_loss,veg


So our categorical column is spread out in such a way that some categories have a lot of items, e.g. rice has 142 items, whereas
some categories have as few as one item, e.g. Supplements.
However, some high-value categories that were not in the original dataset, e.g. Supplements, have been added manually by me, and they are vital to a balanced vegetarian diet.

## The Plan: -
The plan is to apply frequency encoding to the categorical column, so that each category is represented by its frequency.
Scarcely populated categories will be dropped.
BUT we will keep important categories such as Supplements, Plant Protein and Dairy as they are, since they carry vital information for the model.

In [343]:
# Sparsely populated categories that must survive the rare-category filter.
important_scarse_categories = ["Dairy", "Plant Protein", "Supplements"]

In [344]:
# Rows per category, most frequent first.
category_counts = df["Category"].value_counts()

# A category is "rare" when it has fewer than 10 rows and is not one of the
# explicitly protected categories.
has_few_rows = category_counts < 10
is_protected = category_counts.index.isin(important_scarse_categories)
rare_categories = category_counts[has_few_rows & ~is_protected].index

In [345]:
# Collect the index labels of every row that belongs to a rare category, then
# remove those rows from the frame in place.
rare_row_labels = df.index[df["Category"].isin(rare_categories)]
df.drop(index=rare_row_labels, inplace=True)

In [346]:
# Show the 10 least frequent remaining categories to verify the filter.
df["Category"].value_counts().tail(10)

Category
tortilla chips           10
tortellini               10
pizza with pepperoni     10
potato salad with egg    10
fruit juice drink        10
water                    10
energy drink             10
Plant Protein             2
Supplements               1
Dairy                     1
Name: count, dtype: int64

Categories with fewer than 10 entries were dropped, except for the important categories, which were kept.

In [348]:
# Preview the frame after removing the rare categories.
df.head()

Unnamed: 0,Category,Description,carbs,protein,fat,fiber,calories,goal_tag,diet_type
0,milk,"milk, nfs",4.87,3.34,1.99,0.0,50.75,weight_loss,veg
1,milk,"milk, whole",4.67,3.28,3.2,0.0,60.6,weight_loss,veg
2,milk,"milk, low sodium, whole",4.46,3.1,3.46,0.0,61.38,weight_loss,veg
3,milk,"milk, calcium fortified, whole",4.67,3.28,3.2,0.0,60.6,weight_loss,veg
4,milk,"milk, calcium fortified, low fat (1%)",5.19,3.38,0.95,0.0,42.83,weight_loss,veg


### Standardizing Numerical Columns
using StandardScaler

In [350]:
from sklearn.preprocessing import StandardScaler

In [351]:
# Scaler instance; it is fitted on the numeric columns in the next cell.
scaler = StandardScaler()

In [352]:
# Nutrient columns to standardize.
num_cols = ["carbs", "protein", "fat", "fiber", "calories"]

# Fit the scaler on these columns and overwrite them with the scaled values.
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

In [353]:
# Check the standardized numeric columns.
df.head()

Unnamed: 0,Category,Description,carbs,protein,fat,fiber,calories,goal_tag,diet_type
0,milk,"milk, nfs",-0.803888,-0.708719,-0.802787,-0.756657,-1.161451,weight_loss,veg
1,milk,"milk, whole",-0.812894,-0.715474,-0.657086,-0.756657,-1.088393,weight_loss,veg
2,milk,"milk, low sodium, whole",-0.82235,-0.735738,-0.625779,-0.756657,-1.082608,weight_loss,veg
3,milk,"milk, calcium fortified, whole",-0.812894,-0.715474,-0.657086,-0.756657,-1.088393,weight_loss,veg
4,milk,"milk, calcium fortified, low fat (1%)",-0.789478,-0.704216,-0.928017,-0.756657,-1.220195,weight_loss,veg


### Encoding Categorical Data

In [355]:
from sklearn.preprocessing import LabelEncoder

In [356]:
# One encoder per column, so each keeps its own label-to-integer mapping.
le_goal, le_diet = LabelEncoder(), LabelEncoder()

In [357]:
# Replace the text labels in each column with the integer codes produced by
# that column's encoder.
for column, encoder in (("goal_tag", le_goal), ("diet_type", le_diet)):
    df[column] = encoder.fit_transform(df[column])

In [358]:
# Check the integer-encoded goal_tag and diet_type columns.
df.head()

Unnamed: 0,Category,Description,carbs,protein,fat,fiber,calories,goal_tag,diet_type
0,milk,"milk, nfs",-0.803888,-0.708719,-0.802787,-0.756657,-1.161451,4,1
1,milk,"milk, whole",-0.812894,-0.715474,-0.657086,-0.756657,-1.088393,4,1
2,milk,"milk, low sodium, whole",-0.82235,-0.735738,-0.625779,-0.756657,-1.082608,4,1
3,milk,"milk, calcium fortified, whole",-0.812894,-0.715474,-0.657086,-0.756657,-1.088393,4,1
4,milk,"milk, calcium fortified, low fat (1%)",-0.789478,-0.704216,-0.928017,-0.756657,-1.220195,4,1


In [359]:
# Persist the preprocessed frame (presumably consumed by a later stage).
# NOTE(review): to_csv writes the row index by default, so the file gains a
# leading unnamed column that a reader has to drop again; consider index=False
# once every consumer of this file has been checked.
# NOTE(review): the frequency encoding of "Category" described in the plan is
# not applied in the cells shown here, so "Category" is written as text -
# confirm whether that is handled elsewhere.
processed_csv = "dataset2_preprocessing_done.csv"
df.to_csv(processed_csv)