In [18]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd

In [20]:
data = pd.read_csv('data/datasets/sudarshan24byte/online-food-dataset/versions/1/onlinefoods.csv')

cleaned_data = data.copy()

In [21]:
cleaned_data.columns

Index(['Age', 'Gender', 'Marital Status', 'Occupation', 'Monthly Income',
       'Educational Qualifications', 'Family size', 'latitude', 'longitude',
       'Pin code', 'Output', 'Feedback', 'Unnamed: 12'],
      dtype='object')

In [22]:
new_columns = []

for i in cleaned_data.columns:
    currElm = i.split(" ")
    # Capitalize each word and join with underscore
    new_col = "_".join([c.capitalize() for c in currElm])
    new_columns.append(new_col)

cleaned_data.columns = new_columns

In [23]:
cleaned_data.rename(columns={'Unnamed:_12': 'Unnamed'}, inplace=True)

cleaned_data.columns

Index(['Age', 'Gender', 'Marital_Status', 'Occupation', 'Monthly_Income',
       'Educational_Qualifications', 'Family_Size', 'Latitude', 'Longitude',
       'Pin_Code', 'Output', 'Feedback', 'Unnamed'],
      dtype='object')

In [None]:
# cleaned_data[['Unnamed: 12']]

KeyError: "None of [Index(['Unnamed: 12'], dtype='object')] are in the [columns]"

In [28]:
columns_to_keep = ['Age', 'Gender', 'Marital_Status', 'Occupation', 'Monthly_Income',
       'Educational_Qualifications', 'Family_Size', 'Output', 'Feedback']

columns_to_drop = ['Latitude', 'Longitude',
       'Pin_Code', 'Unnamed']

print(f"Length of columns to keep: {len(columns_to_keep)}")
print(f"Length of columns to drop: {len(columns_to_drop)}")

Length of columns to keep: 9
Length of columns to drop: 4


In [29]:
cleaned_data.drop(columns=columns_to_drop, inplace=True)

cleaned_data.columns

Index(['Age', 'Gender', 'Marital_Status', 'Occupation', 'Monthly_Income',
       'Educational_Qualifications', 'Family_Size', 'Output', 'Feedback'],
      dtype='object')

In [30]:
cleaned_data.duplicated().sum()

np.int64(144)

In [31]:
cleaned_data.drop_duplicates(keep='first', inplace=True)

cleaned_data.duplicated().sum()

np.int64(0)

In [32]:
cleaned_data.reset_index()

Unnamed: 0,index,Age,Gender,Marital_Status,Occupation,Monthly_Income,Educational_Qualifications,Family_Size,Output,Feedback
0,0,20,Female,Single,Student,No Income,Post Graduate,4,Yes,Positive
1,1,24,Female,Single,Student,Below Rs.10000,Graduate,3,Yes,Positive
2,2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,Yes,Negative
3,3,22,Female,Single,Student,No Income,Graduate,6,Yes,Positive
4,4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...
239,355,21,Male,Single,Student,No Income,Graduate,2,No,Positive
240,363,31,Male,Married,Employee,More than 50000,Ph.D,3,Yes,Positive
241,369,30,Male,Married,Employee,More than 50000,Post Graduate,6,Yes,Positive
242,374,21,Male,Single,Student,No Income,Graduate,3,Yes,Negative


In [33]:
cleaned_data.isna().sum()

Age                           0
Gender                        0
Marital_Status                0
Occupation                    0
Monthly_Income                0
Educational_Qualifications    0
Family_Size                   0
Output                        0
Feedback                      0
dtype: int64

In [34]:
cleaned_data['Gender'] = cleaned_data['Gender'].apply(lambda x: 'F' if x == 'Female' else 'M')
cleaned_data['Gender']

0      F
1      F
2      M
3      F
4      M
      ..
355    M
363    M
369    M
374    M
386    M
Name: Gender, Length: 244, dtype: object

In [35]:
cleaned_data.head(3)

Unnamed: 0,Age,Gender,Marital_Status,Occupation,Monthly_Income,Educational_Qualifications,Family_Size,Output,Feedback
0,20,F,Single,Student,No Income,Post Graduate,4,Yes,Positive
1,24,F,Single,Student,Below Rs.10000,Graduate,3,Yes,Positive
2,22,M,Single,Student,Below Rs.10000,Post Graduate,3,Yes,Negative


In [38]:
cleaned_data['Family_Size'] = cleaned_data['Family_Size'].apply(lambda x: 'Small' if x < 3 else 'Large')
cleaned_data[['Family_Size']]

Unnamed: 0,Family_Size
0,Large
1,Large
2,Large
3,Large
4,Large
...,...
355,Small
363,Large
369,Large
374,Large


In [39]:
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 386
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         244 non-null    int64 
 1   Gender                      244 non-null    object
 2   Marital_Status              244 non-null    object
 3   Occupation                  244 non-null    object
 4   Monthly_Income              244 non-null    object
 5   Educational_Qualifications  244 non-null    object
 6   Family_Size                 244 non-null    object
 7   Output                      244 non-null    object
 8   Feedback                    244 non-null    object
dtypes: int64(1), object(8)
memory usage: 19.1+ KB


In [42]:
cleaned_data['Output'] = cleaned_data.Output.apply(lambda x: 1 if x == 'Yes' else 0)
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 244 entries, 0 to 386
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Age                         244 non-null    int64 
 1   Gender                      244 non-null    object
 2   Marital_Status              244 non-null    object
 3   Occupation                  244 non-null    object
 4   Monthly_Income              244 non-null    object
 5   Educational_Qualifications  244 non-null    object
 6   Family_Size                 244 non-null    object
 7   Output                      244 non-null    int64 
 8   Feedback                    244 non-null    object
dtypes: int64(2), object(7)
memory usage: 19.1+ KB


In [46]:
import os

os.makedirs('./data/cleaned_data', exist_ok=True)

cleaned_data.to_csv("./data/cleaned_data/HR_Cleaned_Data.csv", index=False)
cleaned_data.to_json("./data/cleaned_data/HR_Cleaned_Data.json", index=False, indent=True)