## Handle Categorical Features

### 1. One Hot Encoding

In [3]:
import pandas as pd

In [3]:
df=pd.read_csv('Titanic-Dataset.csv',usecols=['Sex','Embarked'])

In [4]:
df.dropna(inplace=True)

In [5]:
df.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [6]:
pd.get_dummies(df['Sex'],drop_first=True).astype(int).head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


#### One hot encoding with many categories in a feature

In [7]:
df=pd.read_csv('mercedes.csv',usecols=["X0","X1","X2","X3","X4","X5","X6"])

In [8]:
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


- Find most frequent values in X1 column
- Take top 10 values from that and create separate column like get dummies

In [9]:
# Top 10 values that most repeated in X1 column
lst_10=df.X1.value_counts().head(10).index
lst_10=list(lst_10)
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [10]:
# create separate column for each value in top10
import numpy as np
for categories in lst_10:
    df[categories]=np.where(df['X1']==categories,1,0)

In [11]:
df.head(10)

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,aa,s,b,l,v,r,i,a,c,o
0,k,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0
5,t,b,e,c,d,g,h,0,0,1,0,0,0,0,0,0,0
6,al,r,e,f,d,f,h,0,0,0,0,0,1,0,0,0,0
7,o,l,as,f,d,f,j,0,0,0,1,0,0,0,0,0,0
8,w,s,as,e,d,f,i,0,1,0,0,0,0,0,0,0,0
9,j,b,aq,c,d,f,a,0,0,1,0,0,0,0,0,0,0


### 2. Ordinal Number Encoding

In [6]:
from sklearn.preprocessing import OrdinalEncoder

# Sample DataFrame
data = pd.DataFrame({
    'Education': ['High School', 'Bachelor\'s', 'Master\'s', 'PhD', 'Bachelor\'s']
})
# Define the order of categories
education_order = ['High School', 'Bachelor\'s', 'Master\'s', 'PhD']

# Apply ordinal encoding
encoder = OrdinalEncoder(categories=[education_order])
data['Education_Encoded'] = encoder.fit_transform(data[['Education']])

data

Unnamed: 0,Education,Education_Encoded
0,High School,0.0
1,Bachelor's,1.0
2,Master's,2.0
3,PhD,3.0
4,Bachelor's,1.0


### 2.1 : Ordinal Encoding with map

In [13]:
result = {'High School':0, 'Bachelor\'s':1, 'Master\'s':2, 'PhD':3}
data['Education_Encoded_Map'] = data['Education'].map(result)
data.head()

Unnamed: 0,Education,Education_Encoded,Education_Encoded_Map
0,High School,0.0,0
1,Bachelor's,1.0,1
2,Master's,2.0,2
3,PhD,3.0,3
4,Bachelor's,1.0,1


#### 2.2 : Generating a DataFrame with Dates and Ordinal Weekday Encoding Using List Comprehension

In [7]:
import datetime

In [8]:
# List Comprehension
# List of dates from today back 14 days
today_date = datetime.datetime.today()
days = [today_date-datetime.timedelta(x) for x in range(15)]

data = pd.DataFrame(days) # convert it into DataFrame
data.columns=["Day"]

# Extract the weekday name and store it in WeekDay column
data['WeekDay'] = data['Day'].dt.day_name()
data.head()

Unnamed: 0,Day,WeekDay
0,2024-09-02 21:23:45.765053,Monday
1,2024-09-01 21:23:45.765053,Sunday
2,2024-08-31 21:23:45.765053,Saturday
3,2024-08-30 21:23:45.765053,Friday
4,2024-08-29 21:23:45.765053,Thursday


- Extract day of the week as an integer (1 for Monday to 7 for Sunday).
- Store in weekday_ordinal column

In [9]:
data["WeekDay_Ordinal"] = data['Day'].dt.weekday+1
data.head()

Unnamed: 0,Day,WeekDay,WeekDay_Ordinal
0,2024-09-02 21:23:45.765053,Monday,1
1,2024-09-01 21:23:45.765053,Sunday,7
2,2024-08-31 21:23:45.765053,Saturday,6
3,2024-08-30 21:23:45.765053,Friday,5
4,2024-08-29 21:23:45.765053,Thursday,4


### 3. Count Or Frequent Encodng

In [10]:
df = pd.read_csv("adult.csv")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [13]:
# Take country column 
country_map = df['country'].value_counts().to_dict()

In [14]:
df['country'] = df['country'].map(country_map)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,29170,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,29170,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,29170,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,29170,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,95,<=50K


#### Advantages
1. Easy to use
2. Not increasing fature space

#### Disadvantages
1. it will provide same weight if the frequencies are same

### 4. Target Guided Ordinal Encoding
1. Ordering the lables according to the target
2. Replace the labels by the joint probability of being 1 or 0

In [15]:
df=pd.read_csv('Titanic-Dataset.csv',usecols=['Cabin','Survived'])

In [16]:
df['Cabin'].fillna('Missing',inplace =True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('Missing',inplace =True)


In [17]:
# Extract the first character from the 'Cabin' column
df['Cabin'] = df['Cabin'].str[0]

In [18]:
df['Cabin'].value_counts()

Cabin
M    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: count, dtype: int64

In [19]:
df.groupby(['Cabin'])['Survived'].mean().sort_values()

Cabin
T    0.000000
M    0.299854
A    0.466667
G    0.500000
C    0.593220
F    0.615385
B    0.744681
E    0.750000
D    0.757576
Name: Survived, dtype: float64

In [21]:
# Determine the orders cabin based on mean survival rates
ordinal_labels = df.groupby(['Cabin'])['Survived'].mean().sort_values().index
ordinal_labels

Index(['T', 'M', 'A', 'G', 'C', 'F', 'B', 'E', 'D'], dtype='object', name='Cabin')

In [22]:
# Apply ordinal encoding
encoder = OrdinalEncoder(categories=[ordinal_labels])
df['Cabin_encode'] = encoder.fit_transform(df[['Cabin']])


In [23]:
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encode
0,0,M,1.0
1,1,C,4.0
2,1,M,1.0
3,1,C,4.0
4,0,M,1.0


### 5. Probability Ratio Encoding
1. Probability of Survived based on Cabin--- Categorical Feature
2. Probability of Not Survived---1-pr(Survived)
3. pr(Survived)/pr(Not Survived)
4. Dictonary to map cabin with probability
5. replace with the categorical feature

In [27]:
# calculates the mean of the Survived column for each cabin group.
prob_df=df.groupby(['Cabin'])['Survived'].mean()

# Conversion to DataFrame:
prob_df=pd.DataFrame(prob_df)
# Calculation of Died Column
prob_df['Died']=1-prob_df['Survived']
prob_df.head()

Unnamed: 0_level_0,Survived,Died
Cabin,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0.466667,0.533333
B,0.744681,0.255319
C,0.59322,0.40678
D,0.757576,0.242424
E,0.75,0.25


In [28]:
# Calculate the 'Probability_ratio' as the ratio of 'Survived' to 'Died'
prob_df['Probability_ratio']=prob_df['Survived']/prob_df['Died']

# Convert Probability_ratio column to a dictionary
probability_encoded = prob_df['Probability_ratio'].to_dict()

# Map the Probability_ratio back to the original df using the Cabin column
df['Cabin_Probability_encoded'] = df['Cabin'].map(probability_encoded)
df.head()

Unnamed: 0,Survived,Cabin,Cabin_encode,Cabin_Probability_encoded
0,0,M,1.0,0.428274
1,1,C,4.0,1.458333
2,1,M,1.0,0.428274
3,1,C,4.0,1.458333
4,0,M,1.0,0.428274
