In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [127]:
# Load the raw customer table; later cells overwrite its categorical columns in place.
customer_csv_path = '/home/suhas/Desktop/office/Data Science/Notes/Data Analysis/Data/customer.csv'
df = pd.read_csv(customer_csv_path)

In [128]:
# Preview the first rows of the raw customer table.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


## Ordinal Encoding

In [129]:
from sklearn.preprocessing import OrdinalEncoder

# Spell out the category order so the codes follow Poor < Average < Good.
review_levels = ['Poor', 'Average', 'Good']
oe = OrdinalEncoder(categories=[review_levels])

# Learn the mapping from the single-column frame, then overwrite the text
# labels in 'review' with their ordinal codes.
oe.fit(df[['review']])
df['review'] = oe.transform(df[['review']])

In [130]:
# Check the result: 'review' now holds ordinal codes instead of text labels.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,1.0,School,No
1,68,Female,0.0,UG,No
2,70,Female,2.0,PG,No
3,72,Female,2.0,PG,No
4,16,Female,1.0,UG,No


In [131]:
# List the distinct education levels and how often each occurs.
df['education'].value_counts()

PG        18
School    16
UG        16
Name: education, dtype: int64

In [132]:
# List the distinct gender values and how often each occurs.
df['gender'].value_counts()

Female    29
Male      21
Name: gender, dtype: int64

In [133]:
## apply for two columns at a time

# One ordered category list per column; the lists are given in the same order
# as the columns handed to the encoder when it is fitted (gender, education).
gender_order = ['Male', 'Female']
education_order = ['School', 'UG', 'PG']
category = [gender_order, education_order]

oe1 = OrdinalEncoder(categories=category)

In [134]:
# Encode both columns in one call and write the codes back over the originals.
ordinal_cols = ['gender', 'education']
df[ordinal_cols] = oe1.fit_transform(df[ordinal_cols])

In [135]:
# Check the result: 'gender' and 'education' now hold ordinal codes.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,1.0,1.0,0.0,No
1,68,1.0,0.0,1.0,No
2,70,1.0,2.0,2.0,No
3,72,1.0,2.0,2.0,No
4,16,1.0,1.0,1.0,No


## Label encoding

In [136]:
# Reload the raw customer table so this section starts from the original text columns.
customer_csv_path = '/home/suhas/Desktop/office/Data Science/Notes/Data Analysis/Data/customer.csv'
df = pd.read_csv(customer_csv_path)

In [137]:
# Confirm the reload: the categorical columns are text again.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [138]:
from sklearn.preprocessing import LabelEncoder

# Encode the target column 'purchased' as integer class codes.
le = LabelEncoder()
le.fit(df['purchased'])

# Replace the text labels with the learned codes.
df['purchased'] = le.transform(df['purchased'])

In [139]:
# Spot-check five random rows: 'purchased' now holds integer codes.
# No random_state is passed, so the rows shown differ from run to run.
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
15,75,Male,Poor,UG,0
39,76,Male,Poor,PG,0
46,64,Female,Poor,PG,0
47,38,Female,Good,PG,1
1,68,Female,Poor,UG,0


## One-Hot Encoder

In [140]:
# Reload the raw customer table so this section starts from the original text columns.
customer_csv_path = '/home/suhas/Desktop/office/Data Science/Notes/Data Analysis/Data/customer.csv'
df = pd.read_csv(customer_csv_path)

In [141]:
# Confirm the reload: the categorical columns are text again.
df.head()

Unnamed: 0,age,gender,review,education,purchased
0,30,Female,Average,School,No
1,68,Female,Poor,UG,No
2,70,Female,Good,PG,No
3,72,Female,Good,PG,No
4,16,Female,Average,UG,No


In [142]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older installs.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

# 'gender' has two levels, so drop='first' leaves exactly one indicator column,
# which is written back over the original text column.
df['gender'] = ohe.fit_transform(df[['gender']])

In [143]:
# Spot-check five random rows after one-hot encoding 'gender'.
# No random_state is passed, so the rows shown differ from run to run.
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
26,53,0.0,Poor,PG,No
10,98,0.0,Good,UG,Yes
27,69,0.0,Poor,PG,No
17,22,0.0,Poor,UG,Yes
33,89,0.0,Good,PG,Yes


## Column Transformer

In [144]:
# Switch to the covid toy dataset for the ColumnTransformer example.
covid_csv_path = '/home/suhas/Desktop/office/Data Science/Notes/Data Analysis/Data/covid_toy.csv'
df = pd.read_csv(covid_csv_path)

In [145]:
# Preview the first rows of the covid toy table.
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [146]:
# Show dtypes and non-null counts per column to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        100 non-null    int64  
 1   gender     100 non-null    object 
 2   fever      90 non-null     float64
 3   cough      100 non-null    object 
 4   city       100 non-null    object 
 5   has_covid  100 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.8+ KB


In [147]:
# Percentage of missing values in each column.
df.isnull().mean() * 100

age           0.0
gender        0.0
fever        10.0
cough         0.0
city          0.0
has_covid     0.0
dtype: float64

In [148]:
# List the distinct cough levels and how often each occurs.
df['cough'].value_counts()

Mild      62
Strong    38
Name: cough, dtype: int64

In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder
from sklearn.impute import SimpleImputer

In [150]:
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older installs.
try:
    dense_ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    dense_ohe = OneHotEncoder(sparse=False, drop='first')

# Each entry is (name, transformer, columns). Columns not listed here are
# appended unchanged after the transformed ones (remainder='passthrough').
transformer = ColumnTransformer(
    transformers=[
        # 'fever' is the only column with missing values; fill them with the mean.
        ('tnf1', SimpleImputer(strategy='mean'), ['fever']),
        # Ordered categories: Mild < Strong.
        ('tnf2', OrdinalEncoder(categories=[['Mild', 'Strong']]), ['cough']),
        # Nominal categories: one indicator per level, first level dropped.
        ('tnf3', dense_ohe, ['gender', 'city']),
    ],
    remainder='passthrough',
)

transformed_data = transformer.fit_transform(df)

In [157]:
# Look up the fitted imputer registered under the name 'tnf1'.
# `named_transformers_` only exists once `transformer` has been fitted, so run
# this after the cell that calls fit_transform.
transformer.named_transformers_['tnf1']


SimpleImputer()

In [151]:
# Wrap the transformer output in a DataFrame; no column names are supplied,
# so the columns are labelled with integer positions.
transformed_df = pd.DataFrame(data=transformed_data)

In [152]:
# Display the transformed frame (columns are labelled by position, not by name).
transformed_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,103,0,1,0,1,0,60,No
1,100,0,1,1,0,0,27,Yes
2,101,0,1,1,0,0,42,No
3,98,0,0,0,1,0,31,No
4,101,0,0,0,0,1,65,No
...,...,...,...,...,...,...,...,...
95,104,0,0,0,0,0,12,No
96,101,1,0,0,1,0,51,Yes
97,101,0,0,0,0,0,20,No
98,98,1,0,0,0,1,5,No


In [153]:
# The passthrough target ended up as the last column; replace its text labels
# with integer class codes.
le = LabelEncoder()
target_labels = transformed_df.iloc[:, -1]
transformed_df.iloc[:, -1] = le.fit_transform(target_labels)

In [154]:
# Display the frame again: the last column now holds the label-encoded target.
transformed_df

Unnamed: 0,0,1,2,3,4,5,6,7
0,103,0,1,0,1,0,60,0
1,100,0,1,1,0,0,27,1
2,101,0,1,1,0,0,42,0
3,98,0,0,0,1,0,31,0
4,101,0,0,0,0,1,65,0
...,...,...,...,...,...,...,...,...
95,104,0,0,0,0,0,12,0
96,101,1,0,0,1,0,51,1
97,101,0,0,0,0,0,20,0
98,98,1,0,0,0,1,5,0


In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Sample DataFrame
data = {
    'sex': ['male', 'female', 'female', 'male', 'female'],
    'embarked': ['S', 'C', 'Q', 'S', 'C'],
    'embark_town': ['Southampton', 'Cherbourg', 'Queenstown', 'Southampton', 'Cherbourg']
}

df = pd.DataFrame(data)

# Columns to one-hot encode, named once so every step below uses the same list
categorical_cols = ['sex', 'embarked', 'embark_town']

# Step 1: Initialize OneHotEncoder
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older installs.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, drop='first')

# Step 2: Fit and transform the encoder on the specified columns
one_hot_encoded = one_hot_encoder.fit_transform(df[categorical_cols])

# Step 3: Get the encoded column names
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    encoded_columns = one_hot_encoder.get_feature_names_out(categorical_cols)
else:
    encoded_columns = one_hot_encoder.get_feature_names(categorical_cols)

# Step 4: Convert the encoded array into a DataFrame
# Reusing df's index keeps the rows aligned for the concat below without
# having to reset either index in place.
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=encoded_columns, index=df.index)

# Step 5: Concatenate with the original DataFrame (excluding the original categorical columns)
df_transformed = pd.concat([df.drop(columns=categorical_cols), one_hot_encoded_df], axis=1)

# Display the final DataFrame
print(df_transformed)


   sex_male  embarked_Q  embarked_S  embark_town_Queenstown  \
0       1.0         0.0         1.0                     0.0   
1       0.0         0.0         0.0                     0.0   
2       0.0         1.0         0.0                     1.0   
3       1.0         0.0         1.0                     0.0   
4       0.0         0.0         0.0                     0.0   

   embark_town_Southampton  
0                      1.0  
1                      0.0  
2                      0.0  
3                      1.0  
4                      0.0  


In [2]:
# Rich display of the one-hot encoded frame built in the previous cell.
df_transformed

Unnamed: 0,sex_male,embarked_Q,embarked_S,embark_town_Queenstown,embark_town_Southampton
0,1.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0
3,1.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0


## Applying Multiple Operations at One Time

In [None]:
# Example: assumes `df` holds the penguins dataset from seaborn

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# This cell encodes 'island' and 'sex', so `df` must already contain both
# columns. Fail early with a clear message if it holds some other frame.
encoded_cols = ['island', 'sex']
missing_cols = [col for col in encoded_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"df is missing the columns to encode: {missing_cols}")

# Initialize OneHotEncoder
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new spelling first and fall back for older installs.
try:
    ohe = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(drop='first', sparse=False)

# Create the ColumnTransformer to apply OneHotEncoder to both 'island' and 'sex' columns
column_transformer = ColumnTransformer(
    transformers=[
        ('island', ohe, ['island']),
        ('sex', ohe, ['sex'])
    ],
    remainder='passthrough'  # Keeps the other columns intact
)

# Apply the transformation
df_transformed = column_transformer.fit_transform(df)

# Access the transformed column names after fitting the ColumnTransformer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
island_encoder = column_transformer.named_transformers_['island']
sex_encoder = column_transformer.named_transformers_['sex']
if hasattr(island_encoder, 'get_feature_names_out'):
    island_columns = island_encoder.get_feature_names_out(['island'])
    sex_columns = sex_encoder.get_feature_names_out(['sex'])
else:
    island_columns = island_encoder.get_feature_names(['island'])
    sex_columns = sex_encoder.get_feature_names(['sex'])

# Combine the new column names (from OneHotEncoder) with the remaining original column names
# 'remainder' columns are in the same order as in the original DataFrame
remainder_columns = [col for col in df.columns if col not in encoded_cols]
columns = island_columns.tolist() + sex_columns.tolist() + remainder_columns

# Convert the transformed data to a DataFrame with appropriate column names
df_final = pd.DataFrame(df_transformed, columns=columns)

# Replace the original DataFrame with the transformed one
df = df_final


## Applying One Column at a Time

In [None]:
from sklearn.preprocessing import OneHotEncoder


def one_hot_encode_column(frame, column):
    """One-hot encode a single column of ``frame``, dropping the first level.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    tuple
        ``(encoder, encoded, encoded_frame)``: the fitted OneHotEncoder, the
        dense encoded array, and that array as a DataFrame whose columns are
        the encoder's feature names and whose index is ``frame.index``.
    """
    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword, so try the new spelling first and fall back for older installs.
    try:
        encoder = OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(drop='first', sparse=False)

    encoded = encoder.fit_transform(frame[[column]])

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever this installation provides.
    if hasattr(encoder, 'get_feature_names_out'):
        feature_names = encoder.get_feature_names_out([column])
    else:
        feature_names = encoder.get_feature_names([column])

    # Reuse the source index: a fresh RangeIndex would make pd.concat(axis=1)
    # misalign rows whenever `frame` does not have a default 0..n-1 index.
    encoded_frame = pd.DataFrame(encoded, columns=feature_names, index=frame.index)
    return encoder, encoded, encoded_frame


# NOTE: this cell needs `df` to still contain the raw 'sex' and 'island' columns.
sex_ohe, sex_encode, sex_df = one_hot_encode_column(df, 'sex')

# Append the indicator columns, then remove the original text column.
df = pd.concat([df, sex_df], axis=1).drop(columns='sex')

print(df)


## for another column

ohe, island_encoded, island_encoded_df = one_hot_encode_column(df, 'island')

df = pd.concat([df, island_encoded_df], axis=1).drop(columns='island')

print(df)