In [13]:
import numpy as np 
import pandas as pd

In [85]:
data = {
    "Age": [25, 30, 22, 28, 35, 27, 40],
    "Gender": ["Male", "Female", "Male", "Female", "Male", "Female", "Male"],
    "Country": ["Pakistan", "Pakistan", "USA", "Pakistan", "USA", "Pakistan", "USA"],
    "Education_Level": ["High School", "Bachelors", "Masters", "Bachelors", "PhD", "Masters", "High School"],
    "Income": [50000, 60000, 55000, 52000, 65000, 58000, 70000],
    "Purchased": ["No", "Yes", "No", "Yes", "Yes", "No", "Yes"]
}

In [86]:
df = pd.DataFrame(data)

In [16]:
df.sample(5)

Unnamed: 0,Age,Gender,Country,Education_Level,Income,Purchased
3,28,Female,Pakistan,Bachelors,52000,Yes
1,30,Female,Pakistan,Bachelors,60000,Yes
2,22,Male,USA,Masters,55000,No
4,35,Male,USA,PhD,65000,Yes
5,27,Female,Pakistan,Masters,58000,No


In [17]:
df['Country'].value_counts()

Country
Pakistan    4
USA         3
Name: count, dtype: int64

In [23]:
df.isna().sum()

Age                0
Gender             0
Country            0
Education_Level    0
Income             0
Purchased          0
dtype: int64

In [24]:
df.duplicated().sum()

np.int64(0)

# Applying Traditional method to encode each column 

In [None]:
#  Age       :-  No need of Encoding
# Gender     :-  One Hot Encoding 
# Country    :-  One Hot Encoding
# Education_Level :- Ordinal Encoding
# Purchased   :- Label Encoding

In [26]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [47]:
le = LabelEncoder()
OE = OrdinalEncoder(categories=[["High School", "Bachelors", "Masters", "PhD"]])
OHE = OneHotEncoder(drop='first', sparse_output=False)

In [59]:
# Applying Label Encoding on "Purchased" column
purchased = le.fit_transform(df['Purchased'])
purchased

array([0, 1, 0, 1, 1, 0, 1])

In [48]:
# Applying Ordinal Encoding on "Education_Level" column
education_level = OE.fit_transform(df[['Education_Level']])
education_level

array([[0.],
       [1.],
       [2.],
       [1.],
       [3.],
       [2.],
       [0.]])

In [68]:
# Applying One-Hot-Encoding to "Gender and Country" columns
gender = OHE.fit_transform(df[['Country']])
country = OHE.fit_transform(df[['Gender']])

In [51]:
# gender_country

In [75]:

df_transformed = np.column_stack((
     df['Age'],
     education_level,
     df['Income'],
     gender,
     country,
     purchased,
 ))


In [78]:
transformed_DF = pd.DataFrame(df_transformed, columns=['Age','Gender','Education_Level','Income','Country','Purchased'])
transformed_DF

Unnamed: 0,Age,Gender,Education_Level,Income,Country,Purchased
0,25.0,0.0,0.0,50000.0,0.0,1.0
1,30.0,1.0,1.0,60000.0,0.0,0.0
2,22.0,0.0,2.0,55000.0,1.0,1.0
3,28.0,1.0,1.0,52000.0,0.0,0.0
4,35.0,1.0,3.0,65000.0,1.0,1.0
5,27.0,0.0,2.0,58000.0,0.0,0.0
6,40.0,1.0,0.0,70000.0,1.0,1.0


# Applying Column Transformer

In [87]:
from sklearn.compose import ColumnTransformer

In [92]:
transformer = ColumnTransformer(
    transformers=[
        ('transformer1', OneHotEncoder(drop='first',sparse_output=False), ['Country','Gender']), 
        ('transformer2', OrdinalEncoder(categories=[['High School','Bachelors','Masters','PhD']]), ['Education_Level']), 
    ], 
    remainder='passthrough'
)

In [96]:
newDf= pd.DataFrame(transformer.fit_transform(df),
                     columns=['Gender','Country','Education_Level','Age','Income','Purchased']
                   )

In [97]:
newDf.head(5)

Unnamed: 0,Gender,Country,Education_Level,Age,Income,Purchased
0,0.0,1.0,0.0,25,50000,No
1,0.0,0.0,1.0,30,60000,Yes
2,1.0,1.0,2.0,22,55000,No
3,0.0,0.0,1.0,28,52000,Yes
4,1.0,1.0,3.0,35,65000,Yes
