In [30]:
import numpy as np
import pandas as pd

# Fix the global NumPy seed so the synthetic dataset is reproducible.
np.random.seed(543)
n = 1000  # number of rows to generate

# NOTE: every draw below consumes the same global random stream, so the
# order of these statements determines the generated values.
gender = np.random.choice(['Male', 'Female'], n)
age = np.random.randint(25, 65, n)  # upper bound is exclusive
contract = np.random.choice(['Monthly', 'Quarterly', 'Half-Yearly', 'Yearly'], n)
monthly_charges = np.random.normal(1000, 100, n)  # loc=1000, scale=100
tenure = np.random.randint(12, 36, n)  # upper bound is exclusive
churn = np.random.choice([1, 0], n, p=[0.2, 0.8])  # 1 is drawn with probability 0.2
region = np.random.choice(['North', 'South', 'East', 'West'], n)

# Assemble the generated columns into a single frame; 'Churn' goes last.
df = pd.DataFrame(dict(
    Gender=gender,
    Age=age,
    Contract=contract,
    MonthlyCharges=monthly_charges,
    Tenure=tenure,
    Region=region,
    Churn=churn,
))

df.head()


Unnamed: 0,Gender,Age,Contract,MonthlyCharges,Tenure,Region,Churn
0,Female,34,Yearly,1042.202157,25,East,1
1,Female,36,Half-Yearly,966.337735,12,East,1
2,Female,30,Yearly,1165.040177,16,West,0
3,Female,44,Quarterly,1002.266319,35,South,0
4,Male,53,Half-Yearly,1043.851952,16,South,1


In [31]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          1000 non-null   object 
 1   Age             1000 non-null   int64  
 2   Contract        1000 non-null   object 
 3   MonthlyCharges  1000 non-null   float64
 4   Tenure          1000 non-null   int64  
 5   Region          1000 non-null   object 
 6   Churn           1000 non-null   int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 54.8+ KB


In [32]:
# Separate predictors from the target: X is every column except 'Churn', y is 'Churn' alone.
X, y = df.drop(columns=['Churn']), df['Churn']

In [33]:
# Display the feature frame and the target Series together as a tuple.
X,y

(     Gender  Age     Contract  MonthlyCharges  Tenure Region
 0    Female   34       Yearly     1042.202157      25   East
 1    Female   36  Half-Yearly      966.337735      12   East
 2    Female   30       Yearly     1165.040177      16   West
 3    Female   44    Quarterly     1002.266319      35  South
 4      Male   53  Half-Yearly     1043.851952      16  South
 ..      ...  ...          ...             ...     ...    ...
 995  Female   59       Yearly     1065.235146      30  North
 996  Female   38    Quarterly      897.817379      17  North
 997    Male   26  Half-Yearly     1235.950983      34  North
 998    Male   64      Monthly      974.746370      12  North
 999    Male   45    Quarterly      946.033836      23   West
 
 [1000 rows x 6 columns],
 0      1
 1      1
 2      0
 3      0
 4      1
       ..
 995    0
 996    0
 997    0
 998    0
 999    0
 Name: Churn, Length: 1000, dtype: int64)

In [34]:
# Split the column labels by dtype; the non-numeric ones are the columns
# that get one-hot encoded further down.
num_columns = df.select_dtypes(include=np.number).columns
cat_columns = df.select_dtypes(exclude=np.number).columns

In [35]:
# Show which column labels were selected as non-numeric.
cat_columns

Index(['Gender', 'Contract', 'Region'], dtype='object')

In [36]:
# Recompute the dtype split here so this cell also works when run on its own.
cat_columns = df.select_dtypes(exclude = 'number').columns
num_columns = df.select_dtypes(include = 'number').columns

from sklearn.preprocessing import OneHotEncoder

# sparse_output=False returns a dense array; drop='first' omits the first
# category of every feature from the encoded columns.
ohe = OneHotEncoder(sparse_output = False, drop='first')

# Wrap the encoded array in a DataFrame that carries df's own index.
# Without index=df.index the frame would get a fresh RangeIndex, and a later
# column-wise concat with df would misalign rows (and introduce NaNs)
# whenever df's index is not the default 0..n-1.
encoded_data = pd.DataFrame(
    ohe.fit_transform(df[cat_columns]),
    columns = ohe.get_feature_names_out(cat_columns),
    index = df.index,
)



In [37]:
# Inspect the one-hot encoded frame produced by the encoder.
encoded_data

Unnamed: 0,Gender_Male,Contract_Monthly,Contract_Quarterly,Contract_Yearly,Region_North,Region_South,Region_West
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0
996,0.0,0.0,1.0,0.0,1.0,0.0,0.0
997,1.0,0.0,0.0,0.0,1.0,0.0,0.0
998,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [40]:
# Swap the raw categorical columns for their one-hot columns without mutating
# `df` in place. One-hot columns left over from a previous run of this cell are
# dropped too (errors='ignore' skips labels that are absent), so re-running the
# cell rebuilds the same frame instead of raising a KeyError on the
# already-removed categorical columns.
stale_columns = [*cat_columns, *encoded_data.columns]
df = pd.concat([df.drop(columns=stale_columns, errors='ignore'), encoded_data], axis=1)

In [42]:
# Print the frame summary again to check the columns and dtypes after encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 1000 non-null   int64  
 1   MonthlyCharges      1000 non-null   float64
 2   Tenure              1000 non-null   int64  
 3   Churn               1000 non-null   int64  
 4   Gender_Male         1000 non-null   float64
 5   Contract_Monthly    1000 non-null   float64
 6   Contract_Quarterly  1000 non-null   float64
 7   Contract_Yearly     1000 non-null   float64
 8   Region_North        1000 non-null   float64
 9   Region_South        1000 non-null   float64
 10  Region_West         1000 non-null   float64
dtypes: float64(8), int64(3)
memory usage: 86.1 KB


<br>

## **Ordinal Encoding**

In [44]:
import pandas as pd

# Toy dataset for the ordinal-encoding demo: two string columns and one numeric column.
data = dict(
    Color=['Red', 'Blue', 'Green', 'Blue', 'Red'],
    Size=['Small', 'Medium', 'Large', 'Medium', 'Small'],
    Price=[10, 15, 20, 25, 30],
)

df = pd.DataFrame.from_dict(data)
df


Unnamed: 0,Color,Size,Price
0,Red,Small,10
1,Blue,Medium,15
2,Green,Large,20
3,Blue,Medium,25
4,Red,Small,30


In [46]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order for 'Size': one inner list per encoded column,
# listed from lowest rank to highest.
size_order = [['Small', 'Medium', 'Large']]

oe = OrdinalEncoder(categories=size_order)

# Only 'Size' is encoded; 'Color' is left untouched.
# fit_transform returns one column per input feature, so take the single column.
size_codes = oe.fit_transform(df[['Size']])
df['Size_encoded'] = size_codes[:, 0]

df


Unnamed: 0,Color,Size,Price,Size_encoded
0,Red,Small,10,0.0
1,Blue,Medium,15,1.0
2,Green,Large,20,2.0
3,Blue,Medium,25,1.0
4,Red,Small,30,0.0


<br>

## **Label Encoding**

In [47]:
import pandas as pd

# Toy dataset for the label/target-encoding demos.
data = dict(
    City=['Delhi', 'Mumbai', 'Kolkata', 'Delhi', 'Chennai'],
    Experience_Level=['Junior', 'Mid', 'Senior', 'Mid', 'Junior'],
    Salary=[40000, 60000, 90000, 65000, 42000],  # Target variable
)

df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,City,Experience_Level,Salary
0,Delhi,Junior,40000
1,Mumbai,Mid,60000
2,Kolkata,Senior,90000
3,Delhi,Mid,65000
4,Chennai,Junior,42000


In [48]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the distinct 'City' labels first, then replace each city with its integer code.
le.fit(df['City'])
df['City_encoded'] = le.transform(df['City'])

df

Unnamed: 0,City,Experience_Level,Salary,City_encoded
0,Delhi,Junior,40000,1
1,Mumbai,Mid,60000,3
2,Kolkata,Senior,90000,2
3,Delhi,Mid,65000,1
4,Chennai,Junior,42000,0


### **Target Encoding**

In [50]:
# Average 'Salary' for each 'City', indexed by city name.
target_mean = df['Salary'].groupby(df['City']).mean()

# Replace every row's city with that city's average salary.
# The averages are computed from the same rows that are being encoded,
# so each row's encoded value includes its own 'Salary'.
df['City_target_encoded'] = df['City'].map(target_mean)

df

Unnamed: 0,City,Experience_Level,Salary,City_encoded,City_target_encoded
0,Delhi,Junior,40000,1,52500.0
1,Mumbai,Mid,60000,3,60000.0
2,Kolkata,Senior,90000,2,90000.0
3,Delhi,Mid,65000,1,52500.0
4,Chennai,Junior,42000,0,42000.0
