<h1>Mounting the Google Drive</h1>

In [1]:
# Colab-only: expose the user's Google Drive under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


<h1>1. Importing Necessary Modules</h1>

In [4]:
import numpy as np
import pandas as pd

<h1>2. Synthetic Dataset</h1>

In [6]:
# Fix the global NumPy seed so the synthetic labels are identical on every run.
seed = 321
np.random.seed(seed)

# 20 binary labels, each drawn from {0, 1} (the upper bound 2 is exclusive).
target = list(np.random.randint(low=0, high=2, size=20))

In [7]:
# Vocabulary of the categorical feature; list positions are used as sampling indices.
genre = [
    "Sci Fi",
    "Drama",
    "Romance",
    "Fantasy",
    "Nonfiction",
]

In [8]:
# Re-seed before sampling so the genre draw does not depend on how many
# random numbers were consumed earlier.
np.random.seed(seed)

# Draw 20 random positions into `genre` and translate each one to its label.
genres = [genre[pos] for pos in np.random.randint(low=0, high=len(genre), size=20)]

In [9]:
# Toy dataset: one categorical feature ("genre") and one binary label ("target").
df = pd.DataFrame(data=dict(genre=genres, target=target))
df

Unnamed: 0,genre,target
0,Nonfiction,0
1,Romance,0
2,Nonfiction,0
3,Drama,1
4,Sci Fi,1
5,Drama,0
6,Sci Fi,1
7,Romance,0
8,Sci Fi,0
9,Nonfiction,0


# One Hot Encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' omits the indicator column of the first category, and
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)

# fit_transform hands back a plain ndarray, so the column names are lost ...
output = encoder.fit_transform(df[["genre"]])

# ... and have to be recovered from the fitted encoder.
feature_names = encoder.get_feature_names_out(["genre"])

# Rebuild a labelled frame (same row index as df) and put the original
# 'genre' column next to its indicator columns for comparison.
output_df = df[["genre"]].join(
    pd.DataFrame(output, columns=feature_names, index=df.index)
)

output_df

Unnamed: 0,genre,genre_Fantasy,genre_Nonfiction,genre_Romance,genre_Sci Fi
0,Nonfiction,0.0,1.0,0.0,0.0
1,Romance,0.0,0.0,1.0,0.0
2,Nonfiction,0.0,1.0,0.0,0.0
3,Drama,0.0,0.0,0.0,0.0
4,Sci Fi,0.0,0.0,0.0,1.0
5,Drama,0.0,0.0,0.0,0.0
6,Sci Fi,0.0,0.0,0.0,1.0
7,Romance,0.0,0.0,1.0,0.0
8,Sci Fi,0.0,0.0,0.0,1.0
9,Nonfiction,0.0,1.0,0.0,0.0


# Label Encoding

In [9]:
# Label encoding with scikit-learn's LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Encode 'genre' as integer codes and keep them in a one-column frame.
output = pd.DataFrame({'genre_encoded': label_encoder.fit_transform(df['genre'])})

# Show the original labels side by side with their integer codes.
output_df = pd.concat([df[["genre"]], output], axis=1)
output_df

Unnamed: 0,genre,genre_encoded
0,Nonfiction,2
1,Romance,3
2,Nonfiction,2
3,Drama,0
4,Sci Fi,4
5,Drama,0
6,Sci Fi,4
7,Romance,3
8,Sci Fi,4
9,Nonfiction,2


In [10]:
# Lookup table: one row per genre holding the code LabelEncoder gave it.
genre_encoding_mapping = output_df.groupby('genre')[['genre_encoded']].first()
genre_encoding_mapping

Unnamed: 0_level_0,genre_encoded
genre,Unnamed: 1_level_1
Drama,0
Fantasy,1
Nonfiction,2
Romance,3
Sci Fi,4


In [11]:
# Manual label encoding: every genre gets an explicitly chosen rank
# instead of whatever order the encoder would derive from the labels.
genre_ranking = {
    "Romance": 1,
    "Drama": 2,
    "Fantasy": 3,
    "Nonfiction": 4,
    "Sci Fi": 5,
}

# Translate each row's genre into its hand-picked rank.
manual_codes = df['genre'].map(genre_ranking)
output_df['genre_encoded_manual'] = manual_codes

# Display automatic and manual encodings side by side.
output_df

Unnamed: 0,genre,genre_encoded,genre_encoded_manual
0,Nonfiction,2,4
1,Romance,3,1
2,Nonfiction,2,4
3,Drama,0,2
4,Sci Fi,4,5
5,Drama,0,2
6,Sci Fi,4,5
7,Romance,3,1
8,Sci Fi,4,5
9,Nonfiction,2,4


In [12]:
# Lookup table: one row per genre holding its manually assigned rank.
genre_encoding_mapping = output_df.groupby('genre')[['genre_encoded_manual']].first()
genre_encoding_mapping

Unnamed: 0_level_0,genre_encoded_manual
genre,Unnamed: 1_level_1
Drama,2
Fantasy,3
Nonfiction,4
Romance,1
Sci Fi,5


# Target Encoding

## Using Probabilities
### Calculating only the posteriors, by calculating the conditional probability

In [None]:
# Build one record per genre: total number of rows plus the number of rows
# in each target class (columns 'target_<class>').
categories = df['genre'].unique()
targets = df['target'].unique()
cat_list = []
for cat in categories:
    # Frequency of each target class among the rows of this genre.
    counts = df.loc[df['genre'] == cat, 'target'].value_counts()
    aux_dict = {'category': cat, 'count': int(counts.sum())}
    for t in targets:
        # A genre may have no rows at all for some class; value_counts() then
        # has no entry for it and counts[t] would raise KeyError, so fall
        # back to an explicit zero.
        aux_dict['target_' + str(t)] = int(counts.get(t, 0))
    cat_list.append(aux_dict)

In [None]:
# Turn the list of per-genre records into a table (one row per genre).
cat_list = pd.DataFrame(data=cat_list)

In [None]:
# Share of rows with target == 1 within each genre, i.e. P(target = 1 | genre).
cat_list['genre_encoded_prob'] = cat_list['target_1'].div(cat_list['count'])

In [None]:
# Display the per-genre counts together with the derived probability column.
cat_list

Unnamed: 0,category,count,target_0,target_1,genre_encoded_prob
0,Nonfiction,4,3,1,0.25
1,Romance,5,4,1,0.2
2,Drama,3,1,2,0.666667
3,Sci Fi,4,2,2,0.5
4,Fantasy,4,1,3,0.75


In [None]:
# Attach the per-genre probability to every row by looking it up on 'genre'.
# Only the encoded column is carried over; the raw counts stay in cat_list.
prob_by_genre = cat_list.set_index('category')[['genre_encoded_prob']]
df = df.join(prob_by_genre, on='genre', how='left')
df

Unnamed: 0,genre,target,genre_encoded_prob
0,Nonfiction,0,0.25
1,Romance,0,0.2
2,Nonfiction,0,0.25
3,Drama,1,0.666667
4,Sci Fi,1,0.5
5,Drama,0,0.666667
6,Sci Fi,1,0.5
7,Romance,0,0.2
8,Sci Fi,0,0.5
9,Nonfiction,0,0.25


## Using the mean
### Calculating only the posteriors

In [None]:
# Per-genre sample count and mean of the target in a single aggregation.
stats = df.groupby('genre')['target'].agg(['count', 'mean'])
stats

Unnamed: 0_level_0,count,mean
genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Drama,3,0.666667
Fantasy,4,0.75
Nonfiction,4,0.25
Romance,5,0.2
Sci Fi,4,0.5


In [None]:
# Attach the per-genre target mean to every row under the name
# 'genre_encoded_mean'.
df = df.join(stats['mean'].rename('genre_encoded_mean'), on='genre', how='left')
df

Unnamed: 0,genre,target,genre_encoded_prob,genre_encoded_mean
0,Nonfiction,0,0.25,0.25
1,Romance,0,0.2,0.2
2,Nonfiction,0,0.25,0.25
3,Drama,1,0.666667,0.666667
4,Sci Fi,1,0.5,0.5
5,Drama,0,0.666667,0.666667
6,Sci Fi,1,0.5,0.5
7,Romance,0,0.2,0.2
8,Sci Fi,0,0.5,0.5
9,Nonfiction,0,0.25,0.25


## With Smoothing
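### Library target encoders shrink each category mean toward the global prior; the sigmoid weighting reproduced below (with `min_samples_leaf` and a smoothing factor) is the scheme used by the category_encoders TargetEncoder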

In [None]:
# Parameters of the sigmoid weighting used below:
#   smoothing_factor - divisor inside the exponent (controls the sigmoid's steepness)
#   min_samples_leaf - count subtracted from each category's sample count
smoothing_factor, min_samples_leaf = 1.0, 1

In [None]:
# Global mean of the target over all rows; categories are shrunk toward it.
prior = df.loc[:, 'target'].mean()

In [None]:
# Sigmoid weight per genre: sigmoid((count - min_samples_leaf) / smoothing_factor).
# The larger a genre's sample count, the closer its weight gets to 1.
smoove = 1 / (
    1 + np.exp((min_samples_leaf - stats['count']) / smoothing_factor)
)

In [None]:
# Blend each genre's own mean with the global prior: weight `smoove` goes to
# the genre mean and the remainder (1 - smoove) to the prior.
smoothing = smoove * stats['mean'] + (1 - smoove) * prior

In [None]:
# Give the smoothed per-genre values the column name they should have once
# joined onto df, then display them.
encoded = pd.Series(smoothing, name = 'genre_encoded_smoothing')
encoded

genre
Drama         0.640839
Fantasy       0.735772
Nonfiction    0.259485
Romance       0.204497
Sci Fi        0.497629
Name: genre_encoded_smoothing, dtype: float64

In [None]:
# Look up each row's genre in `encoded` and append the smoothed value as a
# new column.
df = df.join(encoded, how='left', on='genre')
df

Unnamed: 0,genre,target,genre_encoded_prob,genre_encoded_mean,genre_encoded_smoothing
0,Nonfiction,0,0.25,0.25,0.259485
1,Romance,0,0.2,0.2,0.204497
2,Nonfiction,0,0.25,0.25,0.259485
3,Drama,1,0.666667,0.666667,0.640839
4,Sci Fi,1,0.5,0.5,0.497629
5,Drama,0,0.666667,0.666667,0.640839
6,Sci Fi,1,0.5,0.5,0.497629
7,Romance,0,0.2,0.2,0.204497
8,Sci Fi,0,0.5,0.5,0.497629
9,Nonfiction,0,0.25,0.25,0.259485


## Sklearn TargetEncoder
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.TargetEncoder.html

In [13]:
# Upgrade Sklearn
!pip install scikit-learn --upgrade

Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.1.post1


In [1]:
from sklearn.preprocessing import TargetEncoder

In [2]:
# TargetEncoder.fit_transform cross-fits internally on shuffled folds, so
# without a fixed random_state the encoded values change from run to run.
# `seed` is the notebook-wide seed defined in the synthetic-dataset section.
encoder = TargetEncoder(random_state=seed)

In [10]:
# The encoder expects a 2-D feature input, hence the one-column frame
# df[['genre']]; the target is passed as a plain NumPy array.
df['genre_encoded_sklearn'] = encoder.fit_transform(df[['genre']], df['target'].to_numpy())
df

Unnamed: 0,genre,target,genre_encoded_sklearn
0,Nonfiction,0,0.357433
1,Romance,0,0.0
2,Nonfiction,0,0.478947
3,Drama,1,0.478947
4,Sci Fi,1,0.478947
5,Drama,0,1.0
6,Sci Fi,1,0.357433
7,Romance,0,0.28
8,Sci Fi,0,0.628571
9,Nonfiction,0,0.478947


# Leave One Out Encoding

sklearn doesn't have a very good implementation of leave-one-out encoding, so we use another module (category_encoders) instead.

In [11]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
import pandas as pd
import numpy as np
from category_encoders import LeaveOneOutEncoder

# Rebuild the synthetic dataset from scratch so this cell starts from a
# frame holding only the 'genre' and 'target' columns.
seed = 321
genre = ["Sci Fi", "Drama", "Romance", "Fantasy", "Nonfiction"]

np.random.seed(seed)
target = list(np.random.randint(0, 2, 20))

# Same seed again, so the genre draw is reproducible on its own.
np.random.seed(seed)
genres = [genre[pos] for pos in np.random.randint(0, len(genre), 20)]

df = pd.DataFrame(data={"genre": genres, "target": target})

# Leave-one-out encoder restricted to the 'genre' column.
loo_encoder = LeaveOneOutEncoder(cols=['genre'])

# Fit on (genre, target) and store the encoded values next to the originals.
df['genre_encoded'] = loo_encoder.fit_transform(df['genre'], df['target'])

df

Unnamed: 0,genre,target,genre_encoded
0,Nonfiction,0,0.333333
1,Romance,0,0.25
2,Nonfiction,0,0.333333
3,Drama,1,0.5
4,Sci Fi,1,0.333333
5,Drama,0,1.0
6,Sci Fi,1,0.333333
7,Romance,0,0.25
8,Sci Fi,0,0.666667
9,Nonfiction,0,0.333333


# K Fold Encodings
To implement K-fold encoding using scikit-learn (sklearn), you can use the KFold cross-validation splitter along with the GroupBy operation to calculate the mean target value for each group in each fold. Below is an example code to achieve this:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Set seed and create synthetic data (rebuilt here so the cell starts from a
# frame holding only 'genre' and 'target').
seed = 321
np.random.seed(seed)
target = list(np.random.randint(0, 2, 20))
genre = ["Sci Fi", "Drama", "Romance", "Fantasy", "Nonfiction"]
np.random.seed(seed)
genres = [genre[i] for i in np.random.randint(0, len(genre), 20)]
df = pd.DataFrame({"genre": genres, "target": target})

# Integer-code the genres; the codes are used as group keys below.
label_encoder = LabelEncoder()
df['genre_encoded'] = label_encoder.fit_transform(df['genre'])

# 5 shuffled folds with a fixed random_state so the split is reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

# Placeholder column; every row is filled exactly once, when its fold is held out.
df['genre_encoded_kfold'] = np.nan

# K-fold target encoding: each held-out row is encoded with the per-genre
# target mean computed on the *other* folds only, so a row's own label never
# contributes to its encoding.
for train_index, test_index in kfold.split(df):
    train_data = df.iloc[train_index]
    test_labels = df.index[test_index]

    mean_target = train_data.groupby('genre_encoded')['target'].mean()
    # Fallback for a genre that does not occur in the training folds at all:
    # .map() would yield NaN there, so use the training-fold target mean.
    fold_prior = train_data['target'].mean()

    df.loc[test_labels, 'genre_encoded_kfold'] = (
        df.loc[test_labels, 'genre_encoded'].map(mean_target).fillna(fold_prior)
    )

# Every row belongs to exactly one held-out fold, so nothing may remain unset.
assert df['genre_encoded_kfold'].notna().all()

df

Unnamed: 0,genre,target,genre_encoded,genre_encoded_kfold
0,Nonfiction,0,2,0.333333
1,Romance,0,3,0.25
2,Nonfiction,0,2,0.333333
3,Drama,1,0,0.5
4,Sci Fi,1,4,0.5
5,Drama,0,0,1.0
6,Sci Fi,1,4,0.333333
7,Romance,0,3,0.0
8,Sci Fi,0,4,0.666667
9,Nonfiction,0,2,0.333333


In [None]:
# Lookup table: one row per genre holding the integer code assigned to it.
genre_encoding_mapping = df.groupby('genre')[['genre_encoded']].first()
genre_encoding_mapping

Unnamed: 0_level_0,genre_encoded
genre,Unnamed: 1_level_1
Drama,0
Fantasy,1
Nonfiction,2
Romance,3
Sci Fi,4
