<a href="https://colab.research.google.com/github/RifatMuhtasim/Data_Science_Workflow/blob/main/3.5.Encode_and_Scaled_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Encode categorical variables

In [None]:
# One-hot encoding: replace 'Country Name' with one indicator column per
# category. drop_first=True omits the first category's column, so k categories
# become k-1 columns.
# Note: the result overwrites `df`, so this cell cannot be re-run on its own
# output ('Country Name' no longer exists afterwards).
df = pd.get_dummies(
    df,
    columns=['Country Name'],
    drop_first=True,
)

In [None]:
# Label Encoding (LabelEncoder is intended for target labels, not input features)
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer mapping from sample_submission, then apply that
# same mapping to both frames so the codes agree between train and test.
# NOTE(review): transform() raises on values that were not seen during fit --
# confirm sample_submission contains every country present in train and test.
country_label_encoder = LabelEncoder().fit(sample_submission['Country Name'])

train['Country Name'] = country_label_encoder.transform(train['Country Name'])
test['Country Name'] = country_label_encoder.transform(test['Country Name'])

In [None]:
# Ordinal Encoding (for features whose categories have a natural order)
from sklearn.preprocessing import OrdinalEncoder

# Explicit low -> high ordering; the encoder maps these to 0.0, 1.0, 2.0.
category_order = ["Low", "Medium", "High"]
category_encoder = OrdinalEncoder(categories=[category_order])

# OrdinalEncoder only accepts 2-D input of shape (n_samples, n_features).
# Selecting with double brackets yields a one-column DataFrame; passing the
# Series `sample_submission['pay']` would raise "Expected 2D array".
category_encoder.fit(sample_submission[['pay']])

# transform() returns an (n_samples, 1) array by default, so flatten it before
# assigning it back to a single column.
train['pay'] = category_encoder.transform(train[['pay']]).ravel()
test['pay'] = category_encoder.transform(test[['pay']]).ravel()

# 2. Handling Class Imbalance (for Classification)

In [None]:
# Handle an imbalanced dataset with SMOTE (synthetic minority oversampling)
from imblearn.over_sampling import SMOTE
# Imported here because this cell runs before the scaling cells that import it;
# without it a fresh kernel raises NameError on train_test_split.
from sklearn.model_selection import train_test_split

# Separate the features from the 'output' target column.
X = df.drop(['output'], axis="columns")
y = df['output']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Resample the training split only; the test split is left untouched so that
# evaluation is done on real, un-synthesized rows.
smote = SMOTE(sampling_strategy="minority")
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Undersampling: shrink class 0 down to the size of class 1

df_churn0 = df.loc[df['Churn'] == 0]
df_churn1 = df.loc[df['Churn'] == 1]

# Draw, without replacement, as many class-0 rows as there are class-1 rows.
# NOTE(review): sample() raises if class 0 has fewer rows than class 1 -- this
# assumes class 0 is the majority class; confirm for your data.
df_under0 = df_churn0.sample(n=len(df_churn1))

# Stack the two equally sized groups and show the resulting class counts.
df = pd.concat([df_churn1, df_under0], axis=0)
df['Churn'].value_counts()

In [None]:
# Combined resampling with SMOTETomek: SMOTE oversampling followed by
# Tomek-link removal
from imblearn.combine import SMOTETomek
# Imported here so the cell does not depend on a later cell having run first.
from sklearn.model_selection import train_test_split

# Separate the features from the 'output' target column.
X = df.drop(['output'], axis="columns")
y = df['output']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Resample the training split only. fit_resample() is the current API; the
# older fit_sample() alias was removed from imbalanced-learn.
smk = SMOTETomek()
X_train_smk, y_train_res = smk.fit_resample(X_train, y_train)

# 3. Scale Numerical Features

In [None]:
# Standardization (StandardScaler)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the features from the 'output' target column, then hold out 20%.
X = df.drop(columns=['output'])
y = df['output']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fit on the training split only, then reuse the fitted scaler for the test
# split so no test-set statistics leak into the scaling parameters.
standard_scaler = StandardScaler()
X_train_scaled = standard_scaler.fit_transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)

Standardization rescales each feature so that it has a mean of 0 and a standard deviation of 1. It preserves the shape of the original distribution but centers it at 0. <br>
Formula: $z=\frac{x-\mu}{\sigma}$, where $\mu$ is the feature's mean and $\sigma$ its standard deviation.

In [None]:
# Normalization (MinMaxScaler)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Separate the features from the 'output' target column, then hold out 20%.
X = df.drop(columns=['output'])
y = df['output']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fit on the training split only, then apply the same fitted scaler to the
# test split.
minmax_scaler = MinMaxScaler()
X_train_scaled = minmax_scaler.fit_transform(X_train)
X_test_scaled = minmax_scaler.transform(X_test)

Min-max normalization rescales each feature to a fixed range, typically between 0 and 1. It is useful when the data needs to be bounded within a specific range. <br>
Formula: $z=\frac{x-x_{min}}{x_{max} - x_{min}}$, where $x_{min}$ and $x_{max}$ are the feature's minimum and maximum values.

In [None]:
# RobustScaler
# Imported in this cell, like the other scaling cells, so the cell can be run
# on its own without relying on an earlier cell's import.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler

# Separate the features from the 'output' target column, then hold out 20%.
X = df.drop(['output'], axis="columns")
y = df['output']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Fit on the training split only, then apply the same fitted scaler to the
# test split.
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)