<h1 style="color: cyan; text-align:center; font-size:250%; font-weight:bold;">
Transformers used in ML 
</h1>

<h2 style="color: lavender; text-align:left; font-size:130%; font-weight:bold;">
List of the Important Transformers Used in ML
</h2>

<ol style="font-size:120%;">
    <li>Column Transformer</li>
    <li>Pipeline</li>
    <li>Mathematical Transformers</li>
    <li>Functional Transformers</li>
    <li>Power Transformers</li>
    <li>Quantile Transformer</li>
</ol>

<h1 style="color: gold; text-align:left; font-size:200%; font-weight:bold;">
1.Column Transformers 
</h1>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer

# Read the COVID survey data from disk.
df = pd.read_csv("dummy_covid_data.csv")
df.sample(5)  # mid-cell expression: evaluated, but Jupyter only renders a cell's final expression

# Hold out 20% of the rows. The last column is the target; every column before it is a feature.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=0,
)

# COLUMN TRANSFORMER
# Each entry is (name, encoder, columns). Columns not listed are passed through untouched.
transformer = ColumnTransformer(
    transformers=[
        # 'cough' is encoded ordinally with the explicit order mild < strong
        ('tnf1', OrdinalEncoder(categories=[['mild', 'strong']]), ['cough']),
        # 'gender' and 'city' are one-hot encoded, dropping the first level of each
        ('fnf2', OneHotEncoder(sparse_output=False, drop='first'), ['gender', 'city']),
    ],
    remainder='passthrough',
)

# Learn the encodings from the training split only ...
x_train_transform = transformer.fit_transform(x_train)

# ... and reuse those fitted encodings on the test split.
x_test_transform = transformer.transform(x_test)
print(x_train_transform.shape, x_test_transform.shape)

<h1 style="color: gold; text-align:left; font-size:200%; font-weight:bold;">
2.Pipeline 
</h1>

In [None]:
import pandas as pd 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

# 1) DATA INGESTION
# Keep only the first eight columns of the seaborn Titanic frame.
df = sns.load_dataset('titanic').iloc[:, 0:8]
df.info()
df.isnull().sum()

# Train/test split (80/20, fixed seed): column 0 is the target, columns 1-7 are the features.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:8],
    df.iloc[:, 0],
    test_size=0.2,
    random_state=0,
)
x_train
#2) Creating PIPELINE FOR TRANSFORMATION
# Each transformation is defined on its own, then executed in sequence through the pipeline:
#     1st: Imputation of age & embarked
#     2nd: One-hot encoding of sex & embarked
#     3rd: Scaling all the columns
#     4th: Feature selection
#     5th: Train the model
#
# NOTE: a ColumnTransformer outputs its transformed columns first (in the order the
# transformers are listed) followed by the passthrough columns, so positional indices
# shift after every step. Feature layout going in (x_train):
#     0 pclass, 1 sex, 2 age, 3 sibsp, 4 parch, 5 fare, 6 embarked

#2.1) IMPUTATION TRANSFORMATION
trf1=ColumnTransformer([
    ('impute-age', SimpleImputer(strategy='mean'), [2]),
    ('impute-embark', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')
# Layout after trf1:
#     0 age, 1 embarked, 2 pclass, 3 sex, 4 sibsp, 5 parch, 6 fare

#2.2) ONE HOT ENCODING
# After trf1, 'embarked' sits at index 1 and 'sex' at index 3. Index 6 is now 'fare',
# so encoding [1, 6] would one-hot the fare values and leave 'sex' as a raw string.
trf2=ColumnTransformer([
    ('ohe-sex-embark', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 3])
], remainder='passthrough')
# Expected layout after trf2: 3 embarked dummies + 2 sex dummies + 5 passthrough = 10 columns

#2.3) Scaling
# No remainder here: only the first 10 columns (everything produced by trf1 & trf2) are kept.
trf3=ColumnTransformer([
    ('age-scaling', MinMaxScaler(), slice(0,10)) # scaling all cols after trf1 & trf2
])

#2.4) Feature Selection
trf4=SelectKBest(score_func=chi2, k=8) # Selecting the 8 best of the 10 columns

#2.5) Model
trf5=DecisionTreeClassifier()

#3) CREATE PIPELINE
pipe=Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
])
display(pipe)
#4) TRAIN the model
pipe.fit(x_train, y_train)

#5) PREDICT on the held-out test split with the fitted pipeline
y_pred = pipe.predict(x_test)

#6) CALCULATE ACCURACY
# Printed explicitly: Jupyter only renders a cell's last expression, so a bare
# expression in the middle of a cell is computed and then silently discarded.
from sklearn.metrics import accuracy_score
print("Test accuracy=", accuracy_score(y_test, y_pred))

#7) CROSS VALIDATION using pipeline
# Passing the whole pipeline means every fold re-fits the preprocessing steps too.
from sklearn.model_selection import cross_val_score
print("CV accuracy=", cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean())

#8) HYPERPARAMETER TUNING using GridSearchCV
# '<step name>__<parameter>' addresses a parameter of a pipeline step;
# 'trf5' is the DecisionTreeClassifier step.
params={
    'trf5__max_depth':[1,2,3,4,5, None]
}
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)
print("Best score=",grid.best_score_, "Max_depth=", grid.best_params_)

#9) Exporting the PIPELINE
# NOTE(review): this saves `pipe` as fitted in step 4, not the tuned
# `grid.best_estimator_` - confirm that is the intended artifact.
import pickle
with open('pipe.pkl', 'wb') as model_file:  # context manager closes the handle after writing
    pickle.dump(pipe, model_file)

#10) CODE IN PRODUCTION
import numpy as np
# SECURITY: unpickling can execute arbitrary code - only load pickle files you produced yourself.
with open('pipe.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)
# Assume user input: one row, values in the same positional order as the training features
test_input1 = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'], dtype='object').reshape(1,7)
pipe.predict(test_input1)

<h1 style="color: gold; text-align:left; font-size:200%; font-weight:bold;">
3.Mathematical Transformers
</h1>

<h1 style="color:deeppink; text-align:left; font-size:150%; font-weight:bold;">
3.1.Log Transformers 
</h1>

In [None]:
import numpy as np
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Titanic example data; only the 'age' column is used in this cell.
data = sns.load_dataset('titanic')

# Natural log of age (any missing ages simply stay NaN).
log_transformed_data = np.log(data['age'])

# Side-by-side histograms with a KDE overlay: raw age on the left, log(age) on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(5, 2))

sns.histplot(data['age'], kde=True, ax=ax1)
sns.histplot(log_transformed_data, kde=True, ax=ax2)

ax1.set_title('Before Transformation')
ax2.set_title('After Log Transformation')

<h1 style="color:deeppink; text-align:left; font-size:150%; font-weight:bold;">
3.2.Reciprocal Transformers 
</h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Example data
data = sns.load_dataset('titanic')
# Apply the reciprocal transformation (1/x) to the non-missing ages
reciprocal_transformed_data = 1/(data['age'].dropna())
# Plot the data before and after the reciprocal transformation
fig, (ax1, ax2) =plt.subplots(ncols=2, figsize=(5,2))
sns.histplot(data['age'].dropna(), ax=ax1, kde=True)
ax1.set_title('Before Transformation')
# Plot the reciprocal series computed above. Plotting `log_transformed_data` here would
# show the wrong transformation and would only work if the log cell had been run first.
sns.histplot(reciprocal_transformed_data, ax=ax2, kde=True)
ax2.set_title('After Reciprocal Transformation')

<h1 style="color:deeppink; text-align:left; font-size:150%; font-weight:bold;">
3.3.X-Square Transformers 
</h1>  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Titanic example data; only the 'age' column is used in this cell.
data = sns.load_dataset('titanic')

# Square every observed (non-missing) age.
square_transformed_data = data['age'].dropna().pow(2)

# Side-by-side histograms with a KDE overlay: raw age on the left, age squared on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(5, 2))

sns.histplot(data['age'].dropna(), kde=True, ax=ax1)
sns.histplot(square_transformed_data.dropna(), kde=True, ax=ax2)

ax1.set_title('Before Transformation')
ax2.set_title('After X-square Transformation')

<h1 style="color:deeppink; text-align:left; font-size:150%; font-weight:bold;">
3.4.Square Root Transformers 
</h1>  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Titanic example data; only the 'age' column is used in this cell.
data = sns.load_dataset('titanic')

# Square root of every observed (non-missing) age.
square_root_transformed_data = np.sqrt(data['age'].dropna())

# Side-by-side histograms with a KDE overlay: raw age on the left, sqrt(age) on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(5, 2))

sns.histplot(data['age'].dropna(), kde=True, ax=ax1)
sns.histplot(square_root_transformed_data.dropna(), kde=True, ax=ax2)

ax1.set_title('Before Transformation')
ax2.set_title('After square-root Transformation')

<h1 style="color:deeppink; text-align:left; font-size:150%; font-weight:bold;">
3.5.Custom Transformers 
</h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the seaborn Titanic example dataset; only its 'age' column is used below.
data = sns.load_dataset('titanic')
# Custom transformation function
def custom_transform(x, threshold=30):
    """Piecewise transform of a single scalar value.

    Parameters
    ----------
    x : number
        The value to transform.
    threshold : number, default 30
        Cut-over point between the two branches.

    Returns
    -------
    ``np.sqrt(x)`` when ``x`` is strictly below ``threshold``; otherwise ``x ** 2``
    (a value exactly equal to ``threshold`` is squared).
    """
    return np.sqrt(x) if x < threshold else x ** 2

# Run custom_transform element-wise over the observed (non-missing) ages.
custom_transformed_data = data['age'].dropna().apply(custom_transform)

# One figure, two panels: untouched ages on the left, transformed ages on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# Left panel: original data
sns.histplot(data['age'].dropna(), kde=True, ax=ax1)
ax1.set(title='Original Age Data')

# Right panel: custom transformed data
sns.histplot(custom_transformed_data, kde=True, color='purple', ax=ax2)
ax2.set(title='Custom Transformed Age Data')

plt.tight_layout()
plt.show()

<h1 style="color: gold; text-align:left; font-size:200%; font-weight:bold;">
4.Functional Transformers
</h1>  

In [None]:
import numpy as np 
import pandas as pd 
import scipy.stats as stats
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# 1) LOAD DATASET
df=sns.load_dataset('titanic')
# Take an explicit copy so the assignment below modifies this frame, not a view of the original.
df=df[['age', 'fare', 'survived']].copy()
df.info()

#2.1) Imputation of Age
# Assign the result back instead of `df['age'].fillna(..., inplace=True)`: the in-place
# call acts on a temporary column selection, which warns in pandas 2.x and is a no-op
# under Copy-on-Write, leaving the NaNs in place.
# NOTE(review): the mean is computed before the train/test split, so it includes
# test rows - acceptable for a demo, but it is a mild form of leakage.
df['age'] = df['age'].fillna(df['age'].mean())
df.isnull().sum() #check for Null values

#2.2) Extract the X and y from df & do the train-test split
x=df.iloc[:, 0:2]
y=df.iloc[:, -1]
x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.2, random_state=0)

#2.3) Checking the normality of each feature before transforming
# For every column: histogram with KDE on the left, normal Q-Q plot on the right.
for column in ('age', 'fare'):
    plt.figure(figsize=(9, 2))
    plt.subplot(121)
    sns.histplot(x_train[column], kde=True)
    plt.subplot(122)
    stats.probplot(x_train[column], dist='norm', plot=plt)
    plt.show()

#3) BEFORE NORMALIZATION: fit both models on the raw features and check the accuracy
# `fit` returns the estimator itself, so construction and training can be chained.
clf1 = LogisticRegression().fit(x_train, y_train)
clf2 = DecisionTreeClassifier().fit(x_train, y_train)

# Predictions on the held-out split
y_pred1 = clf1.predict(x_test)
y_pred2 = clf2.predict(x_test)

# Accuracy of each model
print('LR Accuracy=', accuracy_score(y_test, y_pred1))
print('DT Accuracy=', accuracy_score(y_test, y_pred2))

#4) AFTER NORMALIZATION: fit both models on log1p-transformed features and check the accuracy
# FunctionTransformer wraps a plain function (here np.log1p) in the fit/transform API.
trf = FunctionTransformer(func=np.log1p)

x_train_transform = trf.fit_transform(x_train)
x_test_transform = trf.transform(x_test)

# Fresh estimators trained on the transformed training features
clf1 = LogisticRegression().fit(x_train_transform, y_train)
clf2 = DecisionTreeClassifier().fit(x_train_transform, y_train)

# Predictions on the transformed held-out split
y_pred_tranformed1 = clf1.predict(x_test_transform)
y_pred_tranformed2 = clf2.predict(x_test_transform)

# Accuracy of each model
print('LR Accuracy=', accuracy_score(y_test, y_pred_tranformed1))
print('DT Accuracy=', accuracy_score(y_test, y_pred_tranformed2))

#5) CROSS VALIDATING THE RESULT FOR CONFIRMATION
# The transformer wraps np.log1p, which learns nothing from the data, so transforming
# the full feature set up front does not leak information between folds.
x_transformed=trf.fit_transform(x)
clf1=LogisticRegression()
clf2=DecisionTreeClassifier()
# Use the same number of folds for both models so their mean accuracies are comparable.
n_folds = 10
print('LR',np.mean(cross_val_score(clf1, x_transformed, y, scoring='accuracy', cv=n_folds)))
print('DT',np.mean(cross_val_score(clf2, x_transformed, y, scoring='accuracy', cv=n_folds)))

#6) NOW checking the Q-Q plot of 'fare' BEFORE and AFTER the FunctionTransformer
plt.figure(figsize=(9, 2))
# Left panel (121): untransformed fare; right panel (122): transformed fare.
for position, fare_values in ((121, x_train['fare']), (122, x_train_transform['fare'])):
    plt.subplot(position)
    stats.probplot(fare_values, dist='norm', plot=plt)
plt.show()

<h1 style="color: gold; text-align:left; font-size:200%; font-weight:bold;">
5.Power Transformers
</h1>

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import scipy.stats as stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PowerTransformer

# Data ingestion
df = pd.read_csv('concrete.csv')
df.sample(2)  # mid-cell expression: evaluated, but Jupyter only renders a cell's final expression
df.info()

# TRAIN-TEST SPLIT
# The last column is the target; every column before it is a feature.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

# BEFORE transformation: train the model, predict, and score with R^2
# `fit` returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(x_train, y_train)
y_pred = lr.predict(x_test)
print('LR accuracy=', r2_score(y_test, y_pred))

# 10-fold cross-validated R^2 on the full data set
baseline_cv_scores = cross_val_score(lr, x, y, scoring='r2', cv=10)
print('lr_crossvalidation', np.mean(baseline_cv_scores))

# Distribution check for every training feature:
# histogram with KDE on the left, normal Q-Q plot on the right.
for i in x_train.columns:
    plt.figure(figsize=(6, 2))

    plt.subplot(121)
    sns.histplot(x_train[i], kde=True)
    plt.title(i)

    plt.subplot(122)
    stats.probplot(x_train[i], dist='norm', plot=plt)
    plt.title(i)  # replaces the default title written by probplot

    plt.show()

# TRANSFORMING
# A tiny constant is added to every feature before the Box-Cox transform is applied.
# The transformer is fitted on the training split and reused on the test split.
pt = PowerTransformer(method='box-cox', standardize=True)
x_train_transform = pt.fit_transform(x_train + 0.000001)
x_test_transform = pt.transform(x_test + 0.000001)

# Retrain the linear model on the transformed features and score it with R^2.
lr = LinearRegression().fit(x_train_transform, y_train)
y_pred_tranform = lr.predict(x_test_transform)

print('LR accuracy after trans=', r2_score(y_test, y_pred_tranform))
#cross validation after transform
# The full-data transform is kept because `x_trans` is reused by the plotting code below.
pt=PowerTransformer(method='box-cox', standardize=True)
x_trans=pt.fit_transform(x+0.000001)
# For scoring, put the transformer inside a pipeline so each fold fits Box-Cox on its own
# training part only. Fitting it once on all rows before cross-validating lets every
# fold's held-out rows influence the transform (data leakage).
from sklearn.pipeline import make_pipeline
cv_model=make_pipeline(PowerTransformer(method='box-cox', standardize=True), LinearRegression())
print('LR_Trans_cross valid=', np.mean(cross_val_score(cv_model, x+0.000001, y, scoring='r2', cv=10)))

# Plotting each feature before and after the power transformation
# Wrap the transformed array in a DataFrame so it carries the original column names.
x_trans = pd.DataFrame(x_trans, columns=x_train.columns)

# x_train_transform is a plain array, so its columns are addressed by position;
# enumerate yields the same position that x_train.columns.get_loc(i) would return.
for column_index, i in enumerate(x_trans.columns):
    plt.figure(figsize=(6, 2))

    # Left: original training values
    plt.subplot(121)
    sns.histplot(x_train[i], kde=True)
    plt.title(f"Original: {i}")

    # Right: transformed training values for the same column
    plt.subplot(122)
    sns.histplot(x_train_transform[:, column_index], kde=True)

    plt.show()

<h1 style="color: gold; text-align:left; font-size:200%; font-weight:bold;">
6.Quantile Transformers
</h1>