In [3]:
# !pip install scikit-learn

from sklearn.preprocessing import FunctionTransformer,PowerTransformer,OneHotEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

### sklearn.preprocessing.FunctionTransformer

#### A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc.

In [None]:
def reciprocal_as_float(values):
    """Return the element-wise reciprocal ``1 / values`` in floating point.

    ``np.reciprocal`` keeps the dtype of its input, so integer-valued columns
    are integer-divided and every entry with ``|x| > 1`` becomes 0. True
    division by a float numerator avoids that, and gives the same values as
    ``np.reciprocal`` when the input is already floating point.
    """
    return np.true_divide(1.0, values)


# Candidate transformations for the numeric features.
# The two FunctionTransformers simply apply the given function; PowerTransformer
# estimates its Box-Cox parameters from whatever data it is fitted on.
log_transformer = FunctionTransformer(func=np.log, inverse_func=np.exp)
reciprocal_transformer = FunctionTransformer(func=reciprocal_as_float)
boxcox_transformer = PowerTransformer(method="box-cox", standardize=False)

# (name, transformer) pairs; the name is used as the key when scores are recorded.
transformations = [
    ("log", log_transformer),
    ("reciprocal", reciprocal_transformer),
    ("boxcox", boxcox_transformer),
]

# Compare the candidate numeric transformations by mean cross-validated ROC AUC
# of a logistic regression trained on the transformed features.
X = train_data[num_column]
y = train_data["smoking"]

# Hold out a test split; only the training part takes part in the comparison.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

logistic_model = LogisticRegression()

# Maps transformation name -> mean AUC over the folds.
transform_scores = {}

for name, transformation in transformations:
    # Chaining transformer and classifier means each fold refits the
    # transformer on that fold's training rows only. Fitting Box-Cox once on
    # all of X_train before splitting would let validation rows influence the
    # fitted parameters (data leakage). The function-based transformers do not
    # learn anything from the data, so their scores are unaffected.
    # Note: the pipeline fits the shared transformer/model objects in place.
    candidate = Pipeline(steps=[
        ("transform", transformation),
        ("model", logistic_model),
    ])

    auc_scores = []
    for train_index, val_index in kf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        candidate.fit(X_train_fold, y_train_fold)

        # Probability of the positive class is what ROC AUC is computed from.
        y_pred = candidate.predict_proba(X_val_fold)[:, 1]
        auc_scores.append(roc_auc_score(y_val_fold, y_pred))

    mean_auc = np.mean(auc_scores)
    print(mean_auc)
    transform_scores[name] = mean_auc

# Name and score of the transformation with the highest mean AUC.
best_transformation = max(transform_scores, key=transform_scores.get)

best_auc_score = transform_scores[best_transformation]

In [None]:
# Preprocessing: log-transform the numeric columns, one-hot encode the categorical ones.
preprocessor = ColumnTransformer(transformers=[
    ("numeric", log_transformer, num_column),
    # `sparse_output` is the current name of the former `sparse` argument,
    # which was deprecated in scikit-learn 1.2 and removed in 1.4.
    ("categorical", OneHotEncoder(sparse_output=False), cat_col),
])

train_data_preprocessed = preprocessor.fit_transform(train_data_model)

# Output columns follow the transformer order above: numeric columns first,
# then the one-hot columns generated by the encoder.
cat_features_name = (
    preprocessor.named_transformers_["categorical"]
    .get_feature_names_out(cat_col)
    .tolist()
)
column_names = list(num_column) + cat_features_name

# Reuse the source frame's index so the features stay label-aligned with `y`.
train_data_preprocessed_df = pd.DataFrame(
    train_data_preprocessed,
    columns=column_names,
    index=train_data_model.index,
)

X = train_data_preprocessed_df
y = train_data_model["smoking"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Candidate classifiers, compared with K-fold cross-validation on the training split.
# The tree-based models are seeded so that re-running the cell gives the same scores.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(),
}

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# One row per (model, fold): [Model, Fold, AUC, Precision, Recall, Accuracy].
results_list = []

# Maps model name -> mean AUC over the folds.
mean_auc_scores = {}

for model_name, model in models.items():
    auc_scores = []
    for fold_index, (train_index, val_index) in enumerate(kf.split(X_train, y_train), 1):
        X_train_fold, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

        model.fit(X_train_fold, y_train_fold)

        # AUC is computed from positive-class probabilities; the label-based
        # metrics all share a single call to predict().
        y_pred_proba = model.predict_proba(X_val)[:, 1]
        y_pred = model.predict(X_val)

        auc1 = roc_auc_score(y_val, y_pred_proba)
        auc_scores.append(auc1)

        precision = precision_score(y_val, y_pred)
        recall = recall_score(y_val, y_pred)
        accuracy = accuracy_score(y_val, y_pred)

        results_list.append([model_name, fold_index, auc1, precision, recall, accuracy])

    mean_auc_scores[model_name] = np.mean(auc_scores)

results = pd.DataFrame(results_list, columns=['Model', 'Fold', 'AUC', 'Precision', 'Recall', 'Accuracy'])

print(results)

In [None]:
# Select the classifier with the highest mean cross-validated AUC, the same way
# the best transformation is chosen from `transform_scores`.
best_model_name = max(mean_auc_scores, key=mean_auc_scores.get)
best_model = models[best_model_name]

# Bundle preprocessing and the selected model into a single estimator.
# Calling pipeline.fit(...) refits both steps on the data passed in.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", best_model),
])

### Key Differences Between ColumnTransformer and Pipeline

Scope: The primary difference lies in their scope. Pipeline is broader and can encapsulate an entire workflow of data transformation and model application, while ColumnTransformer is focused on applying specific transformations to columns within a dataset.

Flexibility in Data Processing: ColumnTransformer provides the flexibility to apply different preprocessing techniques to different columns within a dataset, which is not inherently the focus of a Pipeline.

Integration: ColumnTransformer is often used as a step within a Pipeline, especially when different transformations are required for different types of data within the dataset. On the other hand, a Pipeline is used to streamline the entire process from data preprocessing to model training and prediction.

FunctionTransformer allows for custom, user-defined transformations to be seamlessly integrated into scikit-learn pipelines, while ColumnTransformer provides a structured way to apply different preprocessing steps to different columns based on their data type or specific requirements. 