## Importing Necessary Libraries for Machine Learning Pipeline


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler         
from sklearn.preprocessing import OneHotEncoder          
from sklearn.impute import SimpleImputer                
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

In [2]:
# Load the dataset from a CSV file given by a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='binary_classifier_dataset.csv')

## Automated Classification Pipeline with Hyperparameter Tuning


In [3]:
def _resolve_target(dataset, target=None):
    """Return the name of the label column, inferring it when ``target`` is None.

    A column is a label candidate when it holds exactly two distinct values
    and no missing values. Inference only succeeds when exactly one such
    column exists; otherwise the caller must name the target explicitly.

    Raises:
        KeyError: ``target`` was given but is not a column of ``dataset``.
        ValueError: ``target`` contains missing values, or inference found
            zero or more than one candidate column.
    """
    if target is not None:
        if target not in dataset.columns:
            raise KeyError(f"Target column {target!r} is not in the dataset.")
        if dataset[target].isna().any():
            raise ValueError(f"Target column {target!r} contains missing values.")
        return target

    candidates = [
        col for col in dataset.columns
        if not dataset[col].isna().any() and dataset[col].nunique(dropna=False) == 2
    ]
    if not candidates:
        raise ValueError(
            "No binary column without missing values was found; "
            "pass the label column name via `target`."
        )
    if len(candidates) > 1:
        # Guessing here would silently drop binary features or train on the
        # wrong label, so ask the caller to disambiguate instead.
        raise ValueError(
            f"Several columns look like a binary target: {candidates}. "
            "Pass the label column name via `target`."
        )
    return candidates[0]


def _build_preprocessor(X):
    """Build the ColumnTransformer for the feature frame ``X``.

    Object columns are one-hot encoded and numeric columns are standardised.
    An imputer is prepended to a branch only when that branch's columns
    contain missing values. Columns of any other dtype are listed in neither
    branch; ``remainder`` is left at its default.
    """
    # Column groups come from X (not the full dataset), so the target can
    # never leak into them regardless of its dtype.
    categorical_cols = X.select_dtypes('object').columns.tolist()
    numerical_cols = X.select_dtypes(include='number').columns.tolist()

    numerical_steps = [('StandardScaler', StandardScaler())]
    if X[numerical_cols].isna().any().any():
        numerical_steps.insert(0, ('SimpleImputer', SimpleImputer(strategy='median')))

    categorical_steps = [('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))]
    if X[categorical_cols].isna().any().any():
        categorical_steps.insert(0, ('SimpleImputer', SimpleImputer(strategy='most_frequent')))

    return ColumnTransformer(transformers=[
        ('categorical_cols', Pipeline(categorical_steps), categorical_cols),
        ('numerical_cols', Pipeline(numerical_steps), numerical_cols)
    ])


def _candidate_models(random_state):
    """Return ``(estimator, param_distributions)`` pairs to tune.

    Distribution keys use the ``model__`` prefix because the estimator is the
    ``'model'`` step of the pipeline built in ``model``.
    """
    return [
        (
            # Seeded explicitly so repeated runs give the same result.
            LogisticRegression(random_state=random_state),
            {
                'model__C': uniform(0.01, 10),
                'model__penalty': ['l2'],
                'model__solver': ['lbfgs', 'saga'],
                'model__max_iter': [100, 200, 500]
            },
        ),
        (
            # Seeded explicitly: an unseeded forest makes the run unrepeatable.
            RandomForestClassifier(random_state=random_state),
            {
                'model__n_estimators': randint(50, 200),
                'model__max_depth': randint(2, 20),
                'model__min_samples_split': randint(2, 10),
                'model__min_samples_leaf': randint(1, 10)
            },
        ),
        (
            KNeighborsClassifier(),
            {
                'model__n_neighbors': randint(1, 30),
                'model__weights': ['uniform', 'distance'],
                'model__p': [1, 2]
            },
        ),
    ]


def model(dataset, target=None, random_state=123):
    """Tune and evaluate several classifiers on ``dataset``, printing results.

    The data is split 80/20 into train and test sets. For each of
    LogisticRegression, RandomForestClassifier and KNeighborsClassifier a
    preprocessing + estimator pipeline is tuned with a 20-iteration,
    5-fold ``RandomizedSearchCV`` scored on accuracy. The best
    cross-validation score, the best parameters and the held-out test
    accuracy are printed for each model.

    Args:
        dataset: DataFrame holding the features and the label column.
        target: Name of the label column. When None, the single column with
            exactly two distinct values and no missing values is used.
        random_state: Seed used for the split, the search and the
            estimators that accept one.

    Returns:
        None. Results are reported via ``print`` only.

    Raises:
        KeyError: ``target`` is not a column of ``dataset``.
        ValueError: the target cannot be inferred unambiguously, or the
            given target contains missing values.
    """
    target = _resolve_target(dataset, target)

    X = dataset.drop(columns=[target])
    y = dataset[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=random_state
    )

    preprocessor = _build_preprocessor(X)

    for clf_model, param_dist in _candidate_models(random_state):
        model_name = clf_model.__class__.__name__

        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('model', clf_model)
        ])

        search = RandomizedSearchCV(pipe, param_distributions=param_dist,
                                    n_iter=20, scoring='accuracy',
                                    cv=5, random_state=random_state, n_jobs=-1)

        search.fit(X_train, y_train.to_numpy())

        print(f"Best {model_name} accuracy: {search.best_score_:.4f}")
        print(f"Best {model_name} params: {search.best_params_}")

        y_pred = search.predict(X_test)
        print(f"Test accuracy for best {model_name}: {accuracy_score(y_test, y_pred):.4f}\n")


## Testing the Model Function


In [4]:
# Run the tuning pipeline on the loaded dataset; per-model results are printed below.
model(dataset=df)

Best LogisticRegression accuracy: 0.5975
Best LogisticRegression params: {'model__C': np.float64(6.974691855978616), 'model__max_iter': 500, 'model__penalty': 'l2', 'model__solver': 'lbfgs'}
Test accuracy for best LogisticRegression: 0.5250

Best RandomForestClassifier accuracy: 0.6088
Best RandomForestClassifier params: {'model__max_depth': 9, 'model__min_samples_leaf': 3, 'model__min_samples_split': 6, 'model__n_estimators': 97}
Test accuracy for best RandomForestClassifier: 0.5050

Best KNeighborsClassifier accuracy: 0.5988
Best KNeighborsClassifier params: {'model__n_neighbors': 8, 'model__p': 1, 'model__weights': 'uniform'}
Test accuracy for best KNeighborsClassifier: 0.5200

