In [55]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [56]:
df = pd.read_csv('./Data.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [57]:
models = {
    'XGBoost': XGBClassifier(),
    'Logistic Regression': LogisticRegression(),
    'K-NN': KNeighborsClassifier(),
    'SVC': SVC(),
    'GauusianNB': GaussianNB(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'CATBoost': CatBoostClassifier(verbose = 0),
} 

In [58]:
sc = StandardScaler()

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)
X_scaled = sc.fit_transform(X)

In [60]:
results = pd.DataFrame(columns = ['Score'])

In [61]:
for modelName in models:
    pipe = Pipeline((('scaler', sc), ('pca', PCA()), ('model', models[modelName])))
    param_grid = {'pca__n_components': np.arange(1, df.columns.size)}

    if modelName == 'K-NN':
        param_grid['model__n_neighbors'] = np.arange(1, 50)
        Score = GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
        
    elif modelName == 'XGBoost':
        Score = cross_val_score(XGBClassifier(), X, LabelEncoder().fit_transform(y), cv = 10).mean()
        
    elif modelName == 'SVC':
        param_grid['model__kernel'] = ['rbf', 'poly', 'sigmoid']
        Score = GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
        
    elif modelName == 'Decision Tree':
        param_grid['model__criterion'] = ['gini', 'entropy', 'log_loss']
        param_grid['model__splitter'] = ['best', 'random']
        Score = GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
        
    elif modelName == 'Random Forest':
        param_grid['model__criterion'] = ['gini', 'entropy', 'log_loss']
        param_grid['model__max_features'] = ['sqrt', 'log2']
        param_grid['model__class_weight'] = ['balanced', 'balanced_subsample']
        Score = GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
    
    else:
        Score = GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
    
    results.loc[modelName] = dict(Score = '{:.2f}'.format(Score * 100) + '%')

In [62]:
results.sort_values('Score', ascending = False)

Unnamed: 0,Score
Random Forest,97.66%
Decision Tree,97.66%
K-NN,97.51%
SVC,97.51%
CATBoost,97.37%
Logistic Regression,96.94%
GauusianNB,96.93%
XGBoost,96.49%
