In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('./Data.csv')
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [4]:
models = {
    'XGBoost': XGBClassifier(),
    'Logistic Regression': LogisticRegression(),
    'K-NN': KNeighborsClassifier(),
    'SVC': SVC(),
    'GauusianNB': GaussianNB(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'CATBoost': CatBoostClassifier(verbose = 0),
} 

In [5]:
dimensionalityReduction = {
    'Full': '',
    'PCA': PCA(),
    'LDA': LDA(),
}

In [6]:
results = pd.DataFrame(columns = list(dimensionalityReduction), index = list(models))

In [7]:
for technique in dimensionalityReduction.items():
    
    for modelName in models.items():
        estimator = Pipeline([('sc', StandardScaler()), ('technique', technique[1]), ('model', modelName[1])])
        param_grid = {'technique__n_components': np.arange(1, df.columns.size)}
        
        if technique[0] == 'Full':
            estimator.steps.pop(1)
            param_grid = dict()

        if modelName[0] == 'K-NN':
            param_grid['model__n_neighbors'] = np.arange(1, 50)
            Score = GridSearchCV(estimator, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
            
        elif modelName[0] == 'XGBoost':
            Score = GridSearchCV(estimator, param_grid).fit(X, LabelEncoder().fit_transform(y)).best_score_
            
        elif modelName[0] == 'SVC':
            param_grid['model__kernel'] = ['rbf', 'poly', 'sigmoid']
            Score = GridSearchCV(estimator, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
            
        elif modelName[0] == 'Decision Tree':
            param_grid['model__criterion'] = ['gini', 'entropy', 'log_loss']
            param_grid['model__splitter'] = ['best', 'random']
            Score = GridSearchCV(estimator, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
            
        elif modelName[0] == 'Random Forest':
            param_grid['model__criterion'] = ['gini', 'entropy', 'log_loss']
            param_grid['model__max_features'] = ['sqrt', 'log2']
            param_grid['model__class_weight'] = ['balanced', 'balanced_subsample']
            Score = GridSearchCV(estimator, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
        
        else:
            Score = GridSearchCV(estimator, param_grid, cv = 10, n_jobs = -1).fit(X, y).best_score_
        
        results.loc[modelName[0], technique[0]] = '{:.2f}'.format(Score * 100) + '%'

In [9]:
results.sort_values('Full', ascending = False)

Unnamed: 0,Full,PCA,LDA
Random Forest,97.37%,97.51%,95.48%
K-NN,97.08%,97.51%,97.23%
CATBoost,96.93%,97.37%,97.22%
SVC,96.78%,97.37%,96.93%
Logistic Regression,96.64%,96.94%,96.21%
GauusianNB,96.20%,96.93%,96.94%
XGBoost,95.18%,97.37%,95.47%
Decision Tree,94.74%,97.36%,95.48%
