In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

In [4]:
def modelfunc(X, y):
    pipelines = [
        ('LOGISTIC REGRESSION', (Pipeline([ ('LG', LogisticRegression(max_iter=800))]))),
        ('DECISION TREE', (Pipeline([ ('TREE', DecisionTreeClassifier())]))),
        ('BAGGED TREE', (Pipeline([ ('BAG', BaggingClassifier())]))),
        ('RANDOM FOREST', (Pipeline([ ('RAND', RandomForestClassifier())]))),
        ('ADABOOST', (Pipeline([ ('ADA', AdaBoostClassifier())]))),
        ('KNN', (Pipeline([ ('sc', StandardScaler()),('KNN', KNeighborsClassifier())])))
    ]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 
    stats = {}
    for pipe_name ,model in pipelines:
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        trainscore = model.score(X_train, y_train)
        testscore = model.score(X_test, y_test)
        crossval = cross_val_score(model, X_train, y_train).mean()
        acc_tr= metrics.accuracy_score(y_train, y_pred_train)
        acc_te = metrics.accuracy_score(y_test, y_pred_test)
        stats[pipe_name] ={'train':trainscore, 'test':testscore, 'cross':crossval}
    return stats

In [11]:
best_results = {'LOGISTIC REGRESSION':{'Train':0, 'Test':0, 'Crossval':0, 'Clusters':0},
                'DECISION TREE':{'Train':0, 'Test':0, 'Crossval':0, 'Clusters':0},
                'BAGGED TREE':{'Train':0, 'Test':0, 'Crossval':0, 'Clusters':0},
                'RANDOM FOREST':{'Train':0, 'Test':0, 'Crossval':0, 'Clusters':0},
                'ADABOOST':{'Train':0, 'Test':0, 'Crossval':0, 'Clusters':0},
                'KNN':{'Train':0, 'Test':0, 'Crossval':0, 'Clusters':0}
               }

for i in range(4, 11):
   
    # read and dummify df
    df = pd.read_csv('../data/concat_dfs.csv')
    df = pd.get_dummies(df, columns=['Class', 'Account'])
    
    #collect dummified column names
    col_names = []
    for col in df.columns:
        if 'Account' in col or 'Class' in col:
            col_names.append(col)

    #feature selection
    features = ['Level', 'Dead'] + col_names
    X = df[features]

    ss = StandardScaler()
    X_scaled = ss.fit_transform(X)

    #cluster the df i times
    km = KMeans(n_clusters=i, random_state=42)
    km.fit(X_scaled)
    df['cluster'] = km.labels_
    
    #recollect column names for model testing
    col_names2 = []
    for col in df.columns:
        if 'Class' in col:
            col_names2.append(col)
        #if 'Account' in col:
            col_names2.append(col)

    #test model results for i clusters
    features2 = ['cluster', 'Level'] + col_names2
    X2 = df[features2]
    y = df['Dead']
    stats = modelfunc(X2, y)
    for key in best_results:
        if stats[key]['test'] > best_results[key]['Test']:
            best_results[key]['Test'] = stats[key]['test']
            best_results[key]['Train'] = stats[key]['train']
            best_results[key]['Crossval'] = stats[key]['cross']
            best_results[key]['Clusters'] = i
    #pull results
best_results

{'LOGISTIC REGRESSION': {'Train': 0.9290833333333334,
  'Test': 0.92475,
  'Crossval': 0.9278333333333334,
  'Clusters': 8},
 'DECISION TREE': {'Train': 0.9464166666666667,
  'Test': 0.9365,
  'Crossval': 0.9391666666666666,
  'Clusters': 8},
 'BAGGED TREE': {'Train': 0.9460833333333334,
  'Test': 0.9375,
  'Crossval': 0.9407500000000001,
  'Clusters': 9},
 'RANDOM FOREST': {'Train': 0.9464166666666667,
  'Test': 0.93825,
  'Crossval': 0.9406666666666667,
  'Clusters': 8},
 'ADABOOST': {'Train': 0.9386666666666666,
  'Test': 0.9365,
  'Crossval': 0.9385833333333334,
  'Clusters': 8},
 'KNN': {'Train': 0.9395,
  'Test': 0.93225,
  'Crossval': 0.9369999999999999,
  'Clusters': 9}}