## Run clusters through three different classification algorithms, choose whichever performs the best

In [1]:
import os
import pandas as pd
from glob import glob
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import matthews_corrcoef as mc

In [2]:
def make_data_dict(file):
    fnames = glob(os.path.join('..','data',file,'*.csv'))
    data = {name.split('/')[-1][:-4]: pd.read_csv(name, index_col='Id') for name in fnames}
    #some of the clusters don't have any failuers, can't model without both classes
    dfs = [d for d in data.keys() if data[d]['Response'].sum()>1]
    return data, dfs

In [3]:
def make_class_weights(data, dfs):
    class_weights = {}
    for df in dfs:
        dfx = data[df]
        n_failures = dfx.Response.sum()
        class_weights[df] = n_failures/dfx.shape[0]
    return class_weights

In [4]:
def split_training(data, dfs):
    X_trains, X_tests, y_trains, y_tests = {},{},{},{}
    for df in dfs:
        X_trains[df], X_tests[df], y_trains[df], y_tests[df] = \
        train_test_split(data[df].drop('Response', axis=1).values, \
                         data[df]['Response'], \
                         stratify=data[df]['Response'])
    return X_trains, X_tests, y_trains, y_tests

In [5]:
def train_classifiers(dfs, class_weights, X_trains, X_tests, y_trains, y_tests):
    lrs = {df: LogisticRegression(fit_intercept=True).fit(X_trains[df], y_trains[df]) for df in dfs}
    rfs = {df: RandomForestClassifier(max_features='sqrt',\
                                      class_weight={0:1-class_weights[df],\
                                      1:class_weights[df]}).fit(X_trains[df], y_trains[df])\
                                      for df in dfs}
    dts = {df: DecisionTreeClassifier(max_features='sqrt',\
                                      class_weight={0:1-class_weights[df],\
                                      1:class_weights[df]}).fit(X_trains[df], y_trains[df])\
                                      for df in dfs}
    classifiers = {'lrs': lrs, 'rfs': rfs, 'dts': dts}
    return classifiers

In [6]:
def make_preds(dfs, classifiers, X_tests):
    d_preds = {df: {} for df in dfs}
    for df in dfs:
        for name, classifier in classifiers.items():
            d_preds[df][name] = classifier[df].predict(X_tests[df])
    return d_preds

In [7]:
def score(dfs, d_preds, y_tests):
    d_scores = {df: {} for df in dfs} 
    for df in dfs:
        for name, preds in d_preds[df].items():
            d_scores[df][name] = mc(y_tests[df], preds)
    return d_scores

In [8]:
def choose_classifier(d_scores, dfs):
    d_choice = {}
    for df in dfs:
        best = -1
        for name, score in d_scores[df].items():
            if score > best:
                best = score
                choice = name
        d_choice[df] = choice
    return d_choice

In [9]:
def run(file):
    '''
    file points to the directory where cluters are located
    run this function to see how different munging strategies
    effect final scores
    '''
    data, dfs = make_data_dict(file)
    class_weights = make_class_weights(data, dfs)
    X_trains, X_tests, y_trains, y_tests = split_training(data, dfs)
    classifiers = train_classifiers(dfs, class_weights, X_trains, X_tests, y_trains, y_tests)
    d_preds = make_preds(dfs, classifiers, X_tests)
    d_scores = score(dfs, d_preds, y_tests)
    d_choice = choose_classifier(d_scores, dfs)
    scores = {df: d_scores[df][d_choice[df]] for df in dfs}
    return d_choice, scores

In [10]:
d_choice1, scores1 = run('clusters')

  mask |= (ar1 == a)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [11]:
scores1

{'df10': 0.0,
 'df11': 0.19457212765016693,
 'df12': 0.26549784807400767,
 'df13': 0.18596345639229586,
 'df14': 0.47223155034497682,
 'df15': 0.11696310486853943,
 'df16': 0.0,
 'df17': 0.0,
 'df18': 0.085089932191228318,
 'df20': 0.11356308680361073,
 'df24': 0.0,
 'df3': 0.069476263825991549,
 'df4': 0.0,
 'df5': 0.0,
 'df6': 0.0,
 'df7': 0.0,
 'df8': 0.0}

In [12]:
d_choice2, scores2 = run('pca')

  mask |= (ar1 == a)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


In [None]:
scores2

In [None]:
df11-pca, df-12-reg, df-14-reg, df-15-pca, df-18-reg, df-20-pca, df-3-reg, df-4-pca, df-6-pca 

In [26]:
scores1

{'df10': 0.0,
 'df11': 0.0,
 'df12': 0.26549784807400767,
 'df13': 0.0,
 'df14': 0.17956068337157555,
 'df15': 0.11638032635903485,
 'df16': 0.0,
 'df17': 0.0,
 'df18': 0.16369443324914995,
 'df20': 0.055332113372554295,
 'df24': 0.0,
 'df3': 0.084280438403725844,
 'df4': 0.0,
 'df5': 0.0,
 'df6': 0.0,
 'df7': 0.0,
 'df8': 0.0}

In [28]:
scores2

{'df10': 0.0,
 'df11': 0.35659399532816743,
 'df12': 0.0,
 'df13': 0.0,
 'df14': 0.12437863179442786,
 'df15': 0.22174730466123255,
 'df16': 0.0,
 'df17': 0.0,
 'df18': 0.12455931821902196,
 'df20': 0.12078333958308288,
 'df24': 0.0,
 'df3': 0.018734217374691813,
 'df4': 0.40139635583086408,
 'df5': 0.0,
 'df6': 0.35148122069129401,
 'df7': 0.0,
 'df8': 0.0}