## Run clusters through three different classification algorithms and choose whichever performs best

In [1]:
import json
import os
import pickle
from glob import glob

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef as mc
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
def make_data_dict(file):
    '''
    Load every .csv found in data/<file>/ and pick the ones that can be modelled.

    INPUT: name of the sub-directory of 'data' that holds the .csv files
    OUTPUT: tuple (data, dfs)
        data: dict mapping each file's base name (extension stripped) to its
              DataFrame, indexed by the 'Id' column
        dfs:  lst of the keys of data whose 'Response' column sums to more than 1
    '''
    # Sorted so that the order of the returned keys does not depend on the
    # arbitrary order in which glob lists the directory.
    fnames = sorted(glob(os.path.join('data', file, '*.csv')))
    # basename/splitext rather than splitting on '/' so the key is derived
    # correctly whatever path separator the platform uses.
    data = {
        os.path.splitext(os.path.basename(name))[0]: pd.read_csv(name, index_col='Id')
        for name in fnames
    }
    # Some of the clusters have hardly any failures; a cluster can't be modelled
    # without both classes. The "more than one" threshold presumably exists so
    # the stratified train/test split can keep a failure on each side.
    dfs = [key for key, frame in data.items() if frame['Response'].sum() > 1]
    return data, dfs

In [3]:
def get_class_weights(data, dfs):
    '''
    INPUT: dict of DataFrames keyed by name, lst of the keys to process
    OUTPUT: dict mapping each key to the sum of its 'Response' column
    divided by its number of rows
    '''
    return {
        key: data[key].Response.sum() / data[key].shape[0]
        for key in dfs
    }

In [4]:
def split_training(data, dfs, random_state=None):
    '''
    Split each selected DataFrame into stratified train and test partitions.

    INPUT: dict of DataFrames keyed by name, lst of the keys to split,
    optional random_state forwarded to train_test_split (pass an int to make
    the split, and therefore the later model comparison, reproducible)
    OUTPUT: tuple (X_trains, X_tests, y_trains, y_tests), each a dict keyed
    like dfs; the X entries are arrays of every column except 'Response',
    the y entries are the matching 'Response' values
    '''
    X_trains, X_tests, y_trains, y_tests = {}, {}, {}, {}
    for df in dfs:
        frame = data[df]
        target = frame['Response']
        features = frame.drop('Response', axis=1).values
        # Stratify on the target so both partitions keep the class proportions.
        X_trains[df], X_tests[df], y_trains[df], y_tests[df] = train_test_split(
            features, target, stratify=target, random_state=random_state)
    return X_trains, X_tests, y_trains, y_tests

In [5]:
def train_classifiers(dfs, class_weights, X_trains, X_tests, y_trains, y_tests, munge):
    '''
    Fit a logistic regression, a random forest and a decision tree per cluster.

    INPUT: lst of cluster keys, dict of failure fractions per cluster (as
    produced by get_class_weights), dicts of train/test features and targets
    keyed by cluster, and the munge name used to label the result.
    X_tests and y_tests are accepted for signature compatibility but unused.
    OUTPUT: dict {munge: {'lrs': {...}, 'rfs': {...}, 'dts': {...}}} where each
    inner dict maps a cluster key to its fitted estimator
    '''
    def tree_weights(df):
        # class_weights[df] is the fraction of failures (class 1). Weighting
        # each class by the *other* class's frequency gives the rarer class
        # the larger weight; the previous {0: 1 - rate, 1: rate} mapping did
        # the opposite and amplified the imbalance instead of offsetting it.
        failure_rate = class_weights[df]
        return {0: failure_rate, 1: 1 - failure_rate}

    lrs = {
        df: LogisticRegression(fit_intercept=True).fit(X_trains[df], y_trains[df])
        for df in dfs
    }
    rfs = {
        df: RandomForestClassifier(
            max_features='sqrt', class_weight=tree_weights(df)
        ).fit(X_trains[df], y_trains[df])
        for df in dfs
    }
    dts = {
        df: DecisionTreeClassifier(
            max_features='sqrt', class_weight=tree_weights(df)
        ).fit(X_trains[df], y_trains[df])
        for df in dfs
    }

    return {munge: {'lrs': lrs, 'rfs': rfs, 'dts': dts}}

In [6]:
def make_preds(dfs, classifiers, X_tests, munge=None):
    '''
    Predict on the held-out features with every fitted classifier.

    INPUT: lst of cluster keys; classifiers, either the full
    {munge: {kind: {cluster: estimator}}} dict together with munge, or the
    inner {kind: {cluster: estimator}} dict with munge left as None;
    dict of test features keyed by cluster
    OUTPUT: dict {cluster: {kind: predictions}}
    '''
    # Callers pass either the munge-keyed wrapper or the already-unwrapped
    # mapping; accept both so a missing munge argument is not a TypeError.
    by_kind = classifiers if munge is None else classifiers[munge]
    d_preds = {}
    for df in dfs:
        d_preds[df] = {
            name: fitted[df].predict(X_tests[df])
            for name, fitted in by_kind.items()
        }
    return d_preds

In [7]:
def score(dfs, d_preds, y_tests):
    '''
    INPUT: lst of cluster keys, dict {cluster: {kind: predictions}},
    dict of true test targets keyed by cluster
    OUTPUT: dict {cluster: {kind: Matthews correlation coefficient}}
    '''
    d_scores = {}
    for df in dfs:
        d_scores[df] = {
            name: mc(y_tests[df], preds)
            for name, preds in d_preds[df].items()
        }
    return d_scores

In [8]:
def choose_classifier(d_scores, dfs):
    '''
    Pick the best-scoring classifier kind for each cluster.

    INPUT: dict {cluster: {kind: score}}, lst of cluster keys
    OUTPUT: dict {cluster: kind with the highest score}; on a tie the kind
    that comes first in the cluster's score dict wins
    RAISES: ValueError if a cluster has no scores at all
    '''
    d_choice = {}
    for df in dfs:
        candidates = d_scores[df]
        if not candidates:
            raise ValueError('no classifier scores available for %r' % (df,))
        # max() instead of comparing against a starting value of -1: a
        # cluster whose scores are all -1 would otherwise get no choice of
        # its own (or silently inherit the previous cluster's choice).
        d_choice[df] = max(candidates, key=candidates.get)
    return d_choice

In [9]:
def get_result(munges):
    '''
    Train, score and rank the classifiers for every munge.

    INPUT: lst of directories containing .csv files,
    each entry corresponding to a different munge
    OUTPUT: tuple (result, dfs, models)
        result: lst of single-key dicts
                {cluster: {'score': ..., 'classifier': ..., 'munge': ...}},
                one per modelled cluster per munge
        dfs:    lst of every modelled cluster key across all munges,
                in first-seen order
        models: dict {munge: {'lrs': {...}, 'rfs': {...}, 'dts': {...}}}
                holding the fitted estimators
    '''
    result = []
    models = {}
    # Collected across munges so the return value is defined even for an
    # empty input and is not limited to the last munge's clusters.
    all_dfs = []
    seen = set()
    for munge in munges:
        data, dfs = make_data_dict(munge)
        class_weights = get_class_weights(data, dfs)
        X_trains, X_tests, y_trains, y_tests = split_training(data, dfs)
        classifiers = train_classifiers(
            dfs, class_weights, X_trains, X_tests, y_trains, y_tests, munge)
        models[munge] = classifiers[munge]
        # make_preds takes the munge-keyed dict plus the munge name.
        d_preds = make_preds(dfs, classifiers, X_tests, munge)
        d_scores = score(dfs, d_preds, y_tests)
        d_choice = choose_classifier(d_scores, dfs)
        for df in dfs:
            best_kind = d_choice[df]
            result.append({
                df: {
                    'score': d_scores[df][best_kind],
                    'classifier': best_kind,
                    'munge': munge,
                },
            })
            if df not in seen:
                seen.add(df)
                all_dfs.append(df)
    return result, all_dfs, models

In [10]:
def _best_per_cluster(result):
    '''Return {cluster: {'classifier', 'munge'}} for the top-scoring entry of each cluster.'''
    hi_scores = {}
    final = {}
    for entry in result:
        # Each entry is a single-key dict {cluster: {...}}.
        for name, info in entry.items():
            # The bar starts at 0.0, so a cluster whose best score is not
            # above zero gets no entry (and no saved model).
            if info['score'] > hi_scores.get(name, 0.0):
                hi_scores[name] = info['score']
                final[name] = {'classifier': info['classifier'], 'munge': info['munge']}
    return final


def run():
    '''
    Compare the 'pca' and 'clusters' munges, keep the best classifier per
    cluster and persist the outcome under ../data/models.

    The munge directory name (rather than a boolean flag) records which
    preprocessing produced each model and is used to keep track of models.

    OUTPUT: None; writes one pickled estimator per cluster to
    ../data/models/<munge>/<cluster> and the chosen classifier/munge per
    cluster, as JSON, to ../data/models/modelchoices.json
    '''
    result, _, models = get_result(['pca', 'clusters'])
    final = _best_per_cluster(result)

    models_dir = os.path.join('..', 'data', 'models')
    os.makedirs(models_dir, exist_ok=True)
    for name, d in final.items():
        munge_dir = os.path.join(models_dir, d['munge'])
        os.makedirs(munge_dir, exist_ok=True)
        with open(os.path.join(munge_dir, name), 'wb') as f:
            # models is keyed munge -> classifier kind -> cluster.
            pickle.dump(models[d['munge']][d['classifier']][name], f)
    # The file is named .json, so write real JSON (plain str keys and values).
    with open(os.path.join(models_dir, 'modelchoices.json'), 'w') as f:
        json.dump(final, f)
In [None]:
# Execute the whole pipeline: train, compare and persist the chosen models.
run()

  mask |= (ar1 == a)
