In [None]:
#LEVEL AGNOSTIC MODEL
import os
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score, precision_score
from sklearn.feature_selection import RFE, VarianceThreshold, SelectFromModel
import numpy as np

# Seed NumPy's global random number generator.
np.random.seed(0)

# --- Preprocessing / feature-selection switches (read further down in this cell) ---
DO_STANDARDIZE = False        # z-score every feature column before the folds are split
DO_PCA = False                # per fold: fit `pca` on the training rows, transform train and test
DO_POLY = False               # per fold: expand features with PolynomialFeatures(POLY_DEGREE)
DO_POLY_INTERACT = False      # per fold: expand features with interaction-only polynomial terms
DO_SELECT_FROM_MODEL = True   # per fold: keep the features picked by SelectFromModel on a fitted clf
DO_RFE = False                # per fold: recursive feature elimination with RFE defaults

# --- Sizes ---
NUM_FEAT = 40       # number of principal components; only consumed by `pca`
POLY_DEGREE = 2     # degree passed to PolynomialFeatures when DO_POLY is on
DEPTH = 7           # upper bound of the max_depth grid (values 1..DEPTH)
N_ESTIMATORS = 15   # upper bound of the n_estimators grid (1, 4, 7, ... below this bound)

# Shared PCA instance; it is re-fit on each training fold when DO_PCA is enabled.
pca = PCA(n_components=NUM_FEAT)


# Load the level-agnostic feature table and split it into features (X) and
# labels (Y). Both frames carry the precomputed 'fold' column so the
# cross-validation loop below can select rows per fold.
fname = "ModelA_full_level_agnostic_test_new_all_extra_stud.csv" ##CHANGE FILE NAME HERE
# The original call was missing its closing parenthesis (SyntaxError); the
# single-argument os.path.join() it wrapped was a no-op and is dropped.
df_ml = pd.read_csv(fname)

# Target ('quit') together with the fold id of each row.
Y = df_ml.loc[:, ['quit', 'fold']]
# Feature matrix: everything except identifiers, the target and the fold id.
X = df_ml.drop(['userId', 'total_records', 'quit', 'fold'], axis=1)

#DATA PREPROCESSING - OUTER LOOP
# Optional z-score standardisation. 'fold' was dropped above so it is not
# rescaled. NOTE: mean/std are computed over all rows, i.e. before the
# train/test split; a constant column has std 0 and would become NaN/inf.
if DO_STANDARDIZE:
    X = (X - X.mean()) / X.std()

# Re-attach the fold id so X can be filtered by fold.
X['fold'] = Y['fold']

N = X.shape[0]      # number of rows
D = X.shape[1] - 1  # number of features (minus the extra fold column)

# Single-argument print(...) with %-formatting produces the same text as the
# original Python 2 print statements and is also valid on Python 3.
print("N = %s" % N)
print("D = %s" % D)

print("Quit percent overall = %s" % (float(sum(Y['quit'])) / float(N)))

# Grid search over GradientBoostingClassifier(max_depth, n_estimators) using
# the precomputed 5-fold split stored in the 'fold' column. For every grid
# point the mean AUC and mean F1 over the usable folds are computed, and a
# line is printed whenever a new best AUC or best F1 is found.

# max_depth grid: 1..DEPTH; n_estimators grid: 1, 4, 7, ... (step 3) up to N_ESTIMATORS.
hyp_list = list(range(1, DEPTH + 1))
hyp_list2 = list(range(1, N_ESTIMATORS + 1, 3))

best_auc = -1
best_f1 = -1
best_hyp_auc = None
best_hyp_f1 = None
best_hyp_auc2 = None
best_hyp_f12 = None

# Records one entry per skipped fold. It has to exist in this cell: the
# handlers below append to it and it was previously only defined in the
# level-specific cell, so the first skipped fold raised NameError.
skip_file = []

# Fold ids used by the split; rows with any other fold id are never used.
ALL_FOLDS = [1, 2, 3, 4, 5]

#HYPERPARAMETER LOOP
for hyp in hyp_list:
    for hyp2 in hyp_list2:
        auc = []
        f1 = []
        precision = []

        #CROSS VALIDATION LOOP
        for test_fold in ALL_FOLDS:

            #train-test split
            train_folds = [fold_id for fold_id in ALL_FOLDS if fold_id != test_fold]

            # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
            X_test = X[X['fold'] == test_fold].drop(['fold'], axis=1).values
            X_train = X[X['fold'].isin(train_folds)].drop(['fold'], axis=1).values

            Y_test = Y.loc[Y['fold'] == test_fold, 'quit'].values
            Y_train = Y.loc[Y['fold'].isin(train_folds), 'quit'].values

            #DATA PREPROCESSING - INNER LOOP
            # Every transformer is fit on the training fold only and then
            # applied to both the training and the test fold.

            #dimensionality reduction
            if DO_PCA:
                pca.fit(X_train) #NUM_FEAT specified earlier
                X_train = pca.transform(X_train)
                X_test = pca.transform(X_test)

            #polynomial features
            if DO_POLY:
                poly = PolynomialFeatures(POLY_DEGREE)
                poly.fit(X_train)
                X_train = poly.transform(X_train)
                X_test = poly.transform(X_test)

            #polynomial features - interaction only
            if DO_POLY_INTERACT:
                poly = PolynomialFeatures(interaction_only=True)
                poly.fit(X_train)
                X_train = poly.transform(X_train)
                X_test = poly.transform(X_test)

            clf = GradientBoostingClassifier(max_depth=hyp, n_estimators=hyp2, random_state=0)

            #FEATURE SELECTION - Select from model
            if DO_SELECT_FROM_MODEL:
                try:
                    clf.fit(X_train, Y_train)
                except ValueError:
                    # Narrowed from a bare except so KeyboardInterrupt and
                    # programming errors are no longer swallowed.
                    # presumably raised for unusable training data — TODO confirm
                    skip_file.append(fname)
                    continue
                # The classifier is already fitted (prefit=True); the selected
                # columns are kept and clf is re-fit on them below.
                model = SelectFromModel(clf, prefit=True)
                X_train = model.transform(X_train)
                X_test = model.transform(X_test)

            #FEATURE SELECTION LOOP
            if DO_RFE:
                rfe = RFE(clf) #RFE's default number of features to select
                rfe = rfe.fit(X_train, Y_train)
                X_train = X_train[:, rfe.support_]
                X_test = X_test[:, rfe.support_]

            clf.fit(X_train, Y_train)

            Y_pred = clf.predict(X_test) #binary
            Y_prob = clf.predict_proba(X_test) #probability

            # Compute all three metrics before appending any of them so the
            # lists always stay the same length, even if one metric fails.
            try:
                fold_auc = roc_auc_score(Y_test, Y_prob[:, 1])
                fold_f1 = f1_score(Y_test, Y_pred)
                fold_precision = precision_score(Y_test, Y_pred)
            except ValueError:
                #only happens for work it up - all quit in a fold
                skip_file.append(fname)
                continue
            auc.append(fold_auc)
            f1.append(fold_f1)
            precision.append(fold_precision)

        if not auc:
            # Every fold was skipped for this grid point. np.mean([]) would be
            # nan (with a RuntimeWarning) and could never beat the current
            # best, so skip the comparison explicitly.
            continue

        mean_auc = np.mean(auc)
        mean_f1 = np.mean(f1)

        if mean_auc > best_auc:
            best_auc = mean_auc
            best_auc_f1 = mean_f1
            best_hyp_auc = hyp
            best_hyp_auc2 = hyp2
            print("AUC - Hyp = %s Hyp2 = %s F1 = %s CV auc = %s"
                  % (best_hyp_auc, best_hyp_auc2, best_auc_f1, best_auc))

        if mean_f1 > best_f1:
            best_f1 = mean_f1
            best_f1_auc = mean_auc
            best_hyp_f1 = hyp
            best_hyp_f12 = hyp2
            print("F1 - Hyp = %s Hyp2 = %s CV f1 = %s AUC = %s"
                  % (best_hyp_f1, best_hyp_f12, best_f1, best_f1_auc))
    

In [None]:
#LEVEL SPECIFIC MODELS 
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import average_precision_score, f1_score, roc_auc_score, precision_score
from sklearn.feature_selection import RFE, VarianceThreshold, SelectFromModel
import numpy as np

# Train and evaluate one default GradientBoostingClassifier per level file
# found in TestFullNewLevels/, using the precomputed 5-fold split stored in
# each file's 'fold' column. Results are collected in model_performance_dict
# as fname -> [mean AUC, mean F1, quit ratio, number of distinct users].
np.random.seed(0)
DO_SELECT_FROM_MODEL = True
model_performance_dict = {}
# One entry per skipped fold (or per empty file); printed at the end.
skip_file = []

# Fold ids used by the split; rows with any other fold id are never used.
ALL_FOLDS = [1, 2, 3, 4, 5]

# sorted() makes the processing (and printing) order reproducible;
# os.listdir() returns entries in arbitrary order.
for fname in sorted(os.listdir("TestFullNewLevels")):

    # Only visible CSV files are level tables.
    if not fname.endswith(".csv") or fname.startswith('.'):
        continue

    print("Processing %s" % fname)
    df_ml = pd.read_csv(os.path.join("TestFullNewLevels/", fname))
    num_users = len(df_ml['userId'].unique())

    # Target ('quit') together with the fold id of each row.
    Y = df_ml.loc[:, ['quit', 'fold']]
    # Features: identifiers, the target and the pause-related columns are
    # removed; 'fold' is kept so rows can be filtered per fold. (An earlier
    # assignment to X that was immediately overwritten has been removed.)
    X = df_ml.drop(['userId', 'total_records', 'quit', 'number_pauses_attempt','total_pause_duration_attempt', 'time_elapsed_last_pause', 'num_pauses', 'last_pause_duration'], axis=1)

    N = X.shape[0]      # number of rows
    D = X.shape[1] - 1  # number of features (minus the extra fold column)

    print("N = %s" % N)
    print("D = %s" % D)

    if N == 0:
        # An empty table would divide by zero below; record it and move on.
        skip_file.append(fname)
        continue

    quit_ratio = float(sum(Y['quit'])) / float(N)
    print("Quit percent overall = %s" % quit_ratio)

    auc = []
    f1 = []
    precision = []

    #CROSS VALIDATION LOOP
    for test_fold in ALL_FOLDS:

        #train-test split
        train_folds = [fold_id for fold_id in ALL_FOLDS if fold_id != test_fold]

        # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
        X_test = X[X['fold'] == test_fold].drop(['fold'], axis=1).values
        X_train = X[X['fold'].isin(train_folds)].drop(['fold'], axis=1).values

        Y_test = Y.loc[Y['fold'] == test_fold, 'quit'].values
        Y_train = Y.loc[Y['fold'].isin(train_folds), 'quit'].values

        clf = GradientBoostingClassifier(random_state=0)

        #FEATURE SELECTION - Select from model
        if DO_SELECT_FROM_MODEL:
            try:
                clf.fit(X_train, Y_train)
            except ValueError:
                # Narrowed from a bare except so KeyboardInterrupt and
                # programming errors are no longer swallowed.
                # presumably raised for unusable training data — TODO confirm
                skip_file.append(fname)
                continue
            # The classifier is already fitted (prefit=True); the selected
            # columns are kept and clf is re-fit on them below.
            model = SelectFromModel(clf, prefit=True)
            X_train = model.transform(X_train)
            X_test = model.transform(X_test)

        clf.fit(X_train, Y_train)

        Y_pred = clf.predict(X_test) #binary
        Y_prob = clf.predict_proba(X_test) #probability

        # Compute all three metrics before appending any of them so the
        # lists always stay the same length, even if one metric fails.
        try:
            fold_auc = roc_auc_score(Y_test, Y_prob[:, 1])
            fold_f1 = f1_score(Y_test, Y_pred)
            fold_precision = precision_score(Y_test, Y_pred)
        except ValueError:
            #only happens for work it up - all quit in a fold
            skip_file.append(fname)
            continue
        auc.append(fold_auc)
        f1.append(fold_f1)
        precision.append(fold_precision)

    print("---------------------------------------")
    print("---------------------------------------")

    # If every fold was skipped the means are undefined; store nan explicitly
    # instead of relying on np.mean([]) (which also emits a RuntimeWarning).
    mean_auc = np.mean(auc) if auc else np.nan
    mean_f1 = np.mean(f1) if f1 else np.nan
    model_performance_dict[fname] = [mean_auc, mean_f1, quit_ratio, num_users]

print(skip_file)