In [13]:
from sklearn import metrics
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
import pickle
import numpy as np

In [46]:
X = pickle.load( open( "data_prepared/X.pkl", "rb" ) )
X = X.tocsr()

Y_churn = pickle.load( open( "data_prepared/Y_churn.pkl", "rb" ) )
Y_appet = pickle.load( open( "data_prepared/Y_appet.pkl", "rb" ) )
Y_upsell = pickle.load( open( "data_prepared/Y_upsell.pkl", "rb" ) )

In [51]:
def gridSearch(X, y, predName):
    cv = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    results = np.empty([0, 3])

    for train_index, test_index in cv.split(X, y):
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        count = 0
        for n_estimators in range(50,550,50):
            for max_depth in range(1,16):
                print('count:', count)
                count += 1
                mdl = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,  n_jobs=8, class_weight='balanced')
                mdl.fit(X_train, y_train)
                
                pred = mdl.predict(X_test)
                auc = metrics.roc_auc_score(y_test,pred)
                
                output = np.array([n_estimators,max_depth, auc])
                results = np.vstack((results, output))
                print('n_estimators:', n_estimators, 'max_depth:', max_depth, 'auc:', auc)
                
    n_estimators = results[np.argmax(results[:,2]),0]
    max_depth = results[np.argmax(results[:,2]),1]
    auc = results[np.argmax(results[:,2]),2]
    return auc, int(n_estimators), int(max_depth)


def crossVal(X, y, predName, n_estimators, max_depth):
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=0)
    aucResults = []
    f1Results = []
    count = 0 
    
    for train_index, test_index in cv.split(X, y):
        print(count, 'CV split is going on ...')
        count += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        mdl = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,  n_jobs=8, class_weight='balanced')
        mdl.fit(X_train, y_train)
        
        pred = mdl.predict(X_test)
        auc = metrics.roc_auc_score(y_test,pred)
        f1 = metrics.f1_score(y_test,pred)
        
        aucResults.append(auc)
        f1Results.append(f1)

    return aucResults, f1Results

In [52]:
print('Training models...')
auc, n_estimators, max_depth = gridSearch(X, Y_churn, 'churn')

print('churn - best random forest model: AUC=',auc,
      'n_estimators=',n_estimators,'max_depth=',max_depth)

aucResults, f1Results = crossVal(X, Y_churn, 'churn', n_estimators, max_depth)

print('AUC mean=',np.mean(aucResults), 'std=',np.std(aucResults))


Training models...
count: 0
n_estimators: 50 max_depth: 1 auc: 0.610925655296
count: 1
n_estimators: 50 max_depth: 2 auc: 0.613219180609
count: 2
n_estimators: 50 max_depth: 3 auc: 0.625680947828
count: 3
n_estimators: 50 max_depth: 4 auc: 0.615981047688
count: 4
n_estimators: 50 max_depth: 5 auc: 0.621730292453
count: 5
n_estimators: 50 max_depth: 6 auc: 0.621049491826
count: 6
n_estimators: 50 max_depth: 7 auc: 0.621827346409
count: 7
n_estimators: 50 max_depth: 8 auc: 0.6275313026
count: 8
n_estimators: 50 max_depth: 9 auc: 0.62146994448
count: 9
n_estimators: 50 max_depth: 10 auc: 0.628162889315
count: 10
n_estimators: 50 max_depth: 11 auc: 0.622422329358
count: 11
n_estimators: 50 max_depth: 12 auc: 0.613932512467
count: 12
n_estimators: 50 max_depth: 13 auc: 0.601996544133
count: 13
n_estimators: 50 max_depth: 14 auc: 0.58797381761
count: 14
n_estimators: 50 max_depth: 15 auc: 0.576599368727
count: 15
n_estimators: 100 max_depth: 1 auc: 0.61945899436
count: 16
n_estimators: 100 m

n_estimators: 450 max_depth: 15 auc: 0.571299329709
count: 135
n_estimators: 500 max_depth: 1 auc: 0.605418849154
count: 136
n_estimators: 500 max_depth: 2 auc: 0.617655596431
count: 137
n_estimators: 500 max_depth: 3 auc: 0.616539083402
count: 138
n_estimators: 500 max_depth: 4 auc: 0.62203803887
count: 139
n_estimators: 500 max_depth: 5 auc: 0.627340678424
count: 140
n_estimators: 500 max_depth: 6 auc: 0.628233005646
count: 141
n_estimators: 500 max_depth: 7 auc: 0.632480073027
count: 142
n_estimators: 500 max_depth: 8 auc: 0.629499662814
count: 143
n_estimators: 500 max_depth: 9 auc: 0.624405507719
count: 144
n_estimators: 500 max_depth: 10 auc: 0.622250399599
count: 145
n_estimators: 500 max_depth: 11 auc: 0.616689620074
count: 146
n_estimators: 500 max_depth: 12 auc: 0.615907398554
count: 147
n_estimators: 500 max_depth: 13 auc: 0.607065871472
count: 148
n_estimators: 500 max_depth: 14 auc: 0.587591195391
count: 149
n_estimators: 500 max_depth: 15 auc: 0.571958786317
churn - best 