In [1]:
import pandas as pd
import numpy as np
import os
import time
from matplotlib import pyplot as plt
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import Normalizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from joblib import dump, load
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import json


# Visualise the number of defaults per year

In [2]:
def visualise_default():
    """Plot default counts and loan counts per year as a stacked bar chart.

    For every year from 2004 to 2018 the file ``data_flag/<year>_flag.csv`` is
    read and its ``default_flag`` column is counted. The two count series are
    drawn as stacked bars with a table of the raw numbers underneath. The
    figure is saved to ``./Defaults.jpg`` and shown.

    Returns:
        None.
    """
    year_list = [str(year) for year in range(2004, 2019)]
    default = []
    total = []
    for year_ in year_list:
        print("Loading data...")
        data_path = 'data_flag/{year}_flag.csv'.format(year=year_)
        # Only the flag column is needed for counting, so nothing else is parsed.
        flags = pd.read_csv(data_path, usecols=['default_flag'])['default_flag'].astype(int)
        # Summing the 0/1 flags stays correct even when a year contains no
        # default at all (indexing the result of np.unique would fail there).
        default.append(int(flags.sum()))
        total.append(int(flags.size))

    data_ = [default, total]
    columns = year_list
    rows = ['default', 'number of accounts']
    fig, ax = plt.subplots(figsize=(30, 20))
    colors = plt.cm.BuPu(np.linspace(0, 0.5, len(rows)))

    index = np.arange(len(columns)) + 0.6
    bar_width = 0.4

    # Running vertical offset so that each row is stacked on the previous one.
    # NOTE(review): the account totals are stacked on top of the default
    # counts, so bar height is default + total -- confirm this is intended.
    y_offset = np.zeros(len(columns))
    for row_values, row_colour in zip(data_, colors):
        ax.bar(index, row_values, bar_width, bottom=y_offset, color=row_colour)
        y_offset = y_offset + np.asarray(row_values)

    # Table with the raw counts underneath the axes.
    the_table = ax.table(cellText=data_,
                         rowLabels=rows,
                         rowColours=colors,
                         colLabels=columns,
                         loc='bottom')
    the_table.set_fontsize(25)
    the_table.scale(1.0, 2.0)

    values = np.arange(0, 2000000, 500000)
    ax.set_ylabel("Person", fontsize=20)
    ax.set_yticks(values)
    ax.set_yticklabels(['%d' % val for val in values], fontsize=20)
    # The table's column labels replace the x tick labels.
    ax.set_xticks([])
    ax.set_title('Default counts by years', fontsize=20)

    fig.savefig('./Defaults.jpg')
    plt.show()

# Load Data


In [3]:
def load_data(data_path, data_path_time):
    """Load loan records, attach a default flag and impute sentinel values.

    Args:
        data_path: Directory whose files are '|'-separated, header-less loan
            files with 26 columns (see the NOTE on column counts below).
        data_path_time: Directory whose files are '|'-separated, header-less
            per-period records; columns 0, 3 and 4 are read.

    Returns:
        pandas.DataFrame with the selected loan features and a boolean
        ``default_flag`` column. ``LOAN_SQ_NUMBER`` is dropped after the flag
        has been computed. A loan is flagged when at least one of its
        per-period records has a purely numeric column 3 greater than 2 while
        column 4 is below 48.

    Raises:
        ValueError: from ``pd.concat`` when a directory contains no files.
    """
    time_start = time.time()

    # NOTE(review): the original author observed that the number of columns
    # differs between years (26 or 27). With 27 columns this 26-name list does
    # not line up with the file -- extend the list before loading such years.
    column_names = [
        'CREDIT_SCORE',
        'FIRST_PAYMENT_DATE',
        'FIRST_TIME_HOMEBUYER_FLAG',
        '4', '5', '6',
        'NUMBER_OF_UNITS',
        'OCCUPANCY_STATUS',
        '9',
        'ORIGINAL_DTI_RATIO',
        'ORIGINAL_UPB',
        'ORIGINAL_LTV',
        'ORIGINAL_INTEREST_RATE',
        'CHANNEL',
        '15',
        'PRODUCT_TYPE',
        'PROPERTY_STATE',
        'PROPERTY_TYPE',
        '19',
        'LOAN_SQ_NUMBER',
        'LOAN_PURPOSE',
        'ORIGINAL_LOAN_TERM',
        'NUMBER_OF_BORROWERS',
        '24', '25', '26',
    ]
    # Built-in / explicit dtypes instead of the np.str and np.float_ aliases,
    # which no longer exist in current NumPy releases.
    feature_dtypes = {
        'CREDIT_SCORE': np.float64,
        'FIRST_TIME_HOMEBUYER_FLAG': str,
        'NUMBER_OF_UNITS': np.int_,
        'OCCUPANCY_STATUS': str,
        'ORIGINAL_DTI_RATIO': np.float64,
        'ORIGINAL_UPB': np.float64,
        'ORIGINAL_LTV': np.float64,
        'ORIGINAL_INTEREST_RATE': np.float64,
        'CHANNEL': str,
        'PROPERTY_TYPE': str,
        'LOAN_SQ_NUMBER': str,
        'LOAN_PURPOSE': str,
        'ORIGINAL_LOAN_TERM': np.int_,
        'NUMBER_OF_BORROWERS': np.int_,
    }

    # Load the loan-level files.
    data_list = []
    for fname in sorted(os.listdir(data_path)):
        subject_data_path = os.path.join(data_path, fname)
        print(subject_data_path)
        if not os.path.isfile(subject_data_path):
            continue
        data_list.append(
            pd.read_csv(
                subject_data_path,
                sep='|',
                header=None,
                names=column_names,
                usecols=list(feature_dtypes),
                dtype=feature_dtypes,
                low_memory=False,
            )
        )
    data = pd.concat(data_list)

    # Load the per-period files.
    data_p_list = []
    for fname in sorted(os.listdir(data_path_time)):
        subject_data_path = os.path.join(data_path_time, fname)
        print(subject_data_path)
        if not os.path.isfile(subject_data_path):
            continue
        data_p_list.append(
            pd.read_csv(
                subject_data_path,
                sep='|',
                header=None,
                usecols=[0, 3, 4],
                # With header=None the column labels are integers, so the
                # dtype keys must be integers as well to take effect.
                # Column 4 is left to type inference.
                dtype={0: str, 3: str},
            )
        )

    # Collect every record that counts as a default.
    default_list = []
    for data_p in data_p_list:
        # astype(str) also turns missing values into the text 'nan', which the
        # isdigit() filter below then discards together with letter codes.
        status_text = data_p[3].astype(str)
        numeric_rows = data_p[status_text.str.isdigit()].copy()
        numeric_rows[3] = numeric_rows[3].astype(int)
        early_rows = numeric_rows[numeric_rows[4] < 48]
        default_list.append(early_rows[early_rows[3] > 2])
    data_default = pd.concat(default_list)

    data['default_flag'] = data['LOAN_SQ_NUMBER'].isin(data_default[0].unique())
    data = data.drop(columns=['LOAN_SQ_NUMBER'])

    # Mean imputation: sentinel codes are replaced by the mean of the other
    # values. The mean is computed once per column rather than once per row.
    for column, sentinel in (('CREDIT_SCORE', 9999),
                             ('ORIGINAL_DTI_RATIO', 999),
                             ('ORIGINAL_LTV', 999)):
        is_sentinel = data[column] == sentinel
        column_mean = data[column][~is_sentinel].mean()
        # Plain arrays avoid any index alignment (the concatenated frame keeps
        # the per-file row labels, which repeat across files).
        data[column] = np.where(is_sentinel.values, column_mean, data[column].values)

    time_end = time.time()
    print('Finished loading, time cost:', time_end - time_start, 's')
    return data

# Standardize

In [4]:
def data_standardize():
    """Build min-max scaled model inputs for 2004-2015 and write them per year.

    Reads ``data_flag/<year>_flag.csv`` for every year, replaces the sentinel
    codes of the categorical columns with NaN, min-max scales the six numeric
    columns and one-hot encodes the categorical ones. For each year the
    feature matrix is written to ``data_train/<year>/input.csv`` and the 0/1
    default flag to ``data_train/<year>/output.csv``.

    Returns:
        None.
    """
    years = [str(year) for year in range(2004, 2016)]

    # Collect the yearly frames and concatenate once; growing a frame with
    # DataFrame.append in a loop is quadratic and unavailable in pandas >= 2.
    yearly_frames = []
    for year_ in years:
        data_path = 'data_flag/{year}_flag.csv'.format(year=year_)
        # The first CSV column is skipped (presumably the row index written by
        # an earlier to_csv call -- TODO confirm).
        data_this_year = pd.read_csv(data_path).iloc[:, 1:].copy()
        data_this_year['YEAR'] = year_
        yearly_frames.append(data_this_year)
    data = pd.concat(yearly_frames, ignore_index=True)

    year_list = data['YEAR']

    # Sentinel codes become NaN so that get_dummies creates no column for them.
    sentinel_codes = (
        ('FIRST_TIME_HOMEBUYER_FLAG', '9'),
        ('NUMBER_OF_UNITS', 99),
        ('CHANNEL', 'T'),
        ('PROPERTY_TYPE', '99'),
        ('NUMBER_OF_BORROWERS', 99),
    )
    for column, sentinel in sentinel_codes:
        data[column] = data[column].replace(sentinel, np.nan)

    output_array = np.asarray(data['default_flag'].astype(int))

    numeric_columns = [
        'CREDIT_SCORE',            # 0
        'ORIGINAL_DTI_RATIO',      # 1
        'ORIGINAL_UPB',            # 2
        'ORIGINAL_LTV',            # 3
        'ORIGINAL_LOAN_TERM',      # 4
        'ORIGINAL_INTEREST_RATE',  # 5
    ]
    # The scaler is fitted on all years together, so every year shares the
    # same min/max reference.
    scaler = preprocessing.MinMaxScaler()
    input_values = scaler.fit_transform(np.asarray(data[numeric_columns]))

    categorical_columns = [
        'FIRST_TIME_HOMEBUYER_FLAG',
        'NUMBER_OF_UNITS',
        'OCCUPANCY_STATUS',
        'CHANNEL',
        'PROPERTY_TYPE',
        'LOAN_PURPOSE',
        'NUMBER_OF_BORROWERS',
    ]
    dummy_blocks = [np.asarray(pd.get_dummies(data[column]))
                    for column in categorical_columns]
    input_array = np.hstack([input_values] + dummy_blocks)

    data_stand = pd.DataFrame(input_array)
    output = pd.DataFrame(output_array)

    for year_ in years:
        print('Preparing {year} ...'.format(year=year_))
        data_path = 'data_train/{year}/'.format(year=year_)
        os.makedirs(data_path, exist_ok=True)
        # Both frames share the row order of `data`, so one positional mask
        # selects the rows of this year in each of them.
        in_year = (year_list == year_).values
        data_stand.loc[in_year].to_csv(data_path + 'input.csv')
        output.loc[in_year].to_csv(data_path + 'output.csv')
        print('{year} done!'.format(year=year_))

# Logistic Regression

In [5]:
def train_log_N_S(data, fig, ax, penalty):
    """Fit a logistic regression on unscaled features built from ``data``.

    The sentinel codes of the categorical columns are replaced by NaN
    **in place** in ``data`` (the caller's frame is modified), then the six
    numeric columns and the one-hot encoded categorical columns are stacked
    into the feature matrix. 70% of the rows (random_state=13) are used for
    fitting.

    Args:
        data: DataFrame with the loan features and a ``default_flag`` column.
        fig: Unused; kept so existing calls keep working.
        ax: Unused; kept so existing calls keep working.
        penalty: ``1`` selects the default-penalty model with ``C=0.05``; any
            other value selects the unpenalised model.

    Returns:
        The fitted ``LogisticRegression``.
    """
    time_start = time.time()

    # Sentinel codes become NaN so that get_dummies creates no column for them.
    sentinel_codes = (
        ('FIRST_TIME_HOMEBUYER_FLAG', '9'),
        ('NUMBER_OF_UNITS', 99),
        ('CHANNEL', 'T'),
        ('PROPERTY_TYPE', '99'),
        ('NUMBER_OF_BORROWERS', 99),
    )
    for column, sentinel in sentinel_codes:
        data[column] = data[column].replace(sentinel, np.nan)

    numeric_columns = [
        'CREDIT_SCORE',            # 0
        'ORIGINAL_DTI_RATIO',      # 1
        'ORIGINAL_UPB',            # 2
        'ORIGINAL_LTV',            # 3
        'ORIGINAL_LOAN_TERM',      # 4
        'ORIGINAL_INTEREST_RATE',  # 5
    ]
    categorical_columns = [
        'FIRST_TIME_HOMEBUYER_FLAG',
        'NUMBER_OF_UNITS',
        'OCCUPANCY_STATUS',
        'CHANNEL',
        'PROPERTY_TYPE',
        'LOAN_PURPOSE',
        'NUMBER_OF_BORROWERS',
    ]

    y = np.asarray(data['default_flag'].astype(int))
    X = np.c_[tuple(
        [data[numeric_columns]]
        + [np.asarray(pd.get_dummies(data[column])) for column in categorical_columns]
    )]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=13
    )

    # No scaling is applied here on purpose: this is the "not standardized"
    # experiment.
    if penalty == 1:
        classifier = LogisticRegression(
            tol=1e-6,
            C=0.05,
            max_iter=500,
            n_jobs=-1
        )
    else:
        classifier = LogisticRegression(
            penalty='none',
            tol=1e-6,
            max_iter=500,
            n_jobs=-1
        )

    classifier.fit(X_train, y_train)

    time_end = time.time()
    print('Training done, time cost:', time_end - time_start, 's')
    return classifier

In [6]:
def train_log_Y_S(X, y, fig, ax, penalty):
    """Fit a logistic regression on an already prepared feature matrix.

    Args:
        X: Feature matrix.
        y: Target vector.
        fig: Unused; kept for call compatibility.
        ax: Unused; kept for call compatibility.
        penalty: ``1`` selects the default-penalty model with ``C=0.05``; any
            other value selects the unpenalised model.

    Returns:
        The ``LogisticRegression`` fitted on the 70% training split
        (random_state=13).
    """
    started_at = time.time()

    # Only the training part of the split is needed for fitting.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.3, random_state=13)

    # Options shared by both variants; the branch adds what differs.
    lr_options = {'tol': 1e-5, 'max_iter': 500, 'n_jobs': -1}
    if penalty == 1:
        lr_options['C'] = 0.05
    else:
        lr_options['penalty'] = 'none'

    classifier = LogisticRegression(**lr_options)
    classifier.fit(X_train, y_train)

    finished_at = time.time()
    print('Training done, time cost:', finished_at - started_at, 's')
    return classifier

In [7]:
def train_log_diff(X, y, c):
    """Fit an L2-penalised logistic regression with regularisation strength ``c``.

    Args:
        X: Feature matrix.
        y: Target vector.
        c: Value passed as ``C`` to ``LogisticRegression``.

    Returns:
        The classifier fitted on the 70% training split (random_state=13).
    """
    started_at = time.time()

    # Only the training part of the split is needed for fitting.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.3, random_state=13)

    lr_options = dict(penalty='l2', tol=1e-5, C=c, max_iter=1500, n_jobs=-1)
    classifier = LogisticRegression(**lr_options)
    classifier.fit(X_train, y_train)

    finished_at = time.time()
    print('Training done, time cost:', finished_at - started_at, 's')
    return classifier

In [8]:
def train_log_cross_validation(data, ax=None):
    """Run 6-fold stratified cross-validation of a logistic regression and plot ROC curves.

    The six numeric columns and the one-hot encoded categorical columns of
    ``data`` are min-max scaled, then a ``saga`` logistic regression is fitted
    on every fold. Each fold's ROC curve, the mean ROC curve with its AUC and
    a +/- 1 standard deviation band are drawn.

    Args:
        data: DataFrame with the loan features and a ``default_flag`` column.
        ax: Matplotlib axes to draw on. A new figure is created when omitted.

    Returns:
        None.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.model_selection import StratifiedKFold

    if ax is None:
        _, ax = plt.subplots(figsize=(15, 8))

    numeric_columns = [
        'CREDIT_SCORE',            # 0
        'ORIGINAL_DTI_RATIO',      # 1
        'ORIGINAL_UPB',            # 2
        'ORIGINAL_LTV',            # 3
        'ORIGINAL_LOAN_TERM',      # 4
        'ORIGINAL_INTEREST_RATE',  # 5
    ]
    categorical_columns = [
        'FIRST_TIME_HOMEBUYER_FLAG',
        'NUMBER_OF_UNITS',
        'OCCUPANCY_STATUS',
        'CHANNEL',
        'PROPERTY_TYPE',
        'LOAN_PURPOSE',
        'NUMBER_OF_BORROWERS',
    ]

    y = np.asarray(data['default_flag'].astype(int))
    input_array = np.c_[tuple(
        [data[numeric_columns]]
        + [np.asarray(pd.get_dummies(data[column])) for column in categorical_columns]
    )]

    # NOTE(review): the scaler is fitted on all rows before the folds are
    # split, so each test fold influences the scaling -- confirm acceptable.
    X = preprocessing.MinMaxScaler().fit_transform(input_array)

    cv = StratifiedKFold(n_splits=6)
    classifier = LogisticRegression(
        solver='saga',
        max_iter=1500
    )

    tprs = []
    aucs = []
    # Common grid of 500 false-positive rates so fold curves can be averaged.
    mean_fpr = np.linspace(0, 1, 500)

    for i, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y[train])
        viz = plot_roc_curve(classifier, X[test], y[test],
                             name='ROC fold {}'.format(i),
                             alpha=0.3, lw=1, ax=ax)

        # Resample this fold's curve onto the common grid.
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    # Diagonal = performance of a random classifier.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)

    # Point-wise mean over the folds.
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    # Shaded band: spread of the fold curves around the mean, clipped to [0, 1].
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    ax.legend(loc="lower right")

# NN

In [9]:
# def train_NN_cv(data, year):
   
#     time_start=time.time()
#     #prepare config list
#     conf_list = [0.0001, 0.001, 0.01, 0.1]
    
#     #conf_list = [(2,2,),(3,3,),(5,5,),(10,10,)] #dont forget to change name!!!!!!!
#     #conf_list = [(2,),(5,),(10,),(15,)]
#     conf_name = "alpha"
#     para = dict()
    
#     #Get dummy value
    
#     data['FIRST_TIME_HOMEBUYER_FLAG'] = data['FIRST_TIME_HOMEBUYER_FLAG'].apply(lambda x : np.NaN if x == '9' else x)
#     data['NUMBER_OF_UNITS'] = data['NUMBER_OF_UNITS'].apply(lambda x : np.NaN if x == 99 else x)
#     data['CHANNEL'] = data['CHANNEL'].apply(lambda x : np.NaN if x == 'T' else x)
#     data['PROPERTY_TYPE'] = data['PROPERTY_TYPE'].apply(lambda x : np.NaN if x == '99' else x)
#     data['NUMBER_OF_BORROWERS'] = data['NUMBER_OF_BORROWERS'].apply(lambda x : np.NaN if x == 99 else x)
    
    
#     output_array = np.asarray(data['default_flag'].astype(int))
#     input_array = np.c_[
#         data[['CREDIT_SCORE',#0
#               'ORIGINAL_DTI_RATIO',#1
#               'ORIGINAL_UPB',#2
#               'ORIGINAL_LTV',#3
#               'ORIGINAL_LOAN_TERM',#4
#               'ORIGINAL_INTEREST_RATE'#5
#              ]] 
#     ]
        
#     scaler = preprocessing.StandardScaler().fit(input_array)
#     input_array_N = scaler.transform(input_array)
#     #input_array_N = preprocessing.normalize(input_array, norm='l2')
#     X = np.c_[
#         input_array_N,
#         np.asarray(pd.get_dummies(data['FIRST_TIME_HOMEBUYER_FLAG'])), # N,Y,9 str remove 9
#         np.asarray(pd.get_dummies(data['NUMBER_OF_UNITS'])), # 1 2 3 4 99 int remove 99
#         np.asarray(pd.get_dummies(data['OCCUPANCY_STATUS'])), # P S I str
#         np.asarray(pd.get_dummies(data['CHANNEL'])), # T R C B str remove T
#         np.asarray(pd.get_dummies(data['PROPERTY_TYPE'])), # SF PU CO MH CP 99 str remove 99
#         np.asarray(pd.get_dummies(data['LOAN_PURPOSE'])), # P N C str
#         np.asarray(pd.get_dummies(data['NUMBER_OF_BORROWERS'])) # 2 1 99 int remove 99    
#     ]
#     y = output_array
    
#     #devide Flods for cv
#     cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=10)
    
    
#     #tpr lists and auc value list
    
#     auc_all_config = []
#     mean_aucs = []
    
    
    
#     #For ploting, prepare 500 points from 0-1
#     mean_fpr = np.linspace(0, 1, 500)
      
#     #Loop training for every fold
#     for conf in conf_list:
#         print("Training "+str(conf)+"...")
#         tprs = []
#         aucs = []
#         fig, ax = plt.subplots(figsize=(15, 8)) 

#         for i, (train, test) in enumerate(cv.split(X, y)):
# ##############################################################################################################################      
#             classifier = MLPClassifier(
#                 hidden_layer_sizes = (10,),
#                 alpha = conf,
#                 learning_rate_init = 0.001,
#                 tol = 1e-05,
#                 verbose = True,
#                 n_iter_no_change = 10
#             )
#             para = classifier.get_params(deep=True)
# ##############################################################################################################################        
#             print("Training fold "+str(i)+"...")
#             classifier.fit(X[train], y[train])
#             # put the curve in ax through 'ax = ax'
#             viz = plot_roc_curve(classifier, X[test], y[test],
#                                  name='ROC fold {}'.format(i),
#                                  alpha=0.3, lw=1, ax=ax)

#             #Plot every point (500) form 0-1, similiar to bin
#             interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
#             interp_tpr[0] = 0.0
#             #Buff the result 
#             tprs.append(interp_tpr)
#             aucs.append(viz.roc_auc)
           
#         #store the auc value for this conf
#         std = np.std(aucs)
#         aucs.append(std)
#         auc_all_config.append(aucs)
#         #Plot chance
#         ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
#                 label='Chance', alpha=.8)

#         #mean value for each colomn
#         mean_tpr = np.mean(tprs, axis=0)
#         mean_tpr[-1] = 1.0
#         mean_auc = auc(mean_fpr, mean_tpr)
       
#         #store all mean_auc together
#         mean_aucs.append(mean_auc)

#         std_auc = np.std(aucs)
#         #Plot mean
#         ax.plot(mean_fpr, mean_tpr, color='b',
#                 label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
#                 lw=2, alpha=.8)

#         #Configure the diagram 
#         ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
#                title="Receiver operating characteristic example")
#         ax.legend(loc="lower right")
#         file_name = str(conf)+conf_name+str(year)+'.jpg'
#         folder = os.getcwd() + '/NN_cv/'+year+'/'+conf_name
#         if not os.path.exists(folder):
#             os.makedirs(folder)
#         fig.savefig('./NN_cv/'+year+'/'+conf_name+'/'+file_name)
        
#     txt_head = conf_name+'/'.join(str(x) for x in conf_list)
#     file_name_all = './NN_cv/'+year+'/'+conf_name+'/'+str(year)+'all.txt'
#     file_name_mean = './NN_cv/'+year+'/'+conf_name+'/'+str(year)+'mean.txt'
#     file_name_para = './NN_cv/'+year+'/'+conf_name+'/'+'other_para.json'
    
#     folder = os.getcwd() + '/NN_cv/'+year+'/'+conf_name

#     if not os.path.exists(folder):
#         os.makedirs(folder)
        
#     np.savetxt(file_name_all,auc_all_config,fmt='%.7f',delimiter=',', header=txt_head)
#     np.savetxt(file_name_mean,mean_aucs,fmt='%.7f',delimiter=',', header=txt_head)
#     with open(file_name_para, 'w') as fp:
#         json.dump(para, fp, indent=4)

In [10]:
def train_NN_cv(X, y):
    """Grid-search MLP hyper-parameters by ROC AUC and print a report.

    The data is split 70/30 (stratified, random_state=13). The grid search
    runs on the 70% development part; the classification report is computed
    on the held-out 30%.

    Args:
        X: Feature matrix.
        y: Target vector.

    Returns:
        None; all results are printed.
    """
    X_dev, X_eval, y_dev, y_eval = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=13)

    # Candidate values explored by the search.
    param_grid = [{
        'hidden_layer_sizes': [(5,), (10,), (15,), (5, 5)],
        'activation': ['logistic', 'tanh', 'relu'],
        'alpha': [0.1, 0.01, 0.001],
        'learning_rate_init': [0.01, 0.001, 0.0001],
    }]

    print("# Tuning hyper-parameters for AUC")

    search = GridSearchCV(MLPClassifier(), param_grid,
                          scoring='roc_auc', n_jobs=7, verbose=1)
    search.fit(X_dev, y_dev)

    print("Best parameters set found on development set:", end="\n\n")
    print(search.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")
    results = search.cv_results_
    for mean, std, params in zip(results['mean_test_score'],
                                 results['std_test_score'],
                                 results['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:", end="\n\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.", end="\n\n")
    print(classification_report(y_eval, search.predict(X_eval)), end="\n\n")

In [11]:
def train_NN_diff(X, y, conf):
    """Fit an MLP classifier whose hidden layer layout is given by ``conf``.

    Args:
        X: Feature matrix.
        y: Target vector.
        conf: Value passed as ``hidden_layer_sizes``.

    Returns:
        The ``MLPClassifier`` fitted on the stratified 70% training split
        (random_state=13).
    """
    started_at = time.time()

    # Only the training part of the split is needed for fitting.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=13)

    mlp_options = dict(
        hidden_layer_sizes=conf,
        alpha=0.001,
        learning_rate_init=0.001,
        tol=1e-05,
        verbose=True,
        n_iter_no_change=10,
    )
    classifier = MLPClassifier(**mlp_options)
    classifier.fit(X_train, y_train)

    finished_at = time.time()
    print('Training done, time cost:', finished_at - started_at, 's')

    return classifier

In [12]:
def train_NN(X, y, year):
    """Fit the selected MLP configuration and save its parameters and plots.

    The data is split 70/30 (stratified, random_state=13). The following
    files are written to ``<cwd>/NN/``:

    * ``<year>para.json`` -- the classifier's parameters,
    * ``AUC_<year>_NN.jpg`` -- ROC curves on the test and training split,
    * ``train_curve_<year>_NN.jpg`` -- the training loss curve.

    Args:
        X: Feature matrix.
        y: Target vector.
        year: Label used in the output file names.

    Returns:
        The fitted ``MLPClassifier``.
    """
    # Configuration in use:
    # {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.001}
    time_start = time.time()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        stratify=y,
        random_state=13
    )

    fig, ax = plt.subplots(figsize=(15, 8))

    classifier = MLPClassifier(
        hidden_layer_sizes=(5,),
        activation='tanh',
        alpha=0.001,
        learning_rate_init=0.001,
        tol=1e-04,
        verbose=True
    )

    folder = os.getcwd() + '/NN/'
    os.makedirs(folder, exist_ok=True)

    # Record the parameters before training so they exist even if the fit is
    # interrupted.
    file_name_para = folder + str(year) + 'para.json'
    with open(file_name_para, 'w') as fp:
        json.dump(classifier.get_params(deep=True), fp, indent=4)

    classifier.fit(X_train, y_train)

    plot_roc_curve(
        classifier,
        X_test,
        y_test,
        name='Test ROC',
        alpha=0.5, lw=1, ax=ax
    )
    plot_roc_curve(
        classifier,
        X_train,
        y_train,
        name='Train ROC',
        alpha=0.5, lw=1, ax=ax
    )

    # Diagonal = performance of a random classifier.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    ax.legend(loc="lower right")
    time_end = time.time()
    print('Training done, time cost:', time_end - time_start, 's')

    # The file names use the `year` argument; the figures are saved before
    # being shown so the saved image never depends on the backend's handling
    # of shown figures.
    fig.savefig(folder + 'AUC_{year}_NN.jpg'.format(year=year))
    plt.show()

    fig_loss, ax_loss = plt.subplots(figsize=(15, 8))
    loss_curve = classifier.loss_curve_
    ax_loss.plot(np.arange(1, len(loss_curve) + 1), loss_curve)
    fig_loss.savefig(folder + 'train_curve_{year}_NN.jpg'.format(year=year))
    plt.show()
    return classifier

# Random Forest

In [13]:
def train_random_forest_cv(X, y):
    """Grid-search random forest hyper-parameters by ROC AUC and print a report.

    The data is split 70/30 (stratified, random_state=13). The grid search
    runs on the 70% development part; the classification report is computed
    on the held-out 30%.

    Args:
        X: Feature matrix.
        y: Target vector.

    Returns:
        None; all results are printed.
    """
    X_dev, X_eval, y_dev, y_eval = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=13)

    # Candidate values explored by the search.
    param_grid = [{
        'n_estimators': [800, 1200],
        'max_depth': [10, 12, 14],
        'min_samples_split': [2, 4],
    }]

    print("# Tuning hyper-parameters for AUC")

    search = GridSearchCV(RandomForestClassifier(), param_grid,
                          scoring='roc_auc', n_jobs=7, verbose=3)
    search.fit(X_dev, y_dev)

    print("Best parameters set found on development set:", end="\n\n")
    print(search.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")
    results = search.cv_results_
    for mean, std, params in zip(results['mean_test_score'],
                                 results['std_test_score'],
                                 results['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:", end="\n\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.", end="\n\n")
    print(classification_report(y_eval, search.predict(X_eval)), end="\n\n")


In [14]:
#{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 800}
def train_random_forest(X, y):
    """Fit a random forest with the fixed configuration (800 trees, depth 10).

    Args:
        X: Feature matrix.
        y: Target vector.

    Returns:
        The ``RandomForestClassifier`` fitted on the stratified 70% training
        split (random_state=13).
    """
    # Only the training part of the 70/30 split is needed for fitting.
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=13)
    print("# Tuning hyper-parameters for AUC")

    forest_options = dict(
        n_estimators=800,
        max_depth=10,
        min_samples_split=2,
        n_jobs=6,
        verbose=2,
    )
    clf = RandomForestClassifier(**forest_options)
    clf.fit(X_train, y_train)

    return clf


In [15]:
#{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 800}
def train_RF_diff(X, y, conf, random_state=13):
    """Fit a random forest whose number of trees is given by ``conf``.

    The data is divided with a stratified 70/30 split and the classifier is
    fitted on the 70% portion only; the held-out 30% is not used here.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded to ``train_test_split``.
    y : array-like
        Class labels; also used for stratifying the split.
    conf : int
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    random_state : int or None, default 13
        Seed handed to ``RandomForestClassifier`` so that runs with different
        ``conf`` values differ only in the number of trees, not in random
        noise. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    # The split seed stays hard-coded so the same partition can be regenerated
    # regardless of the seed chosen for the forest itself.
    X_train, _, y_train, _ = train_test_split(
        X, y,
        test_size=0.3,
        stratify=y,
        random_state=13
    )

    clf = RandomForestClassifier(
        n_estimators=conf,
        max_depth=10,
        min_samples_split=2,
        n_jobs=7,
        random_state=random_state,
        verbose=2
    )
    clf.fit(X_train, y_train)

    return clf


# Main Function

In [16]:
def _read_csv_drop_first_column(csv_path):
    """Read ``csv_path`` once and return every column except the first.

    This replaces the former "read the header, then read again with
    ``usecols=cols[1:]``" idiom, which parsed each file twice to obtain the
    same frame.
    """
    return pd.read_csv(csv_path).iloc[:, 1:]


def _load_training_arrays(x_csv, y_csv):
    """Return ``(X, y)`` for one year's training CSV files.

    ``X`` is every column of ``x_csv`` but the first, as a NumPy array.
    ``y`` is the ``'0'`` column of ``y_csv`` cast to ``int``, as a NumPy array.
    """
    X = _read_csv_drop_first_column(x_csv).to_numpy()
    y = np.asarray(_read_csv_drop_first_column(y_csv)['0'].astype(int))
    return X, y


def _save_model(model, model_path):
    """Dump ``model`` with joblib, creating the parent folder when missing."""
    folder = os.path.dirname(model_path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    dump(model, model_path)


if __name__ == '__main__':

    year_list = ['2004','2005','2006','2007','2008','2009','2010','2011','2012','2013','2014','2015']
    # Despite its name this list holds 2007-2010; it is the one iterated by
    # the "train_NN_cv" and "train_RF_cv" modes below.
    year_list_2004 = ['2007','2008','2009','2010']
    # Values iterated by the "train_LR_C" mode.
    c_list = [0.05, 0.5, 1.0, 5.0, 10.0]

    # Values iterated by the "train_NN_alpha" and "train_RF_conf" modes.
    conf_list = [5, 50, 100, 800]

    # Logistic regression: (N)ot / (Y)es standardized x (N)o / (Y)es penalty.
    LR_model_save_path_NS_NP = './LR_experiments/N_Penalty/N_Standardized/models/{year}.joblib'
    LR_model_save_path_NS_YP = './LR_experiments/Penalty/N_Standardized/models/{year}.joblib'

    LR_model_save_path_YS_NP = './LR_experiments/N_Penalty/Standardized/models/{year}.joblib'
    LR_model_save_path_YS_YP = './LR_experiments/Penalty/Standardized/models/{year}.joblib'

    NN_model_save_path = './NN_models_0.01/models/{year}.joblib'

    RF_model_save_path = './RF_models/models/{year}.joblib'

    NN_Alpha_model_save_path = './NN_experiments/deep/{a}/models/{year}.joblib'
    LR_C_model_save_path = './LR_experiments/Penalty_CV/{C}/models/{year}.joblib'
    RF_Exp_model_save_path = './RF_experiments/Tree/{conf}/models/{year}.joblib'

    X_path = 'data_train/{year}/input.csv'
    y_path = 'data_train/{year}/output.csv'

    # Command: the modes listed here are executed in order.
    mode_list = ["train_NN_cv"]


    for mode in mode_list:
        if mode == "data_process":
            # to_csv does not create folders, so make sure the target exists.
            os.makedirs('data_flag', exist_ok=True)
            for year_ in year_list:
                data_path = 'data/{year}/data'.format(year=year_)
                data_path_time = 'data/{year}/data_time'.format(year=year_)
                data = load_data(data_path, data_path_time)
                data.to_csv('data_flag/{}_flag.csv'.format(year_))

        elif mode == "train_NN_cv":
            for year_ in year_list_2004:
                print("Start year {}...".format(year_))
                X, y = _load_training_arrays(X_path.format(year=year_),
                                             y_path.format(year=year_))

                train_NN_cv(X, y)

        elif mode == "train_LR_N_S":  # Train LR without standardize
            for year_ in year_list:
                penalty_list = [0, 1]
                for penalty in penalty_list:
                    print("Loading data...")
                    data_path = 'data_flag/{year}_flag.csv'.format(year=year_)
                    # NOTE(review): the frame is reloaded for every penalty on
                    # purpose - train_log_N_S receives the DataFrame itself and
                    # is not visible here, so sharing one frame between calls
                    # is not assumed to be safe.
                    data = _read_csv_drop_first_column(data_path)

                    fig, ax = plt.subplots(figsize=(15, 8))
                    print(str(year_)+" training start...")

                    model = train_log_N_S(data, fig, ax, penalty)
                    if penalty == 1:
                        _save_model(model, LR_model_save_path_NS_YP.format(year=year_))
                    else:
                        _save_model(model, LR_model_save_path_NS_NP.format(year=year_))

        elif mode == "train_LR_S":  # Train LR using data after standardizing
            for year_ in year_list:
                print("Loading data...")

                X, y = _load_training_arrays(X_path.format(year=year_),
                                             y_path.format(year=year_))

                # One figure per year, handed to both penalty settings.
                fig, ax = plt.subplots(figsize=(15, 8))
                print(str(year_)+" training start...")
                penalty_list = [0, 1]
                for penalty in penalty_list:
                    model = train_log_Y_S(X, y, fig, ax, penalty)
                    if penalty == 0:
                        _save_model(model, LR_model_save_path_YS_NP.format(year=year_))
                    else:
                        _save_model(model, LR_model_save_path_YS_YP.format(year=year_))

        elif mode == "train_NN":
            for year_ in year_list:
                print("Loading data...")

                X, y = _load_training_arrays(X_path.format(year=year_),
                                             y_path.format(year=year_))

                print(str(year_)+" training start...")
                model = train_NN(X, y, year_)

                # Previously dumped without creating the folder, which fails
                # when './NN_models_0.01/models/' does not exist yet.
                _save_model(model, NN_model_save_path.format(year=year_))

        elif mode == "train_RF":
            for year_ in year_list:
                print("{} Loading data...".format(year_))

                X, y = _load_training_arrays(X_path.format(year=year_),
                                             y_path.format(year=year_))

                model = train_random_forest(X, y)
                # Previously dumped without creating the folder, which fails
                # when './RF_models/models/' does not exist yet.
                _save_model(model, RF_model_save_path.format(year=year_))

        elif mode == "train_RF_cv":
            for year_ in year_list_2004:
                print("Loading data...")

                X, y = _load_training_arrays(X_path.format(year=year_),
                                             y_path.format(year=year_))

                train_random_forest_cv(X, y)

        elif mode == "train_LR_C":
            for year_ in year_list:
                for c in c_list:
                    print("C="+str(c)+"Loading data...")
                    # NOTE(review): still loaded once per C because
                    # train_log_diff is not visible here; hoist above this loop
                    # once it is confirmed not to modify X / y in place.
                    X, y = _load_training_arrays(X_path.format(year=year_),
                                                 y_path.format(year=year_))
                    print(str(year_)+" training start...")
                    model = train_log_diff(X, y, c)
                    _save_model(model, LR_C_model_save_path.format(C=c, year=year_))

        elif mode == "train_NN_alpha":
            for year_ in year_list:
                for conf in conf_list:
                    print("Conf = "+str(conf)+" Loading data...")

                    # NOTE(review): still loaded once per conf because
                    # train_NN_diff is not visible here; hoist above this loop
                    # once it is confirmed not to modify X / y in place.
                    X, y = _load_training_arrays(X_path.format(year=year_),
                                                 y_path.format(year=year_))

                    print(str(year_)+" training start...")
                    model = train_NN_diff(X, y, conf)
                    _save_model(model, NN_Alpha_model_save_path.format(a=conf, year=year_))

        elif mode == "train_RF_conf":
            for year_ in year_list:
                # The year's data does not depend on conf and train_RF_diff
                # only reads it, so it is loaded once per year instead of once
                # per (year, conf) pair. The progress messages are unchanged.
                X, y = _load_training_arrays(X_path.format(year=year_),
                                             y_path.format(year=year_))
                for conf in conf_list:
                    print("Conf = "+str(conf)+" Loading data...")

                    print(str(year_)+" training start...")
                    model = train_RF_diff(X, y, conf)
                    _save_model(model, RF_Exp_model_save_path.format(conf=conf, year=year_))

        elif mode == "Standard":
            data_standardize()

        else:
            # A misspelled mode used to be skipped without any feedback.
            print("Unknown mode {!r} - skipped".format(mode))
         

Start year 2007...
# Tuning hyper-parameters for AUC
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  6.2min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 28.2min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 58.7min
[Parallel(n_jobs=7)]: Done 540 out of 540 | elapsed: 75.4min finished


Best parameters set found on development set:

{'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (15,), 'learning_rate_init': 0.001}

Grid scores on development set:

0.788 (+/-0.002) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.789 (+/-0.004) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.001}
0.787 (+/-0.004) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.0001}
0.787 (+/-0.004) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.789 (+/-0.004) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.001}
0.788 (+/-0.003) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.0001}
0.788 (+/-0.002) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (15,), 'learning_rate_

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  4.8min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 26.1min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 55.2min
[Parallel(n_jobs=7)]: Done 540 out of 540 | elapsed: 70.6min finished


Best parameters set found on development set:

{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (15,), 'learning_rate_init': 0.001}

Grid scores on development set:

0.820 (+/-0.003) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.822 (+/-0.003) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.001}
0.819 (+/-0.004) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.0001}
0.820 (+/-0.005) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.821 (+/-0.002) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.001}
0.820 (+/-0.003) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.0001}
0.820 (+/-0.003) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (15,), 'learning_rate

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  4.7min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 32.2min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 75.5min
[Parallel(n_jobs=7)]: Done 540 out of 540 | elapsed: 99.5min finished


Best parameters set found on development set:

{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.001}

Grid scores on development set:

0.727 (+/-0.015) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.733 (+/-0.010) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.001}
0.740 (+/-0.007) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.0001}
0.723 (+/-0.008) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.734 (+/-0.009) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.001}
0.738 (+/-0.012) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.0001}
0.723 (+/-0.012) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (15,), 'learning_rate

[Parallel(n_jobs=7)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  3.8min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 23.2min
[Parallel(n_jobs=7)]: Done 436 tasks      | elapsed: 54.6min
[Parallel(n_jobs=7)]: Done 540 out of 540 | elapsed: 74.7min finished


Best parameters set found on development set:

{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.001}

Grid scores on development set:

0.721 (+/-0.006) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.01}
0.725 (+/-0.006) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.001}
0.730 (+/-0.006) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (5,), 'learning_rate_init': 0.0001}
0.720 (+/-0.011) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01}
0.725 (+/-0.006) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.001}
0.731 (+/-0.006) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.0001}
0.718 (+/-0.016) for {'activation': 'logistic', 'alpha': 0.1, 'hidden_layer_sizes': (15,), 'learning_rate_

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.99      1.00      1.00    378795
           1       0.00      0.00      0.00      2705

    accuracy                           0.99    381500
   macro avg       0.50      0.50      0.50    381500
weighted avg       0.99      0.99      0.99    381500


