In [None]:

from outputmethods import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, precision_score,roc_auc_score

#pmml wrapper
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml


In [None]:
#pre-defined functions - a collection of code snippets that I use frequently during model builds.
#This code has been shared by colleagues, friends, and Stack Overflow.

def CreateDecileAnalysis(d, numBins):
    """Summarise scored records by rank-based probability bins.

    Parameters
    ----------
    d : pandas.DataFrame
        Must contain the columns ``PROB`` (binned and summarised) and
        ``TARGET`` (summed per bin). Other columns, e.g. ``PRED``, are
        ignored. The caller's frame is not modified (a copy is used).
    numBins : int
        Number of quantile bins requested from ``pd.qcut`` (10 gives deciles).

    Returns
    -------
    pandas.DataFrame
        One row per bin, highest bin first, with the columns
        ``BINS`` (as strings), ``PROB_AVG``, ``TARGET``, ``RECS``,
        ``PROB_MIN`` and ``PROB_MAX``.

    Raises
    ------
    KeyError
        If ``PROB`` or ``TARGET`` is missing from ``d``.
    """
    missing = [name for name in ('PROB', 'TARGET') if name not in d.columns]
    if missing:
        raise KeyError('CreateDecileAnalysis requires column(s): ' + ', '.join(missing))

    binned = d.copy()
    binned['PROB_MIN'] = binned['PROB']
    binned['PROB_MAX'] = binned['PROB']
    binned = binned.rename(columns={'PROB': 'PROB_AVG'})
    # Constant 1 per row so that summing it yields the record count per bin.
    binned['RECS'] = 1
    # Bin on the rank rather than the raw probability: rank(method='first')
    # gives every row a distinct value, so tied probabilities still split
    # across bins.
    binned['BINS'] = pd.qcut(binned['PROB_MIN'].rank(method='first'), numBins,
                             labels=False, duplicates='drop')

    # String aggregator names instead of np.min / np.mean / np.sum: passing the
    # NumPy callables to agg() is deprecated in recent pandas releases.
    summary = binned.groupby('BINS', as_index=False).agg(
        {'PROB_MIN': 'min', 'PROB_AVG': 'mean', 'PROB_MAX': 'max',
         'TARGET': 'sum', 'RECS': 'sum'})

    prob_columns = ['PROB_MIN', 'PROB_AVG', 'PROB_MAX']
    summary[prob_columns] = summary[prob_columns].round(15)

    summary = summary.reindex(columns=['BINS', 'PROB_AVG',
                                       'TARGET',
                                       'RECS', 'PROB_MIN', 'PROB_MAX'])
    summary = summary.sort_values('BINS', ascending=False)
    summary['BINS'] = summary['BINS'].astype(str)
    # Output: Deciles
    return summary

def CreateROCcurve(results):
    """Draw and show the ROC curve for a frame of scored records.

    ``results`` is indexed by the keys ``'TARGET'`` and ``'PROB'``; both are
    handed to ``roc_curve`` as the true labels and the scores respectively.
    The curve is drawn on matplotlib figure number 2 together with a dashed
    red diagonal, and the area under the curve appears in the legend.
    Nothing is returned.
    """
    fpr, tpr, _ = roc_curve(results['TARGET'], results['PROB'])
    area = auc(fpr, tpr)

    fig = plt.figure(2, figsize=(10, 10))
    ax = fig.gca()
    ax.set_title('Receiver Operating Characteristic')
    ax.plot(fpr, tpr, 'b', label='AUC = %0.2f' % area)
    ax.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_ylabel('True Positive Rate')
    ax.set_xlabel('False Positive Rate')
    plt.show()

def read_data(path):
    """Load the CSV file at ``path`` into a pandas DataFrame.

    The file is decoded as ISO-8859-1; all other ``pd.read_csv`` options
    keep their defaults.
    """
    return pd.read_csv(path, encoding='iso-8859-1')

def get_headers(dataset):
    """Return the column labels of ``dataset`` (its ``columns.values`` array)."""
    column_index = dataset.columns
    return column_index.values

#grid search for hyperparameters
def grid_search_hyperparameter():
    """Build the random-forest hyper-parameter search space.

    Returns a new dict on every call, keyed by ``RandomForestClassifier``
    argument name, where each value is the list of candidates to sample.
    """
    # Tree counts: 55 evenly spaced points from 31 to 200, truncated to ints.
    tree_counts = list(map(int, np.linspace(start=31, stop=200, num=55)))
    # Depth limits: 20 evenly spaced points from 15 to 45 (truncated to ints),
    # followed by None as a final candidate.
    depth_options = list(map(int, np.linspace(15, 45, num=20)))
    depth_options.append(None)

    return {
        'n_estimators': tree_counts,
        # Number of features to consider at every split
        'max_features': ['sqrt', 'log2'],
        'max_depth': depth_options,
        # Minimum number of samples required to split a node
        'min_samples_split': [2, 3, 5],
        # Minimum number of samples required at each leaf node
        'min_samples_leaf': [10, 20, 30, 40, 50, 100, 150],
        # Method of selecting samples for training each tree
        'bootstrap': [True],
    }
    
# Two separate search-space dicts: each call returns a freshly built dict,
# so changing one does not affect the other.
para_grid, random_grid = grid_search_hyperparameter(), grid_search_hyperparameter()

def handle_null_values(data=None):
    """Split a frame into target and features, replacing ``'$null$'`` markers.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame containing a ``TARGET`` column. When omitted, the module-level
        ``model_data`` is used, which keeps the original zero-argument call
        working (and still raises ``NameError`` if that global is not defined).

    Returns
    -------
    (target, features)
        ``target`` is the ``TARGET`` column with ``'$null$'`` replaced by 0;
        ``features`` is every other column with ``'$null$'`` replaced by -1.
        The input frame itself is not modified.
    """
    if data is None:
        # Backward-compatible fallback to the global the original relied on.
        data = model_data
    target = data['TARGET'].replace('$null$', 0)
    features = data.drop('TARGET', axis=1).replace('$null$', -1)
    return target, features

In [None]:
import os

# Folder holding train.csv and test.csv. Set the MODEL_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the
# original hard-coded location is used, so the existing setup keeps working.
DATA_DIR = os.environ.get('MODEL_DATA_DIR', 'C:/Users/SamarDeen/Desktop/Data')
INPUT_PATH = f'{DATA_DIR}/train.csv'
INPUT_PATH_TEST = f'{DATA_DIR}/test.csv'

In [None]:
# Load the training and test CSVs with the shared reader (ISO-8859-1 decoding).
train, test = read_data(INPUT_PATH), read_data(INPUT_PATH_TEST)

In [None]:
#view all columns
# One column label of the training frame per output line.
for column_name in get_headers(train):
    print(column_name)

In [None]:
#subset features
# One shared column list so the train and test feature frames cannot drift apart.
FEATURE_COLUMNS = ['A', 'B', 'C']
train_features = train[FEATURE_COLUMNS]
# The original read from `m2n_test`, a name this notebook never defines
# (NameError on a fresh kernel); `test` is the frame loaded from INPUT_PATH_TEST.
test_features = test[FEATURE_COLUMNS]

In [None]:
# Pull the label column out of each split.
train_target, test_target = train['TARGET'], test['TARGET']
train_target.head()
test_target.head()
# Only this final expression is rendered as the cell's output.
test_target.describe()

In [None]:
#get statistics
# Raise pandas' displayed-column limit to 1000 before showing the summary.
pd.options.display.max_columns = 1000
train_features.describe()

In [None]:
# Define the metrics for model selection: recall is the only scorer, and it
# is also the metric used to refit the best estimator.
scorers = {'recall_score': make_scorer(recall_score)}
refit_score = 'recall_score'
# First create the base model to tune
rf = RandomForestClassifier(random_state = 42,criterion = 'gini')
# Random search of parameters: 10 sampled combinations (n_iter), 3-fold cross
# validation, on a single core (n_jobs = 1). Raise n_iter / set n_jobs = -1
# for a wider or faster search.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3,
                               verbose=2, random_state=42, n_jobs = 1, scoring = scorers,
                               refit = refit_score, return_train_score = True)
# Fit the random search model on the training split built earlier in the
# notebook. The original fitted on `train_features_new` / `train_target_new`,
# which are never defined here and fail on a fresh kernel.
# NOTE(review): if those names came from a removed resampling step, restore
# that step rather than tuning on the raw split.
rf_random.fit(train_features, train_target)
# Hyperparameters and diagnostics for the best performing random forest
rf_random.best_params_
rf_random.cv_results_
rf_random.best_score_
rf_random.scorer_

In [None]:
# Final classifier with fixed hyper-parameters, fitted on the training split.
# (The variable is called `regressor`, but it holds a RandomForestClassifier.)
regressor = RandomForestClassifier(
    n_estimators=90,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features=0.13,
    max_depth=12,
    bootstrap=True,
    criterion='entropy',
    random_state=42,
).fit(train_features, train_target)
# Probability threshold: records scoring at or above it are predicted as 1.
cutoff = 0.001628

In [None]:
# Generate confusion matrix for training set
pro_t = regressor.predict_proba(train_features)
# Columns are labelled '0' and '1'.
# NOTE(review): this presumes the classifier's classes are ordered [0, 1] - confirm.
proba_t = pd.DataFrame(pro_t, columns=['0', '1'])
# Predict 1 when the '1' probability reaches the cutoff, otherwise 0.
proba_t['prediction'] = np.where(proba_t['1'] >= cutoff, 1, 0)
conf_train = confusion_matrix(train_target, proba_t['prediction'])
# Cells are read in the order [0,0], [0,1], [1,0], [1,1].
TN, FP, FN, TP = (conf_train[row, col] for row in (0, 1) for col in (0, 1))

print(TN , FP , TP , FN)
print('True Negative Rate: ', (TN / (TN + FP)).round(3))
print('True Positive Rate: ', (TP / (TP + FN)).round(3))

In [None]:
#subset and choose columns
# Labels for the rows that were scored into proba_t. The original read them
# from `m2n_train`, which this notebook never defines; `train` is the frame
# loaded from INPUT_PATH. The index is reset because proba_t carries a default
# 0..n-1 index and pd.concat(axis=1) aligns rows by index label.
mod_data = train[['TARGET']].reset_index(drop=True)
# Rename to the column names CreateDecileAnalysis / CreateROCcurve read.
proba_t = proba_t.rename(columns={'1': 'PROB', 'prediction': 'PRED'})
d = pd.concat([proba_t, mod_data], axis=1)
d.head()

In [None]:
# Ten-bin (decile) summary of the training-set scores.
CreateDecileAnalysis(d, numBins=10)

In [None]:
# Area under Curve - ROC Curve
# Computed from the TARGET labels and PROB scores held in `d`.
lm_auc = roc_auc_score(y_true=d['TARGET'], y_score=d['PROB'])
print(f'\nAUC: {lm_auc!s}')

In [None]:
# ROC plot for the scores held in `d`.
CreateROCcurve(results=d)

In [None]:
# Column names of the training features, in frame order.
feature_list = [*train_features.columns]
# Get numerical feature importance
importances = [*regressor.feature_importances_]
# (variable, importance) tuples, most important first
feature_importances = sorted(
    zip(feature_list, importances),
    key=lambda item: item[1],
    reverse=True,
)
#Print out the feature and importances
for ranked_pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*ranked_pair))

In [None]:
# Generate confusion matrix for test set.
# The original first called regressor.predict(test_features) and discarded the
# result; that extra pass over the test set is removed, since predictions are
# derived from the probabilities and the cutoff below.
pro = regressor.predict_proba(test_features)
# NOTE(review): the '0'/'1' labels presume the classifier's classes are
# ordered [0, 1] - confirm.
proba = pd.DataFrame(pro, columns = ['0','1'])
# Predict 1 when the '1' probability reaches the cutoff, otherwise 0.
proba['prediction'] = np.where(proba['1'] >= cutoff, 1, 0)
conf = confusion_matrix(test_target,proba['prediction'])
TN = conf[0,0]
FP = conf[0,1]
FN = conf[1,0]
TP = conf[1,1]
print(TN , FP , TP , FN)
print('True Negative Rate: ', (TN/(TN+FP)).round(3))
print('True Positive Rate: ', (TP/(TP+FN)).round(3))

In [None]:
#merge predicted probability of test data with Target
# Rename to the column names CreateDecileAnalysis reads.
proba = proba.rename(columns={'1': 'PROB', 'prediction': 'PRED'})
# The original took the labels from `m2n_test`, which this notebook never
# defines; `test` is the frame loaded from INPUT_PATH_TEST. The index is reset
# because proba carries a default 0..n-1 index and pd.concat(axis=1) aligns
# rows by index label.
test_data = test[['TARGET']].reset_index(drop=True)
d_test = pd.concat([proba, test_data], axis=1)
CreateDecileAnalysis(d_test, 10)

In [None]:
#Save data
# The original passed an empty string as the path, which cannot be opened as
# a file. Write to an explicit file in the working directory instead.
# TODO(review): point OUTPUT_PATH at the intended destination.
OUTPUT_PATH = 'test_predictions.csv'
d_test.to_csv(OUTPUT_PATH)

In [None]:
# Wrap the classifier in a PMML pipeline and export it to Model.pmml.
pipeline = PMMLPipeline([("classifier", regressor)])
# Fit on the same feature/label split the classifier was trained on. The
# original passed the whole `train` frame (which still contains the TARGET
# column) together with `target`, a name this notebook never defines.
pipeline.fit(train_features, train_target)
sklearn2pmml(pipeline, "Model.pmml", with_repr = True)