# ***Team Modelling with SMOTE*** 

In [1]:
# Import modules

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix, make_scorer, matthews_corrcoef, f1_score, classification_report, fbeta_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import Error_analysis
from imblearn.over_sampling import SMOTE

RSEED= 42




In [2]:
def combine_dfs(X_test_unscaled, y_test, y_pred):
    test_comb = X_test_unscaled.copy()
    test_comb["y_true"] = y_test
    test_comb["y_pred"] = y_pred
    return test_comb

def get_classification_dfs(X_test_unscaled, y_test, y_pred):
    '''Combines the dataframes and returns all TN, FP, FN and TP'''
    test_comb = combine_dfs(X_test_unscaled, y_test, y_pred)
    false_positive = test_comb[test_comb.y_true < test_comb.y_pred]
    false_negative = test_comb[test_comb.y_true > test_comb.y_pred]
    correct = test_comb[test_comb.y_true == test_comb.y_pred]
    true_positive = correct[correct.y_pred == 1]
    true_negative = correct[correct.y_pred == 0]
    return true_negative, false_positive, false_negative, true_positive

def calculate_cost(X_test_unscaled, y_test, y_pred):
    '''Calculates the money saved by the model'''
    true_negative, false_positive, false_negative, true_positive = get_classification_dfs(X_test_unscaled, y_test, y_pred)
    reimbursements = false_negative[false_negative.SignAmount == 1].ModAmount.sum()
    avoided_reimbursements = true_positive[true_positive.SignAmount == 1].ModAmount.sum()
    print(f"You detected {true_positive.shape[0]} fraudulent transactions")
    print(f"You saved the company {round((avoided_reimbursements)/1e6, 2)} Million UGX")
    print(f"You missed {false_negative.shape[0]} fraudulent transactions")
    print(f"The company now has to reimburse frauds with a total of {round((reimbursements)/1e6, 2)} Million UGX")
    print(f"Total money saved is {round((avoided_reimbursements - reimbursements)/1e6, 2)} Million UGX")
    print(f"You incorrectly flagged {false_positive.shape[0]} legit transactions as fraudulent")

In [3]:
# Import Dataset

df= pd.read_csv('data/training_preprocessed.csv')

In [4]:

y= df.FraudResult
df.drop("FraudResult", axis=1, inplace=True)
X= df
X.drop(["TransactionId"], axis=1, inplace=True)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RSEED)
X_test_unscaled = X_test.copy()


## SMOTE-ing

In [6]:
smotifier = SMOTE(sampling_strategy= 0.1, random_state=RSEED, )


In [7]:

X_smote, y_smote = smotifier.fit_resample(X_train, y_train)
X_smote = pd.DataFrame(data= X_smote, columns=X_train.columns)
y_smote = pd.Series(data= y_smote, name="FraudResult")
smote_df_combined = X_smote.copy()
smote_df_combined["FraudResult"] = y_smote

In [8]:
smote_df_combined.groupby("FraudResult").count()

Unnamed: 0_level_0,Year,Month,Week,Day,is_workday,is_worktime,SignAmount,ModAmount,ProviderId_1,ProviderId_2,...,PricingStrategy_2,PricingStrategy_4,batch_size,total_transactions_by_customer,transactions_by_customer_this_month,transactions_by_customer_this_week,transactions_by_customer_this_day,day_vs_week,day_vs_month,week_vs_month
FraudResult,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,71601,71601,71601,71601,71601,71601,71601,71601,71601,71601,...,71601,71601,71601,71601,71601,71601,71601,71601,71601,71601
1,7160,7160,7160,7160,7160,7160,7160,7160,7160,7160,...,7160,7160,7160,7160,7160,7160,7160,7160,7160,7160


Simple forest, no scaling

In [9]:
rf = RandomForestClassifier(random_state=RSEED)
rf.fit(X_smote, y_smote)
y_pred = rf.predict(X_test)

In [10]:
print(confusion_matrix(y_test, y_pred))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred),3)}")
print(f"F2 is: {round(fbeta_score(y_test, y_pred, beta=2),3)}")
print("----" * 10)
calculate_cost(X_test_unscaled, y_test, y_pred)

[[23858    10]
 [   15    33]]
----------------------------------------
Recall is:0.688
F1 is:0.725
F2 is: 0.702
----------------------------------------
You detected 33 fraudulent transactions
You saved the company 86.82 Million UGX
You missed 15 fraudulent transactions
The company now has to reimburse frauds with a total of 11.0 Million UGX
Total money saved is 75.82 Million UGX
You incorrectly flagged 10 legit transactions as fraudulent


Scaling

In [11]:
minmax = MinMaxScaler()
stdsc = StandardScaler()
X_smote['ModAmount'] = stdsc.fit_transform(pd.DataFrame(X_smote['ModAmount']))
X_smote['ModAmount'] = stdsc.transform(pd.DataFrame(X_smote['ModAmount']));

In [12]:
rf = RandomForestClassifier(random_state=RSEED)
rf.fit(X_smote, y_smote)
y_pred = rf.predict(X_test)

In [13]:
print(confusion_matrix(y_test, y_pred))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred),3)}")
print(f"F2 is: {round(fbeta_score(y_test, y_pred, beta=2),3)}")
print("----" * 10)
calculate_cost(X_test_unscaled, y_test, y_pred)

[[19144  4724]
 [    8    40]]
----------------------------------------
Recall is:0.833
F1 is:0.017
F2 is: 0.04
----------------------------------------
You detected 40 fraudulent transactions
You saved the company 90.56 Million UGX
You missed 8 fraudulent transactions
The company now has to reimburse frauds with a total of 7.26 Million UGX
Total money saved is 83.3 Million UGX
You incorrectly flagged 4724 legit transactions as fraudulent


In [14]:
rf = RandomForestClassifier(random_state=RSEED, max_depth = None, max_features ='sqrt', max_leaf_nodes= None, min_samples_split= 5, n_estimators= 100)
rf.fit(X_smote, y_smote)
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred),3)}")
print(f"F2 is: {round(fbeta_score(y_test, y_pred, beta=2),3)}")
print("----" * 10)
calculate_cost(X_test_unscaled, y_test, y_pred)

[[19226  4642]
 [    6    42]]
----------------------------------------
Recall is:0.875
F1 is:0.018
F2 is: 0.043
----------------------------------------
You detected 42 fraudulent transactions
You saved the company 92.0 Million UGX
You missed 6 fraudulent transactions
The company now has to reimburse frauds with a total of 5.82 Million UGX
Total money saved is 86.18 Million UGX
You incorrectly flagged 4642 legit transactions as fraudulent


rf = RandomForestClassifier(random_state=RSEED, max_depth = None, max_features ='sqrt', max_leaf_nodes= None, min_samples_split= 5, n_estimators= 100)
SMOTE strategy 1 => Recall is:0.625, F1 is:0.106

In [15]:
param_grid = {'n_estimators': np.linspace(50, 150, 3).astype(int),
                'max_depth': [None] + list(np.linspace(3, 100, 3).astype(int)),
                'max_features': ['sqrt'] + list(np.arange(0.5, 1, 0.25)),
                'max_leaf_nodes': list(np.linspace(20, 100, 3).astype(int)) + [None],
                'min_samples_split':[2,5,10]
                }

scorer = make_scorer(matthews_corrcoef)
rand_grid = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1, scoring=scorer, verbose=5)

In [16]:
rand_grid.fit(X_smote, y_smote)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV 1/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=50;, score=0.956 total time=   1.4s
[CV 2/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=50;, score=0.960 total time=   1.5s
[CV 3/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=50;, score=0.953 total time=   1.5s
[CV 5/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=50;, score=0.958 total time=   1.6s
[CV 4/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=50;, score=0.959 total time=   1.6s
[CV 1/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=100;, score=0.954 total time=   2.9s
[CV 2/5] END max_depth=None, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=100;, score=

In [17]:
print(rand_grid.best_params_)
print(rand_grid.best_score_)

y_pred_grid = rand_grid.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_pred_grid))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred_grid),3)}")
print(f"MCC is: {round(matthews_corrcoef(y_test, y_pred_grid),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred_grid),3)}")
print(f"F2 is: {round(fbeta_score(y_test, y_pred_grid, beta=2),3)}")
print("----" * 10)
calculate_cost(X_test_unscaled, y_test, y_pred_grid)

{'max_depth': None, 'max_features': 0.75, 'max_leaf_nodes': None, 'min_samples_split': 5, 'n_estimators': 150}
0.9972426078738141
[[ 8010 15858]
 [    7    41]]
----------------------------------------
Recall is:0.854
MCC is: 0.018
F1 is:0.005
F2 is: 0.013
----------------------------------------
You detected 41 fraudulent transactions
You saved the company 90.0 Million UGX
You missed 7 fraudulent transactions
The company now has to reimburse frauds with a total of 7.82 Million UGX
Total money saved is 82.18 Million UGX
You incorrectly flagged 15858 legit transactions as fraudulent


In [34]:
"""{'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'min_samples_split': 5, 'n_estimators': 100}
0.9995806613048874"""


"""[[23380   488]
 [   18    30]]
----------------------------------------
Recall is:0.625
F1 is:0.106
F2 is: 0.211
----------------------------------------
You detected 30 fraudulent transactions
You saved the company 77.98 Million UGX
You missed 18 fraudulent transactions
The company now has to reimburse frauds with a total of 19.84 Million UGX
Total money saved is 58.14 Million UGX
You incorrectly flagged 488 legit transactions as fraudulent"""

[[23380   488]
 [   18    30]]
----------------------------------------
Recall is:0.625
F1 is:0.106
F2 is: 0.211
----------------------------------------
You detected 30 fraudulent transactions
You saved the company 77.98 Million UGX
You missed 18 fraudulent transactions
The company now has to reimburse frauds with a total of 19.84 Million UGX
Total money saved is 58.14 Million UGX
You incorrectly flagged 488 legit transactions as fraudulent


In [None]:
"""{'max_depth': 10,
 'max_features': 0.5,
 'max_leaf_nodes': 50,
 'min_samples_split': 10,
 'n_estimators': 150}"""

Good_model = RandomForestClassifier(max_depth=10, max_features=0.5, max_leaf_nodes=50, min_samples_split=10, n_estimators=150)

In [None]:
Good_model = RandomForestClassifier(max_depth=10, max_features=0.5, max_leaf_nodes=50, min_samples_split=10, n_estimators=150)
Good_model.fit(X_train, y_train)
y_pred_new = Good_model.predict(X_test)


In [None]:
Good_model = RandomForestClassifier(max_depth=10, max_features=0.5, max_leaf_nodes=50, min_samples_split=10, n_estimators=150)
Good_model.fit(X_train, y_train)
y_pred_new = Good_model.predict(X_test)


from Error_analysis import calculate_cost #maybe has to be updated to show some numbers

print(confusion_matrix(y_test, y_pred_new))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred_new),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred_new),3)}")
print(f"MCC is:{round(matthews_corrcoef(y_test, y_pred_new),3)}")
print("----" * 10)
calculate_cost(X_test_unscaled, y_test, y_pred_new)

"""[[23864     4]
 [    9    39]]
----------------------------------------
Recall is:0.812
F1 is:0.857
MCC is:0.858
----------------------------------------
You detected 39 fraudulent transactions
You saved the company 88.02 Million UGX
You missed 9 fraudulent transactions
The company now has to reimburse frauds with a total of 9.8 Million UGX
Total money saved is 78.22 Million UGX
You incorrectly flagged 4 legit transactions as fraudulent"""

[[23864     4]
 [    9    39]]
----------------------------------------
Recall is:0.812
F1 is:0.857
MCC is:0.858
----------------------------------------
You detected 39 fraudulent transactions
You saved the company 88.02 Million UGX
You missed 9 fraudulent transactions
The company now has to reimburse frauds with a total of 9.8 Million UGX
Total money saved is 78.22 Million UGX
You incorrectly flagged 4 legit transactions as fraudulent


In [None]:
from Error_analysis import get_classification_dfs
TN, FP, FN, TP = get_classification_dfs(X_test_unscaled, y_test, y_pred_new)


ImportError: cannot import name 'get_classification_dfs' from 'Error_analysis' (/Users/valentin/neuefische/Machine-Learning-Project_Xente-Fraud-Detection/Error_analysis.py)

In [None]:
confusion_matrix(y_test, y_pred_grid)

array([[23867,     1],
       [   10,    38]])

In [None]:
calculate_cost(X_test_unscaled, y_test, y_pred_grid)

You saved the company 86.02 Million UGX
The company still has to reimburse frauds with a total of 11.8 Million UGX
Total money saved is 74.22 Million UGX


In [None]:
#result of long gridsearch , stdscaled and MCC optimized

"""{'max_features': 0.75,
 'max_leaf_nodes': 80,
 'min_samples_split': 5,
 'n_estimators': 125}

 Recall is:0.792
MCC is:0.878

You saved the company 86.02 Million UGX
The company still has to reimburse frauds with a total of 11.8 Million UGX
Total money saved is 74.22 Million UGX""";

In [None]:
#with stdscaler and MCC optimization 

"""param_grid = {'n_estimators': #np.linspace(100, 200, 5).astype(int),
                    'max_depth': [None] + list(np.linspace(3, 100, 10).astype(int)),
                    'max_features': [0.75], ['sqrt']+ list(np.arange(0.5, 1, 0.25)),
                    'max_leaf_nodes': list(np.linspace(80, 100, 5).astype(int)) + [None],
                    'min_samples_split':[2, 5, 10, 50]
                    }

scorer = make_scorer(matthews_corrcoef)

rand_grid = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1, scoring=scorer, verbose=5)

Recall is:0.7916666666666666
ROC-AUC is:0.895812384782973
F1 is:0.8735632183908045
F-beta is:0.9313725490196079
MCC is:0.8780632982539146

{'max_features': 0.75,
 'max_leaf_nodes': 100,
 'min_samples_split': 5,
 'n_estimators': 125}""";

In [None]:
# with minmax scaling and smote
"""{'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': 50,
 'min_samples_split': 5,
 'n_estimators': 150}

 [[23862     6]
 [   22    26]]
----------------------------------------
Recall is:0.542
F1 is:0.65
MCC is:0.663
You saved the company 74.78 Million UGX
The company still has to reimburse frauds with a total of 23.04 Million UGX
Total money saved is 51.74 Million UGX""";