# ***Team Modelling***

In [1]:
# Import modules

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix, make_scorer, matthews_corrcoef, f1_score, classification_report, fbeta_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from Error_analysis import calculate_cost

RSEED= 42

from imblearn.over_sampling import SMOTE


In [2]:
# Import Dataset

df= pd.read_csv('data/training_preprocessed.csv')

In [3]:

y= df.FraudResult
df.drop("FraudResult", axis=1, inplace=True)
X= df
X.drop(["TransactionId"], axis=1, inplace=True)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=RSEED)
X_test_unscaled = X_test.copy()


## SMOTE-ing

In [5]:
smotifier = SMOTE(random_state=RSEED)


In [6]:
X_smote, y_smote = smotifier.fit_resample(X_train, y_train)
X_smote = pd.DataFrame(data= X_smote, columns=X_train.columns)
y_smote = pd.DataFrame(data= y_smote, columns=["FraudResult"])
smote_df_combined = X_smote.copy()
smote_df_combined["FraudResult"] = y_smote["FraudResult"]

In [7]:
smote_df_combined.groupby("FraudResult").count()

Unnamed: 0_level_0,Year,Month,Week,Day,is_workday,is_worktime,SignAmount,ModAmount,ProviderId_1,ProviderId_2,...,PricingStrategy_2,PricingStrategy_4,batch_size,total_transactions_by_customer,transactions_by_customer_this_month,transactions_by_customer_this_week,transactions_by_customer_this_day,day_vs_week,day_vs_month,week_vs_month
FraudResult,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,71601,71601,71601,71601,71601,71601,71601,71601,71601,71601,...,71601,71601,71601,71601,71601,71601,71601,71601,71601,71601
1,71601,71601,71601,71601,71601,71601,71601,71601,71601,71601,...,71601,71601,71601,71601,71601,71601,71601,71601,71601,71601


Simple forest, no scaling

In [8]:
rf = RandomForestClassifier(random_state=RSEED)
rf.fit(X_smote, y_smote)


  rf.fit(X_smote, y_smote)


In [9]:
y_pred = rf.predict(X_test)

In [10]:
print(confusion_matrix(y_test, y_pred))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred),3)}")
print(f"MCC is:{round(matthews_corrcoef(y_test, y_pred),3)}")
calculate_cost(X_test_unscaled, y_test, y_pred)

[[23857    11]
 [   16    32]]
----------------------------------------
Recall is:0.667
F1 is:0.703
MCC is:0.704
You detected 32 fraudulent transactions
You saved the company 84.82 Million UGX
You missed 16 fraudulent transactions
The company now has to reimburse frauds with a total of 13.0 Million UGX
Total money saved is 71.82 Million UGX
You incorrectly flagged 11 legit transactions as fraudulent


Scaling

In [11]:
stdsc = StandardScaler()
X_train['ModAmount'] = stdsc.fit_transform(pd.DataFrame(X_train['ModAmount']))
X_test['ModAmount'] = stdsc.transform(pd.DataFrame(X_test['ModAmount']));
X_smote['ModAmount'] = stdsc.fit_transform(pd.DataFrame(X_smote['ModAmount']))
X_smote['ModAmount'] = stdsc.transform(pd.DataFrame(X_smote['ModAmount']));

In [12]:
rf = RandomForestClassifier()
rf.fit(X_smote, y_smote)
y_pred = rf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred),3)}")
print(f"MCC is:{round(matthews_corrcoef(y_test, y_pred),3)}")
calculate_cost(X_test_unscaled, y_test, y_pred)


  rf.fit(X_smote, y_smote)


[[23484   384]
 [   20    28]]
----------------------------------------
Recall is:0.583
F1 is:0.122
MCC is:0.195
You detected 28 fraudulent transactions
You saved the company 76.04 Million UGX
You missed 20 fraudulent transactions
The company now has to reimburse frauds with a total of 21.78 Million UGX
Total money saved is 54.26 Million UGX
You incorrectly flagged 384 legit transactions as fraudulent


In [25]:
"""
param_grid = {'n_estimators': [150,200],
                'max_depth': [10, None], #+ list(np.linspace(3, 100, 5).astype(int)),
                'max_features': ['sqrt'] + list(np.arange(0.5, 0.75, 0.25)),
                'max_leaf_nodes': [20, 50, 75],
                'min_samples_split': [2, 5, 10]
                    }

scorer = make_scorer(matthews_corrcoef)

rand_grid = GridSearchCV(rf, param_grid=param_grid, n_jobs=-1, scoring="roc_auc", verbose=5)"""

In [26]:
#rand_grid.fit(X_smote, y_smote)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 2/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=150;, score=1.000 total time=   5.7s
[CV 3/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=150;, score=1.000 total time=   6.1s
[CV 5/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=150;, score=0.975 total time=   5.9s
[CV 1/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=150;, score=1.000 total time=   6.4s
[CV 4/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=150;, score=0.983 total time=   6.2s
[CV 2/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=200;, score=1.000 total time=   8.3s
[CV 1/5] END max_depth=10, max_features=sqrt, max_leaf_nodes=20, min_samples_split=2, n_estimators=200;, score=1.000 total

In [27]:
rand_grid.best_params_
"""{'max_depth': 10,
 'max_features': 0.5,
 'max_leaf_nodes': 50,
 'min_samples_split': 10,
 'n_estimators': 150}"""

{'max_depth': 10,
 'max_features': 0.5,
 'max_leaf_nodes': 50,
 'min_samples_split': 10,
 'n_estimators': 150}

In [13]:
rf_best = RandomForestClassifier(max_depth= 10, max_features = 0.5, max_leaf_nodes = 50, min_samples_split = 10, n_estimators = 150)
rf_best.fit(X_smote, y_smote)
y_pred_best = rf_best.predict(X_test)
print(confusion_matrix(y_test, y_pred_best))
print("----" * 10)
print(f"Recall is:{round(recall_score(y_test, y_pred_best),3)}")
print(f"F1 is:{round(f1_score(y_test, y_pred_best),3)}")
print(f"MCC is:{round(matthews_corrcoef(y_test, y_pred_best),3)}")
calculate_cost(X_test_unscaled, y_test, y_pred_best)


  rf_best.fit(X_smote, y_smote)


[[22515  1353]
 [   12    36]]
----------------------------------------
Recall is:0.75
F1 is:0.05
MCC is:0.133
You detected 36 fraudulent transactions
You saved the company 84.27 Million UGX
You missed 12 fraudulent transactions
The company now has to reimburse frauds with a total of 13.54 Million UGX
Total money saved is 70.72 Million UGX
You incorrectly flagged 1353 legit transactions as fraudulent
