In [3]:
import os
import pickle

import matplotlib.pyplot as plt # visualization
import numpy as np # working with arrays
import pandas as pd # data processing
import seaborn as sns
import tensorflow as tf
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score,classification_report,precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.callbacks import ModelCheckpoint,History

In [13]:
def load_data(data_url):
    """Read an Excel sheet and separate it into features and target.

    The ``CLASS`` column is taken as the target; all remaining columns are
    the features. Progress is reported on stdout.

    Args:
        data_url: Location of the Excel file, passed straight to
            ``pd.read_excel``.

    Returns:
        Tuple ``(X, y)`` holding the ``.values`` of the feature columns and
        of the ``CLASS`` column respectively.
    """
    print("Loading data ...",end=" ")
    frame = pd.read_excel(data_url)
    # A drop of the 'V7_day' / 'V6_day' columns used to sit here and is
    # currently disabled, so every non-target column is kept as a feature.
    target = frame["CLASS"].values
    features = frame.drop(columns="CLASS").values
    print('\033[32m \u2713 \033[0m')
    return features, target

def split_data(X,y,test_size=0.2,random_state=42):
    """Split features and labels into train and test partitions.

    Thin wrapper around ``train_test_split`` that also reports progress on
    stdout. The defaults reproduce the previously hard-coded 80/20 split
    with seed 42, so existing two-argument calls behave as before.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split`` (default ``0.2``).
        random_state: Seed forwarded to ``train_test_split`` (default ``42``).

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    print("Split data ...",end=" ")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    print('\033[32m \u2713 \033[0m')
    return X_train, X_test, y_train, y_test

def save_model(model,name):
    """Pickle ``model`` to ``./models/<name>.pkl``.

    Progress is reported on stdout. The ``./models`` directory is created
    when it does not exist yet.

    Args:
        model: Any picklable object (here: a fitted estimator).
        name: File stem; also used in the progress message.
    """
    print("Saving "+name+" model ...",end=" ")
    # Without the directory, open() below raises FileNotFoundError.
    os.makedirs("./models", exist_ok=True)
    # The context manager closes the handle even when pickling raises,
    # instead of leaving that to garbage collection.
    with open("./models/"+name+".pkl","wb") as model_file:
        pickle.dump(model, model_file)
    print('\033[32m \u2713 \033[0m')

def print_metrics(y_test,y_pred):
    """Print an empty line followed by the classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    # The individual accuracy / precision / recall / F1 printouts that were
    # parked here inside a string literal never executed and are left out.
    print("")
    report = classification_report(y_test,y_pred)
    print(report)

def confusion_matr(y_test,y_pred,labels=None):
    """Plot the confusion matrix of ``y_pred`` against ``y_test`` as a heatmap.

    Args:
        y_test: True labels (1-D sequence).
        y_pred: Predicted labels (1-D sequence), aligned with ``y_test``.
        labels: Optional tick labels, one per class in matrix order. When
            omitted, a 2x2 matrix keeps the historic ``Normal`` / ``Fraud``
            captions; any other size is labelled with the sorted distinct
            values found in ``y_test`` and ``y_pred``.

    Raises:
        ValueError: If ``labels`` is given but its length differs from the
            number of classes in the matrix.
    """
    conf_matrix = confusion_matrix(y_test, y_pred)
    n_classes = len(conf_matrix)

    if labels is None:
        if n_classes == 2:
            labels = ['Normal', 'Fraud']
        else:
            # Two fixed captions cannot describe a multi-class matrix, so
            # fall back to the class values themselves. Sorted order is
            # assumed to match confusion_matrix's default ordering — TODO confirm.
            labels = [str(c) for c in sorted(set(y_test) | set(y_pred))]
    elif len(labels) != n_classes:
        raise ValueError(
            "Expected {} labels for the confusion matrix, got {}".format(
                n_classes, len(labels)
            )
        )

    plt.figure(figsize =(8, 4))
    sns.heatmap(conf_matrix, xticklabels = labels, yticklabels = labels, annot = True, fmt ="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

def d_tree(X_train, X_test, y_train, y_test, random_state=42):
    """Fit a decision tree, print its test-set report and return it.

    Args:
        X_train: Training features.
        X_test: Test features used for the printed report.
        y_train: Training labels.
        y_test: Test labels used for the printed report.
        random_state: Seed handed to ``DecisionTreeClassifier`` so that
            re-running the notebook gives the same tree (default ``42``).
            Pass ``None`` for an unseeded estimator.

    Returns:
        The fitted ``DecisionTreeClassifier``.
    """
    print("Creating DT model ...",end=" ")
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train, y_train)
    print('\033[32m \u2713 \033[0m')

    y_pred = clf.predict(X_test)
    print_metrics(y_test,y_pred)
    return clf

def r_forest(X_train, X_test, y_train, y_test, random_state=42):
    """Fit a random forest, print its test-set report and return it.

    Args:
        X_train: Training features.
        X_test: Test features used for the printed report.
        y_train: Training labels.
        y_test: Test labels used for the printed report.
        random_state: Seed handed to ``RandomForestClassifier`` so that
            re-running the notebook gives the same forest (default ``42``).
            Pass ``None`` for an unseeded estimator.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("Creating RF model ...",end=" ")
    rfc = RandomForestClassifier(random_state=random_state)
    rfc.fit(X_train, y_train)
    print('\033[32m \u2713 \033[0m')

    y_pred = rfc.predict(X_test)
    print_metrics(y_test,y_pred)
    return rfc

def ex_tree(X_train, X_test, y_train, y_test, n_estimators=100, max_depth=4, random_state=42):
    """Fit an Extra-Trees ensemble, print its test-set report and return it.

    Args:
        X_train: Training features.
        X_test: Test features used for the printed report.
        y_train: Training labels.
        y_test: Test labels used for the printed report.
        n_estimators: Forwarded to ``ExtraTreesClassifier`` (default ``100``,
            the value that used to be hard-coded).
        max_depth: Forwarded to ``ExtraTreesClassifier`` (default ``4``,
            the value that used to be hard-coded).
        random_state: Seed handed to ``ExtraTreesClassifier`` so that
            re-running the notebook gives the same ensemble (default ``42``).
            Pass ``None`` for an unseeded estimator.

    Returns:
        The fitted ``ExtraTreesClassifier``.
    """
    print("Creating Extra_T model ...",end=" ")
    etc = ExtraTreesClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=random_state
    )
    etc.fit(X_train, y_train)
    print('\033[32m \u2713 \033[0m')
    y_pred = etc.predict(X_test)
    print_metrics(y_test,y_pred)
    return etc


In [5]:
# Load the preprocessed workbook and build the train/test split that the
# model cells further down consume (X_train, X_test, y_train, y_test).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_url='C:/Users/KABYADE/Desktop/Fraud_ML/dataset/preprocessing_data.xlsx'
X,y=load_data(data_url)
X_train, X_test, y_train, y_test=split_data(X,y)
# Last expression of the cell: displays the shapes of both feature matrices.
X_train.shape,X_test.shape

Loading data ... [32m ✓ [0m
Split data ... [32m ✓ [0m


((79999, 36), (20000, 36))

In [7]:
# Read the same workbook into a DataFrame (`df`) for ad-hoc inspection.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_url='C:/Users/KABYADE/Desktop/Fraud_ML/dataset/preprocessing_data.xlsx'
# Read through data_url instead of repeating the path as a second literal,
# so the two spellings of the location cannot drift apart.
df = pd.read_excel(data_url)

99999

In [14]:
# Fit the Extra-Trees model on the training split; ex_tree prints the
# test-set classification report and returns the fitted estimator.
ext=ex_tree(X_train, X_test, y_train, y_test)

Creating Extra_T model ... [32m ✓ [0m

              precision    recall  f1-score   support

           1       0.46      0.50      0.48       303
           2       1.00      0.66      0.79       322
           3       1.00      1.00      1.00       332
           4       0.36      0.41      0.38       326
           5       0.94      0.45      0.61       322
           6       0.25      0.06      0.09       320
           7       0.75      0.01      0.02       325
           8       0.68      0.81      0.74       339
           9       1.00      0.78      0.88       327
          10       1.00      1.00      1.00       312
          11       1.00      0.46      0.63       364
          12       0.36      0.41      0.38       313
          13       0.34      0.12      0.17       313
          14       0.35      1.00      0.52       338
          15       0.82      1.00      0.90       296
          16       0.13      0.03      0.05       307
          17       0.14      0.69      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Decision tree: fit on the training split; d_tree prints the test-set
# classification report and returns the fitted estimator.
dt=d_tree(X_train, X_test, y_train, y_test)

Creating DT model ... [32m ✓ [0m

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       303
           2       1.00      1.00      1.00       322
           3       1.00      1.00      1.00       332
           4       0.34      0.35      0.35       326
           5       0.99      1.00      1.00       322
           6       0.34      0.36      0.35       320
           7       0.53      0.52      0.52       325
           8       1.00      1.00      1.00       339
           9       1.00      1.00      1.00       327
          10       1.00      1.00      1.00       312
          11       0.59      0.59      0.59       364
          12       0.78      0.78      0.78       313
          13       0.30      0.27      0.29       313
          14       1.00      1.00      1.00       338
          15       1.00      1.00      1.00       296
          16       0.52      0.54      0.53       307
          17       1.00      1.00      1.00  

In [None]:
# Random forest: fit on the training split; r_forest prints the test-set
# classification report and returns the fitted estimator.
rd=r_forest(X_train, X_test, y_train, y_test)

In [None]:
# Persist the fitted Extra-Trees estimator; save_model writes it to
# ./models/Ex_Trees.pkl.
save_model(ext,"Ex_Trees")

In [None]:
# One hand-written sample to score with the stored random-forest model.
test_data=[  4,  2,  23,  1,  410878,  715848.0,  19,  2976,  44,  20339,  3,  1,  621524,  1,  
             384,  4,  0.654861,  0.654861, -1.0, -0.5,  0.959493, -0.978148,  2009,  2011,  2009,  2009 
           ]
# -1 lets NumPy infer the feature count from the list instead of repeating
# the length (26) by hand, which would break as soon as the list changes.
to_predict = np.array(test_data).reshape(1,-1)

# NOTE(review): this sample has 26 values while the training matrix displayed
# earlier in the notebook has 36 columns — confirm the pickled model was
# fitted on this 26-feature layout.
# NOTE(review): no visible cell writes models/R_forest.pkl (only "Ex_Trees" is
# saved) — confirm where this file comes from.
# pickle.load can execute arbitrary code; only load files you trust.
with open("models/R_forest.pkl","rb") as model_file:
    rf_model = pickle.load(model_file)
rf_model.predict(to_predict) 