In [1]:
import pandas as pd # data processing
import numpy as np # working with arrays
import matplotlib.pyplot as plt # visualization
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint,History
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score,classification_report,precision_score, recall_score
from sklearn.metrics import f1_score, matthews_corrcoef,confusion_matrix

In [11]:
def load_data(data_url):
    print("Loading data ...",end=" ")
    df = pd.read_excel(data_url)
    #df.drop(['V7_day','V6_day'], axis=1, inplace=True)
    X = df.drop("CLASS", axis=1)
    y = df["CLASS"]
    X = X.values
    y = y.values
    print('\033[32m \u2713 \033[0m')
    return X,y

def split_data(X,y):
    print("Split data ...",end=" ")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('\033[32m \u2713 \033[0m')
    return X_train, X_test, y_train, y_test

def save_model(model,name):
    print("Saving "+name+" model ...",end=" ")
    pickle.dump(model, open("./models/"+name+".pkl","wb"))
    print('\033[32m \u2713 \033[0m')

def print_metrics(y_test,y_pred):
    n_errors = (y_pred != y_test).sum()
    #print("The model used is Decision Tree  classifier")

    acc = accuracy_score(y_test, y_pred)
    print("The accuracy is {}".format(acc))
    
    prec = precision_score(y_test, y_pred)
    print("The precision is {}".format(prec))
    
    rec = recall_score(y_test, y_pred)
    print("The recall is {}".format(rec))
    
    f1 = f1_score(y_test, y_pred)
    print("The F1-Score is {}".format(f1))

def confusion_matr(y_test,y_pred):
    # printing the confusion matrix
    LABELS = ['Normal', 'Fraud']
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize =(8, 4))
    sns.heatmap(conf_matrix, xticklabels = LABELS,yticklabels = LABELS, annot = True, fmt ="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

def d_tree(X_train, X_test, y_train, y_test):
    print("Creating DT model ...",end=" ")
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    print('\033[32m \u2713 \033[0m')

    y_pred = clf.predict(X_test)
    print_metrics(y_test,y_pred)
    confusion_matr(y_test,y_pred)

    return clf

def r_forest(X_train, X_test, y_train, y_test):
    print("Creating RF model ...",end=" ")
    rfc = RandomForestClassifier()
    rfc.fit(X_train, y_train)
    print('\033[32m \u2713 \033[0m')

    y_pred = rfc.predict(X_test)
    print_metrics(y_test,y_pred)
    confusion_matr(y_test,y_pred)
    
    return rfc

def ex_tree(X_train, X_test, y_train, y_test):
    print("Creating Extra_T model ...",end=" ")
    etc = ExtraTreesClassifier(n_estimators=100, max_depth=4)
    etc.fit(X_train, y_train)
    print('\033[32m \u2713 \033[0m')
    y_pred = etc.predict(X_test)
    print_metrics(y_test,y_pred)
    confusion_matr(y_test,y_pred)
    return etc

num_classes=63

def cnn_md(X,y):
    
    X_train, X_test, y_train, y_test = split_data(X,y)
    
    print("Creating CNN model ...",end=" ")

    # Define the CNN model
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Reshape((X.shape[1], 1), input_shape=(X.shape[1],)  ))
    model.add(tf.keras.layers.Conv1D(32, 2, activation='relu'))
    model.add(tf.keras.layers.MaxPooling1D(2))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
    
    
    print('\033[32m \u2713 \033[0m')  

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    history= History()
    
    print("Training step ...")
    # Train the model
    history= model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))
    print('\033[32m \u2713 \033[0m')  

    # Evaluate the model on the test data
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print("Test Accuracy:", test_accuracy)

    # Make predictions on the test data
    # y_pred = model.predict(X_test)
    # confusion_matr(y_test,y_pred) 

    return history,model

def ensemble_learning(X_train, X_test, y_train, y_test,mds):
    dt,df=mds
    # Combine the models into an ensemble
    ensemble_clf = VotingClassifier(estimators=[("dt", dt), ("rf", rd)], voting='hard')
    ensemble_clf.fit(X_train, y_train)
    y_pred = ensemble_clf.predict(X_test)


In [32]:
def load_data(data_url):
    print("Loading data 2 ...",end=" ")
    df = pd.read_excel(data_url)
    #df.drop(['V7_day','V6_day'], axis=1, inplace=True)
    X = df.drop("CLASS", axis=1)
    y = df["CLASS"]
    print('\033[32m \u2713 \033[0m')
    return X,y

In [158]:
data_url='C:/Users/KABYADE/Desktop/Fraud_ML/dataset/preprocessing_data.xlsx'
X,y=load_data(data_url)

Loading data 2 ... [32m ✓ [0m


In [159]:
for i  in range(len(y)):
    y[i]=y[i]-1


In [161]:
data = X.values # list of sequences
y = tf.keras.utils.to_categorical(y,len(np.unique(y)))
labels = y # list of class labels

In [162]:
from sklearn.preprocessing import StandardScaler

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)
# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [167]:
print("Creating CNN model ...",end=" ")
# Define the CNN model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(32, 3, input_shape = (36,1), activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(63, activation='softmax'))
model.summary()

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history= History()

Creating CNN model ... Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_38 (Conv1D)          (None, 34, 32)            128       
                                                                 
 max_pooling1d_37 (MaxPoolin  (None, 17, 32)           0         
 g1D)                                                            
                                                                 
 flatten_32 (Flatten)        (None, 544)               0         
                                                                 
 dense_85 (Dense)            (None, 64)                34880     
                                                                 
 dense_86 (Dense)            (None, 63)                4095      
                                                                 
Total params: 39,103
Trainable params: 39,103
Non-trainable params: 0
__________________________

In [168]:
print("Training step ...")
# Train the model
history= model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))
print('\033[32m \u2713 \033[0m') 

Training step ...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
[32m ✓ [0m


In [None]:
from keras.preprocessing.sequence import TimeseriesGenerator

X_train, X_test, y_train, y_test = split_data(X,y)
print(X_train.shape)

data=X
labels=y

# create a time series generator
generator = TimeseriesGenerator(data, labels,batch_size=32)

In [None]:
cnn=cnn_md(X,y)

In [33]:
#Decision_Tree
dt=d_tree(X_train, X_test, y_train, y_test)

Creating DT model ... [32m ✓ [0m
The accuracy is 0.7665


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [34]:
rd=r_forest(X_train, X_test, y_train, y_test)

Creating RF model ... [32m ✓ [0m
The accuracy is 0.7601


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [None]:
save_model(rd,"R_forest")

In [None]:
test_data=[  4,  2,  23,  1,  410878,  715848.0,  19,  2976,  44,  20339,  3,  1,  621524,  1,  
             384,  4,  0.654861,  0.654861, -1.0, -0.5,  0.959493, -0.978148,  2009,  2011,  2009,  2009 
           ]
to_predict = np.array(test_data).reshape(1,26)

rf_model = pickle.load(open("models/R_forest.pkl","rb"))
rf_model.predict(to_predict) 