In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')

In [11]:
df = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/train_non_embedded.csv', index_col=None)

In [12]:
test = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/test_non_embedded.csv', index_col=None)

In [13]:
X_train = df.drop('fraudulent',axis = 1)
y_train = df[['fraudulent']]
X_test = test.drop('fraudulent',axis = 1)
y_test = test[['fraudulent']]

<h1> LightGBM


In [None]:
classifier_lgbm = LGBMClassifier(random_state=0,objective='binary')
# train
classifier_lgbm.fit(X_train, y_train)
# predict
y_pred_lgbm = classifier_lgbm.predict(X_test)

[LightGBM] [Info] Number of positive: 685, number of negative: 13619
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10963
[LightGBM] [Info] Number of data points in the train set: 14304, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047889 -> initscore=-2.989802
[LightGBM] [Info] Start training from score -2.989802


In [None]:
accuracy = accuracy_score(y_test, y_pred_lgbm)
precision = precision_score(y_test, y_pred_lgbm)
recall = recall_score(y_test, y_pred_lgbm)
f1 = f1_score(y_test, y_pred_lgbm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9804250559284117
Precision: 0.9911504424778761
Recall: 0.6187845303867403
F1: 0.7619047619047619


<h1>GaussianNB

In [None]:
classifier_gnb = GaussianNB()
# train
classifier_gnb.fit(X_train, y_train)
# predict
y_pred_gnb = classifier_gnb.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_gnb)
precision = precision_score(y_test, y_pred_gnb)
recall = recall_score(y_test, y_pred_gnb)
f1 = f1_score(y_test, y_pred_gnb)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.8375279642058165
Precision: 0.21014492753623187
Recall: 0.8011049723756906
F1: 0.3329506314580941


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create the logistic regression model with class_weight='balanced'
logistic_model = LogisticRegression(class_weight='balanced')

# Fit the model on the training data
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_logistic = logistic_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_logistic)
precision = precision_score(y_test, y_pred_logistic)
recall = recall_score(y_test, y_pred_logistic)
f1 = f1_score(y_test, y_pred_logistic)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9110738255033557
Precision: 0.33573141486810554
Recall: 0.7734806629834254
F1 Score: 0.468227424749164


<h1> After Adding Embedded Features

In [3]:
df = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/combined_train.csv', index_col=None)

In [4]:
test = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/combined_test.csv', index_col=None)

In [5]:
feature_importance = pd.read_csv('/content/drive/My Drive/BT4012 Group 06/Data/feature_importance.csv', index_col=None)

In [6]:
top_200_features = feature_importance[0:200]

In [7]:
X_train = df.drop('fraudulent',axis = 1)[top_200_features['Feature'].tolist()]
y_train = df[['fraudulent']]
X_test = test.drop('fraudulent',axis = 1)[top_200_features['Feature'].tolist()]
y_test = test[['fraudulent']]

In [None]:
# Calculate the correlation between each feature and the target
correlation_with_target = X_train.apply(lambda col: col.corr(y_train['fraudulent']))
correlation_df = pd.DataFrame(list(correlation_with_target.items()), columns=['Feature', 'Correlation'])
correlation_df['abs_correlation'] = correlation_df['Correlation'].abs()
# Sort the DataFrame by correlation values in descending order
correlation_df = correlation_df.sort_values(by='abs_correlation', ascending=False)

In [None]:
X_train = df.drop('fraudulent',axis = 1)[correlation_df[0:200]['Feature'].tolist()]
y_train = df[['fraudulent']]
X_test = test.drop('fraudulent',axis = 1)[correlation_df[0:200]['Feature'].tolist()]
y_test = test[['fraudulent']]

<h1>LightGBM

In [None]:
classifier_lgbm = LGBMClassifier(random_state=0,objective='binary')
# train
classifier_lgbm.fit(X_train, y_train)
# predict
y_pred_lgbm = classifier_lgbm.predict(X_test)

[LightGBM] [Info] Number of positive: 685, number of negative: 13619
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 45327
[LightGBM] [Info] Number of data points in the train set: 14304, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047889 -> initscore=-2.989802
[LightGBM] [Info] Start training from score -2.989802


In [None]:
accuracy = accuracy_score(y_test, y_pred_lgbm)
precision = precision_score(y_test, y_pred_lgbm)
recall = recall_score(y_test, y_pred_lgbm)
f1 = f1_score(y_test, y_pred_lgbm)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.9686800894854586
Precision: 0.9259259259259259
Recall: 0.4143646408839779
F1: 0.5725190839694656


<h1>GaussianNB

In [None]:
classifier_gnb = GaussianNB()
# train
classifier_gnb.fit(X_train, y_train)
# predict
y_pred_gnb = classifier_gnb.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_gnb)
precision = precision_score(y_test, y_pred_gnb)
recall = recall_score(y_test, y_pred_gnb)
f1 = f1_score(y_test, y_pred_gnb)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.8431208053691275
Precision: 0.2147147147147147
Recall: 0.7900552486187845
F1: 0.3376623376623376


<h1> LSTM

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense

# Convert DataFrame to NumPy array and reshape
X_train_reshaped = X_train.values.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_reshaped = X_test.values.reshape(X_test.shape[0], 1, X_test.shape[1])
model = Sequential([
    LSTM(50, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    Dense(1, activation='sigmoid')
])

# Compile the model (adjust loss and optimizer as needed)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit your model to the training data
model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_data=(X_test_reshaped, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b00309ebfa0>

In [None]:
# Predict using the trained model
y_pred_prob = model.predict(X_test_reshaped)

# Convert probabilities to binary classes using a threshold (e.g., 0.5)
threshold = 0.5
y_pred_lstm = (y_pred_prob > threshold).astype(int)

# Calculate evaluation metrics
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
precision_lstm = precision_score(y_test, y_pred_lstm)
recall_lstm = recall_score(y_test, y_pred_lstm)
f1_lstm = f1_score(y_test, y_pred_lstm)

# Print the evaluation metrics
print("LSTM Model Metrics:")
print("Accuracy:", accuracy_lstm)
print("Precision:", precision_lstm)
print("Recall:", recall_lstm)
print("F1 Score:", f1_lstm)

LSTM Model Metrics:
Accuracy: 0.9493847874720358
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


<h1> Rare Event Logistic

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create the logistic regression model with class_weight='balanced'
logistic_model = LogisticRegression(class_weight='balanced')

# Fit the model on the training data
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_logistic = logistic_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_logistic)
precision = precision_score(y_test, y_pred_logistic)
recall = recall_score(y_test, y_pred_logistic)
f1 = f1_score(y_test, y_pred_logistic)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.9021252796420581
Precision: 0.3039443155452436
Recall: 0.7237569060773481
F1 Score: 0.42810457516339867


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create the logistic regression model with a higher penalty for misclassifying the minority class
logistic_model = LogisticRegression(class_weight='balanced', solver='liblinear')

# Fit the model on the training data
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred_logistic = logistic_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred_logistic)
precision = precision_score(y_test, y_pred_logistic)
recall = recall_score(y_test, y_pred_logistic)
f1 = f1_score(y_test, y_pred_logistic)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9074384787472036
Precision: 0.31343283582089554
Recall: 0.6961325966850829
F1 Score: 0.4322469982847341
