In [None]:
from IPython.display import Javascript
def resize_colab_cell():
  display(Javascript('google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})'))
get_ipython().events.register('pre_run_cell', resize_colab_cell)

<IPython.core.display.Javascript object>

In [None]:

# Libraries
import numpy as np
import pandas as pd
import os
import gc
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import pickle
from sklearn.model_selection import train_test_split

# ML packages
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Mounted at /content/drive


In [None]:
# Loading data
folder_path = '/content/drive/MyDrive/BDA Project/ieee-fraud-detection'
train_identity = pd.read_csv(f'{folder_path}/train_identity.csv')
train_transaction = pd.read_csv(f'{folder_path}/train_transaction.csv')

sub = pd.read_csv(f'{folder_path}/sample_submission.csv')
# let's combine the data and work with the whole dataset
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')

print(f'Train dataset has {train.shape[0]} rows and {train.shape[1]} columns.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Train dataset has 590540 rows and 434 columns.


In [None]:
# Adjust display settings to show more rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# Display the DataFrame data types
print(train_transaction.isna().sum())

# Reset display settings to default if needed
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
card1                  0
card2               8933
card3               1565
card4               1577
card5               4259
card6               1571
addr1              65706
addr2              65706
dist1             352271
dist2             552913
P_emaildomain      94456
R_emaildomain     453249
C1                     0
C2                     0
C3                     0
C4                     0
C5                     0
C6                     0
C7                     0
C8                     0
C9                     0
C10                    0
C11                    0
C12                    0
C13                    0
C14                    0
D1                  1269
D2                280797
D3                262878
D4                168922
D5                309841
D6                517353
D7                551623
D8                515614
D9                515614


In [None]:
train.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [None]:
# pre-processing
del train_identity, train_transaction #, test_identity, test_transaction

print(f'There are {train.isnull().any().sum()} columns in train dataset with missing values.')

one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]

print(f'There are {len(one_value_cols)} columns in train dataset with one unique value.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

There are 414 columns in train dataset with missing values.
There are 0 columns in train dataset with one unique value.


In [None]:
# Feature Engineering
train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
train['D15_to_mean_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('mean')
train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
train['D15_to_std_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('std')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Data Pre-processing
train["isFraud"].value_counts()

many_null_cols = [col for col in train.columns if train[col].isnull().sum() / train.shape[0] > 0.7]

big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna=False, normalize=True).values[0] > 0.7]

cols_to_drop = list(set(many_null_cols + big_top_value_cols + one_value_cols))
cols_to_drop.remove('isFraud')
print(len(cols_to_drop))

train = train.drop(cols_to_drop, axis=1)

cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
            'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
for col in cat_cols:
    if col in train.columns:
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

352


In [None]:
gc.collect()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

36

In [None]:
X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
y = train.sort_values('TransactionDT')['isFraud']

print(X.shape)
print(y.shape)

del train

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(590540, 101)
(590540,)


In [None]:
def clean_inf_nan_chunked(df, chunksize=10000):
    total_rows = len(df)
    chunks = [df[i:i + chunksize] for i in range(0, total_rows, chunksize)]

    # Process each chunk
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}")
        chunk.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Concatenate the processed chunks back into a DataFrame
    return pd.concat(chunks, ignore_index=True)

# Clean infinite values to NaN in chunks
X = clean_inf_nan_chunked(X)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing chunk 1/60
Processing chunk 2/60
Processing chunk 3/60
Processing chunk 4/60
Processing chunk 5/60
Processing chunk 6/60
Processing chunk 7/60
Processing chunk 8/60
Processing chunk 9/60
Processing chunk 10/60
Processing chunk 11/60
Processing chunk 12/60
Processing chunk 13/60
Processing chunk 14/60
Processing chunk 15/60
Processing chunk 16/60
Processing chunk 17/60
Processing chunk 18/60
Processing chunk 19/60
Processing chunk 20/60


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk.replace([np.inf, -np.inf], np.nan, inplace=True)


Processing chunk 21/60
Processing chunk 22/60
Processing chunk 23/60
Processing chunk 24/60
Processing chunk 25/60
Processing chunk 26/60
Processing chunk 27/60
Processing chunk 28/60
Processing chunk 29/60
Processing chunk 30/60
Processing chunk 31/60
Processing chunk 32/60
Processing chunk 33/60
Processing chunk 34/60
Processing chunk 35/60
Processing chunk 36/60
Processing chunk 37/60
Processing chunk 38/60
Processing chunk 39/60
Processing chunk 40/60
Processing chunk 41/60
Processing chunk 42/60
Processing chunk 43/60
Processing chunk 44/60
Processing chunk 45/60
Processing chunk 46/60
Processing chunk 47/60
Processing chunk 48/60
Processing chunk 49/60
Processing chunk 50/60
Processing chunk 51/60
Processing chunk 52/60
Processing chunk 53/60
Processing chunk 54/60
Processing chunk 55/60
Processing chunk 56/60
Processing chunk 57/60
Processing chunk 58/60
Processing chunk 59/60
Processing chunk 60/60


In [None]:
gc.collect()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0

In [None]:
# Oversampling
oversample = RandomUnderSampler(sampling_strategy=0.25)
X_over, y_over = oversample.fit_resample(X, y)

print(f'X has {X_over.shape[0]} rows and {X_over.shape[1]} columns.')
print(f'Y has {y_over.shape[0]} rows.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

X has 103315 rows and 101 columns.
Y has 103315 rows.


In [None]:
# Adjust display settings to show more rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# Display the DataFrame data types
print(X_over.dtypes)
print(X_over.isna().sum())

# Reset display settings to default if needed
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TransactionAmt                  float64
card1                             int64
card2                             int64
card4                             int64
card5                             int64
addr1                             int64
dist1                           float64
P_emaildomain                     int64
C1                              float64
C2                              float64
C5                              float64
C6                              float64
C9                              float64
C11                             float64
C13                             float64
C14                             float64
D1                              float64
D2                              float64
D3                              float64
D4                              float64
D5                              float64
D10                             float64
D11                             float64
D15                             float64
M1                                int64


In [None]:
X_over.head()
X_over.to_csv('x_over.csv',index=False)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# undersample = RandomUnderSampler(sampling_strategy='majority')
# X_under, y_under = undersample.fit_resample(X, y)

# print(f'Train dataset has {X_under.shape[0]} rows and {X_under.shape[1]} columns.')
# print(f'Test dataset has {y_under.shape[0]} rows.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Concatenate X and Y for simplicity
df = pd.concat([X_over, y_over], axis=1)


df.replace([np.inf, -np.inf], np.nan, inplace=True)
df_cleaned = df.interpolate(method='linear')

df_cleaned.shape

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # You can choose 'median' or 'constant' as well
df_cleaned = pd.DataFrame(imputer.fit_transform(df_cleaned), columns=df_cleaned.columns)

# Separate X and Y after cleaning
X_cleaned = df_cleaned[X_over.columns]
Y_cleaned = df_cleaned['isFraud']

print(f'X cleaned has {X_cleaned.shape[0]} rows and {X_cleaned.shape[1]} columns.')
print(f'Y cleaned has {Y_cleaned.shape[0]} rows.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

X cleaned has 103315 rows and 101 columns.
Y cleaned has 103315 rows.


In [None]:
# X_cleaned.isna().sum()
# Adjust display settings to show more rows and columns
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

# Display the DataFrame data types
print(X_cleaned.isna().sum())

# Reset display settings to default if needed
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TransactionAmt                  0
card1                           0
card2                           0
card4                           0
card5                           0
addr1                           0
dist1                           0
P_emaildomain                   0
C1                              0
C2                              0
C5                              0
C6                              0
C9                              0
C11                             0
C13                             0
C14                             0
D1                              0
D2                              0
D3                              0
D4                              0
D5                              0
D10                             0
D11                             0
D15                             0
M1                              0
M2                              0
M3                              0
M4                              0
M5                              0
M6            

In [None]:
X_cleaned.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,TransactionAmt,card1,card2,card4,card5,addr1,dist1,P_emaildomain,C1,C2,...,TransactionAmt_to_std_card4,D15_to_mean_card1,D15_to_mean_card4,D15_to_std_card1,D15_to_std_card4,D15_to_mean_addr1,D15_to_mean_addr2,D15_to_std_addr1,D15_to_std_addr2,P_emaildomain_1
0,110.25,8643.0,163.0,4.0,54.0,240.0,133.664866,32.0,1.0,1.0,...,0.483128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0
1,280.0,1008.0,206.0,4.0,108.0,175.0,0.0,54.0,5.0,6.0,...,1.226991,2.612061,3.554124,2.356576,2.936092,3.941424,3.278842,3.059381,2.899523,43.0
2,40.0,6227.0,482.0,4.0,108.0,17.0,0.6,16.0,1.0,1.0,...,0.175284,1.871721,2.407919,1.680924,1.989202,2.552842,2.221415,2.040692,1.964427,15.0
3,117.0,12876.0,149.0,4.0,108.0,255.0,1.2,54.0,4.0,2.0,...,0.512707,1.131382,1.261714,1.005271,1.042313,1.16426,1.163989,1.022003,1.029331,43.0
4,226.0,12921.0,10.0,4.0,108.0,17.0,1.8,54.0,4.0,5.0,...,0.990357,0.037552,0.035541,0.029854,0.029361,0.034145,0.032788,0.029459,0.028995,43.0


In [None]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, Y_cleaned, test_size=0.33, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(69221, 101)
(69221,)
(34094, 101)
(34094,)


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
print(X_train_scaled.shape)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

(69221, 101)


In [None]:
# from sklearn.metrics import accuracy_score
# from sklearn.decomposition import PCA

# pca = PCA(n_components=25)

# X_train_scaled = pca.fit_transform(X_train_scaled)
# X_test_scaled = pca.transform(X_test_scaled)

In [None]:

from sklearn.metrics import accuracy_score

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Logistic Regression
lr = LogisticRegression(random_state=0, max_iter=1000).fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
# score =accuracy_score(y_test, pred)
print(score)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

0.8664281105179797


In [None]:
from sklearn.metrics import  accuracy_score, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

y_pred_proba = lr.predict_proba(X_test_scaled)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_pred_proba)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
A = (f"{accuracy:.4f},{precision:.4f},{recall:.4f},{f1:.4f},{auroc:.4f}")
A.split(',')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

['0.8664', '0.7699', '0.4694', '0.5832', '0.8640']

In [None]:
# XGBoost

import xgboost as xgb

xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train_scaled, y_train)
xgb_pred = xgb_clf.predict(X_test_scaled)
score =accuracy_score(y_test, xgb_pred)
print(score)

In [None]:
# Random Forests

from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100)
rf_clf.fit(X_train_scaled, y_train)

rf_pred = rf_clf.predict(X_test_scaled)

score = accuracy_score(y_test, rf_pred)
print(score)

In [None]:
# MLP
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(hidden_layer_sizes=(64, 16), learning_rate_init=0.001, learning_rate='adaptive', solver='sgd', max_iter=100)
mlp_clf.fit(X_train_scaled, y_train)

mlp_pred = mlp_clf.predict(X_test_scaled)

score = accuracy_score(y_test, mlp_pred)
print(score)

<IPython.core.display.Javascript object>



0.8487180918804953


In [None]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_scaled, y_train)
y_pred = knn_classifier.predict(X_test_scaled)

score = accuracy_score(y_test, mlp_pred)
print(score)

<IPython.core.display.Javascript object>

0.8487180918804953


In [None]:
# SVM
from sklearn.svm import LinearSVC

svc_clf = LinearSVC(random_state=0)
svc_clf.fit(X_train_scaled, y_train)

svc_pred = svc_clf.predict(X_test_scaled)

score = accuracy_score(y_test, svc_pred)
print(score)


<IPython.core.display.Javascript object>

0.7781420241997878




In [None]:
from sklearn.naive_bayes import GaussianNB

# Naive Bayes

nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

nb_pred = nb_clf.predict(X_test)

score =accuracy_score(y_test, nb_pred)
print(score)