###Step 1: Upload and unzip the dataset archive

In [None]:

from google.colab import files

# upload the  file
uploaded = files.upload()

# Unzip the archive
import zipfile
import os

archive_name = "dataset"

with zipfile.ZipFile(archive_name, 'r') as zip_ref:
    zip_ref.extractall()  # Extract all files to the current directory
print("Archive unzipped successfully!")
print("Files in directory:", os.listdir())

In [20]:
import io
uploaded_filename = list(uploaded.keys())[0] # Get the filename

# Specify the archive name (updated with actual uploaded filename)
archive_name = uploaded_filename

# Open the zip file with io.BytesIO to handle the uploaded file format
with zipfile.ZipFile(io.BytesIO(uploaded[archive_name]), 'r') as zip_ref:
    zip_ref.extractall()  # Extract all files to the current directory
print("Archive unzipped successfully!")
print("Files in directory:", os.listdir())

Archive unzipped successfully!
Files in directory: ['.config', 'dataset', 'dataset (1).zip', 'fraudTest.csv', 'fraudTrain.csv', '.ipynb_checkpoints', 'sample_data']


### Step 2: Import necessary libraries

In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE  # For handling class imbalance

###Step 3: Load the dataset

In [2]:
data_file = 'fraudTrain.csv'
df = pd.read_csv(data_file)

print("Dataset loaded successfully!")
print(df.head())
print(df.info())

Dataset loaded successfully!
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Ri

In [5]:
print("Column names:", df.columns)


Column names: Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


In [15]:
print("Data types of features (X):\n", X.dtypes)
print("Sample rows from features (X):\n", X.head())


Data types of features (X):
 cc_num                       int64
merchant                    object
amt                        float64
city_pop                     int64
transaction_year             int32
transaction_month            int32
transaction_day              int32
transaction_hour             int32
category_food_dining          bool
category_gas_transport        bool
category_grocery_net          bool
category_grocery_pos          bool
category_health_fitness       bool
category_home                 bool
category_kids_pets            bool
category_misc_net             bool
category_misc_pos             bool
category_personal_care        bool
category_shopping_net         bool
category_shopping_pos         bool
category_travel               bool
gender_M                      bool
dtype: object
Sample rows from features (X):
              cc_num                            merchant     amt  city_pop  \
0  2703186189652095          fraud_Rippin, Kub and Mann    4.97      3495   
1

### Step 4: Data Preprocessing

In [None]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())


from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import pandas as pd

# Drop unnecessary columns
columns_to_drop = ['Unnamed: 0', 'first', 'last', 'street', 'city', 'state', 'zip', 'lat',
                   'long', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long']
df = df.drop(columns=columns_to_drop, axis=1, errors='ignore')

# Convert categorical columns
categorical_columns = ['category', 'gender']
for col in categorical_columns:
    if col in df.columns:
        label_encoder = LabelEncoder()
        df[col] = label_encoder.fit_transform(df[col])

# Check for numeric-only features
X = df.drop('is_fraud', axis=1)  # Features
y = df['is_fraud']  # Target





In [16]:
# Handle 'merchant' column

if 'merchant' in X.columns:

    # Encode the column
    from sklearn.preprocessing import LabelEncoder
    label_encoder = LabelEncoder()
    X['merchant'] = label_encoder.fit_transform(X['merchant'])
    print("Encoded the 'merchant' column.")

# Verify all columns in X are now numeric
print("Data types of features (X):\n", X.dtypes)
print("Sample rows:\n", X.head())

# Apply SMOTE
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X, y)

# Verify class distribution after SMOTE
print("Class distribution after SMOTE:\n", pd.Series(y_resampled).value_counts())


Encoded the 'merchant' column.
Data types of features (X):
 cc_num                       int64
merchant                     int64
amt                        float64
city_pop                     int64
transaction_year             int32
transaction_month            int32
transaction_day              int32
transaction_hour             int32
category_food_dining          bool
category_gas_transport        bool
category_grocery_net          bool
category_grocery_pos          bool
category_health_fitness       bool
category_home                 bool
category_kids_pets            bool
category_misc_net             bool
category_misc_pos             bool
category_personal_care        bool
category_shopping_net         bool
category_shopping_pos         bool
category_travel               bool
gender_M                      bool
dtype: object
Sample rows:
              cc_num  merchant     amt  city_pop  transaction_year  \
0  2703186189652095       514    4.97      3495              2019   
1   

In [17]:
# Step 6: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Step 7: Train a Random Forest Classifier
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train, y_train)

# Step 8: Make predictions and evaluate the model
y_pred = clf.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))

Confusion Matrix:
 [[385697    637]
 [   344 386824]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    386334
           1       1.00      1.00      1.00    387168

    accuracy                           1.00    773502
   macro avg       1.00      1.00      1.00    773502
weighted avg       1.00      1.00      1.00    773502


ROC-AUC Score: 0.9999861018395535


In [18]:
from sklearn.metrics import accuracy_score, f1_score

# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Calculate F1-Score
f1 = f1_score(y_test, y_pred)
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9987
F1-Score: 0.9987


###Final scores:
####ROC-AUC Score: 0.9999861018395535
####Accuracy: 0.9987
####F1-Score: 0.9987

In [19]:
# Evaluate on Training Set
y_train_pred = clf.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.4f}")

# Evaluate on Test Set
y_test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")


Training Accuracy: 1.0000
Test Accuracy: 0.9987
