In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



import xgboost as xgb
print(xgb.__version__)



from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

2.1.4


### Load the data set

In [7]:

#Load the dataset
df = pd.read_csv('/Users/saivyshnavigudipalli/Documents/604 assignmnet classwork/merged_dataset (1).csv')

df.head()

#check data type

df.info()

#Identifying the features and the target varibale

X = df.drop(columns=['Is.Fraudulent'])
y = df['Is.Fraudulent']

#check for missing values
df.isnull().sum()

# Check for duplicate rows
duplicate_rows = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Transaction.Date    300000 non-null  object 
 1   Transaction.Amount  300000 non-null  float64
 2   Customer.Age        300000 non-null  int64  
 3   Is.Fraudulent       300000 non-null  int64  
 4   Account.Age.Days    300000 non-null  int64  
 5   Transaction.Hour    300000 non-null  int64  
 6   source              300000 non-null  object 
 7   browser             300000 non-null  object 
 8   sex                 300000 non-null  object 
 9   Payment.Method      300000 non-null  object 
 10  Product.Category    300000 non-null  object 
 11  Quantity            300000 non-null  int64  
 12  Device.Used         300000 non-null  object 
 13  Address.Match       300000 non-null  int64  
dtypes: float64(1), int64(6), object(7)
memory usage: 32.0+ MB

Number of duplicate rows:

### Pre-Processing the data

In [39]:
#As we can see from above that there are 5 categorical variables we are going to use label encoding to change it to numerical values

#Convert categorical variables to numeric 


categorical_cols = ['source', 'browser', 'sex', 'Payment.Method', 'Product.Category']

df = df[df['Transaction.Amount'] >= 0]  # No negative transactions
df = df[(df['Customer.Age'] >= 18) & (df['Customer.Age'] <= 100)]  # Reasonable age range
df = df[df['Account.Age.Days'] >= 0]  # No negative account age
df = df[(df['Transaction.Hour'] >= 0) & (df['Transaction.Hour'] < 24)]  # Valid hour range


label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()  # Create an encoder for the column
    df[col] = le.fit_transform(df[col])  # Convert text labels into numbers
    label_encoders[col] = le  # Store the encoder for future use

    #Handle any missing or incorrect values

df = df[df['Customer.Age'] >= 0] # Remove negative ages
    # 4️⃣ Drop Irrelevant Columns (e.g., timestamps if present)
irrelevant_cols = ['Transaction.Timestamp'] if 'Transaction.Timestamp' in df.columns else []
df.drop(columns=irrelevant_cols, inplace=True)

df = df.drop(columns=['Transaction.Date', 'Device.Used'])


# 5️⃣ Check the final dataset
print("\nDataset After Preprocessing:\n", df.head())
print("✅ Data Cleaning & Preprocessing Completed!")

print("\nDataset After Preprocessing:\n", df.head())


Dataset After Preprocessing:
    Transaction.Amount  Customer.Age  Is.Fraudulent  Account.Age.Days  \
0              145.98            29              0               172   
1              677.62            40              0               250   
2              798.63            40              0               118   
3              314.65            34              0               187   
5              182.91            21              0               143   

   Transaction.Hour  source  browser  sex  Payment.Method  Product.Category  \
0                10       0        2    0               2                 3   
1                22       1        1    1               2                 0   
2                20       0        0    1               0                 0   
3                23       2        2    1               1                 4   
5                 5       1        1    0               3                 1   

   Quantity  Address.Match  
0         3              1  
1  

### Modeling

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score,roc_auc_score,confusion_matrix

#Split the dataset 

X= df.drop(columns=['Is.Fraudulent'])
y= df['Is.Fraudulent']

# Split into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)



# Initialize XGBoost classifier without 'use_label_encoder'
xgb_model = xgb.XGBClassifier(eval_metric='logloss')

# Train the model
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

#Evaluate the model using different metrics

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.4f}')

# Recall
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.4f}')

# F1-Score
f1 = f1_score(y_test, y_pred)
print(f'F1-Score: {f1:.4f}')

# ROC-AUC Score
roc_auc = roc_auc_score(y_test, xgb_model.predict_proba(X_test)[:, 1])  # Probabilities for positive class
print(f'ROC-AUC Score: {roc_auc:.4f}')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)


Accuracy: 0.9535
Precision: 0.8566
Recall: 0.4136
F1-Score: 0.5579
ROC-AUC Score: 0.7955
Confusion Matrix:
[[54195   288]
 [ 2440  1721]]
