In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
import os


In [2]:
# Location of the generated transaction dataset, relative to this notebook.
data_path = '../data/TransactionDataset1.csv'

# Fail fast when the dataset is missing. Printing a message and carrying on
# would only defer the failure to a confusing NameError on `data` below.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Error: Dataset not found at {data_path}\n"
        "Please make sure you have run the notebooks in '01_data_generation' first."
    )

data = pd.read_csv(data_path)
print("Dataset loaded successfully.")

# Drop identifier / free-text columns that are not used as model inputs.
# NOTE(review): the loaded frame also carries an 'Unnamed: 0' column (it looks
# like a row index written out with the CSV) which is kept as a feature here --
# confirm whether that is intended.
drop_columns = ['user_id', 'name', 'addresses', 'email_address', 'transaction_id', 'transaction_date']
data = data.drop(columns=drop_columns)

# Convert every remaining object-dtype column to integer codes.
# The single encoder is refit on each column in turn, so once the loop ends it
# only remembers the classes of the last column it processed.
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

print("Data preprocessing complete.")
data.head()


Dataset loaded successfully.
Data preprocessing complete.


Unnamed: 0,age,kyc_status,days_since_kyc_incomplete,transaction_amount,home_branch,transaction_location,transaction_method,transaction_category,transaction_merchant,transaction_time,average_expenditure,comparison_with_avg_expenditure,transaction_count_7_days,fraud_indicator,suspicion_indicator
0,58,0,0,56806.477708,10025,12338,2,2,9770,1869,48294.020252,8512.457457,8,0,1
1,33,0,0,9654.458213,11037,3456,3,2,11743,9614,56937.733646,-47283.275433,8,0,1
2,61,0,0,4067.08965,12486,12338,3,0,13601,15921,17774.789744,-13707.700094,7,0,0
3,23,0,0,46332.094201,7222,6639,3,3,12617,3799,24007.102076,22324.992125,7,0,0
4,19,1,59,60579.284786,5112,4681,3,2,5271,14224,19388.935894,41190.348892,11,0,1


In [3]:
# Separate the label the tree should learn from the predictor columns.
target_column = 'fraud_indicator'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Construct the Decision Tree and fit it on the training partition.
print("Training the Decision Tree model...")
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print("Model training complete.")

Training the Decision Tree model...
Model training complete.


In [4]:
# Score the held-out rows with the fitted tree.
y_pred = model.predict(X_test)

# Compute the evaluation artefacts from the true vs. predicted labels.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report: headline accuracy first, then each heading followed by its table.
print(f'Accuracy: {accuracy:.2f}')
print('\nConfusion Matrix:', conf_matrix, sep='\n')
print('\nClassification Report:', classification_rep, sep='\n')

Accuracy: 0.96

Confusion Matrix:
[[2493   76]
 [  75 1356]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      2569
           1       0.95      0.95      0.95      1431

    accuracy                           0.96      4000
   macro avg       0.96      0.96      0.96      4000
weighted avg       0.96      0.96      0.96      4000

