# In [None]:
# Importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
import seaborn as sns
import joblib
import os
# Step 1: Load the dataset
# Read the fraud CSV (path is relative to the working directory) into a DataFrame.
fraud_data = pd.read_csv('../data/Fraud_Data.csv')

# Step 2: Data Overview
# Print the first rows as a quick visual check of what was loaded.
fraud_preview = fraud_data.head()
print("Fraud Data Sample:")
print(fraud_preview, "\n")

# Step 3: Data Preparation
# One-hot encode the categorical columns (drop_first=True keeps k-1 indicator
# columns for k categories), then remove the timestamp and device-id columns
# that are not used for modeling.
categorical_columns = ['source', 'browser', 'sex']
unused_columns = ['signup_time', 'purchase_time', 'device_id']
fraud_data_encoded = (
    pd.get_dummies(fraud_data, columns=categorical_columns, drop_first=True)
    .drop(columns=unused_columns)
)

# Separate the target (fraud label: 1=Fraud, 0=Not Fraud) from the features.
y_fraud = fraud_data_encoded['class']
X_fraud = fraud_data_encoded.drop(columns=['class'])

# Print the remaining feature dtypes so any non-numeric column can be spotted
# by eye; nothing is enforced here.
print("\nFraud Data Types (after encoding and removing non-numeric columns):")
print(X_fraud.dtypes)

# Step 4: Split the dataset into training and testing sets (80% training, 20% testing)
# stratify=y_fraud keeps the fraud / non-fraud ratio the same in both splits.
# NOTE(review): fraud labels are presumably heavily imbalanced; without
# stratification the fraud share of the test set can drift from the training
# set's and distort the evaluation metrics -- confirm against the data.
# random_state=42 makes the split reproducible.
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud,
    y_fraud,
    test_size=0.2,
    random_state=42,
    stratify=y_fraud,
)

# Display the shape of the training and testing sets
print(f"\nFraud Data - Training Set Shape: {X_train_fraud.shape}, Testing Set Shape: {X_test_fraud.shape}\n")

# Step 5: Model Building and Training
# Fit a 100-tree Random Forest on the training split; random_state pins the
# seed so reruns build the same forest.
print("Training Random Forest Model on Fraud Data...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_fraud, y_train_fraud
)

# Evaluate on the held-out split: per-class report, then overall accuracy.
y_pred_fraud = rf_model.predict(X_test_fraud)
print("Random Forest - Fraud Data - Classification Report:")
fraud_report = classification_report(y_test_fraud, y_pred_fraud)
print(fraud_report)
print(f"Random Forest Accuracy on Fraud Data: {accuracy_score(y_test_fraud, y_pred_fraud)}\n")

# Optional: Plotting Feature Importance
# Pair every training column with the fitted forest's importance score and
# rank the rows from most to least important.
features = X_train_fraud.columns
feature_importances = rf_model.feature_importances_
importances_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# One bar per feature, with the importance score on the x-axis.
plt.figure(figsize=(10, 6))
sns.barplot(data=importances_df, x='Importance', y='Feature')
plt.title('Feature Importances in Random Forest (Fraud Data)')
plt.show()


# Persist the trained Random Forest to disk with joblib.
model_path = '../models/random_forest_fraud_model.pkl'

# Make sure the target directory exists; exist_ok=True makes this a no-op
# when it is already there.
os.makedirs('../models', exist_ok=True)
joblib.dump(rf_model, model_path)

print(f"Random Forest model saved at: {model_path}")
