In [None]:
# Import necessary libraries
import sys
import pandas as pd
import scorecardpy as sc
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import joblib  # For saving models

# Silence every warning for the rest of the run. Note that this is a blanket
# filter: warnings raised by any library used later are hidden as well.
warnings.filterwarnings('ignore')

# Load the custom scripts.
# `from scripts.load_data import ...` looks for a `scripts` package inside an
# entry of sys.path, so the *parent* directory ('..') has to be on the path.
# '../scripts' on its own only exposes the modules inside that folder as
# top-level names; it is kept here for backward compatibility.
for extra_path in ('..', '../scripts'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)
from scripts.load_data import load_data

# Step 1: Load the data
# NOTE(review): presumably load_data returns a pandas DataFrame (the steps
# that follow call DataFrame methods on it) -- confirm against the helper.
data = load_data('../data/data.csv')

# Step 2: Remove the columns that are not used as model features
# (described in the notebook as having too many unique values).
unused_columns = [
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
    'CurrencyCode',
    'CountryCode',
]
data.drop(columns=unused_columns, inplace=True)

# Step 3: Derive temporal features from the transaction timestamp.
# errors='coerce' turns unparseable timestamps into NaT instead of raising.
data['TransactionStartTime'] = pd.to_datetime(data['TransactionStartTime'], errors='coerce')
timestamps = data['TransactionStartTime']

# One new column per datetime component, created in the order
# hour -> day -> month -> year.
for component in ('hour', 'day', 'month', 'year'):
    data[f'transaction_{component}'] = getattr(timestamps.dt, component)

# The raw timestamp is no longer needed once its components are extracted.
data.drop(columns=['TransactionStartTime'], inplace=True)

# Name of the label column used for splitting, binning and as the target.
target_column = 'FraudResult'

# Step 4: Split into training and testing sets (ratio=0.7, fixed seed).
train, test = sc.split_df(data, target_column, ratio=0.7, seed=999).values()

# Step 5: Fit the WoE bins on the training set only.
woe_bins = sc.woebin(train, y=target_column)

# Step 6: Apply the same fitted bins to both sets (train first, then test).
train_woe, test_woe = (sc.woebin_ply(frame, woe_bins) for frame in (train, test))

# Separate the features from the target for each set.
X_train, y_train = train_woe.drop(columns=[target_column]), train_woe[target_column]
X_test, y_test = test_woe.drop(columns=[target_column]), test_woe[target_column]

# Step 7: Model Selection and Training
# Both baseline models are fitted on the WoE-transformed training data;
# estimator.fit() returns the estimator itself, so the calls are chained.
log_reg = LogisticRegression(solver='liblinear').fit(X_train, y_train)
rf_clf = RandomForestClassifier(random_state=999).fit(X_train, y_train)

# Step 8: Hyperparameter Tuning (Optional)
# Candidate values explored by the Random Forest grid search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

# 5-fold cross-validated search over the grid, ranked by ROC AUC.
grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# Keep the best estimator found by the search for evaluation and saving.
best_rf = grid_search_rf.best_estimator_

# Step 9: Model Evaluation
def _evaluate_on_test(model, label):
    """Score ``model`` on the test set and print a short report.

    Prints accuracy, AUC and the classification report, each line prefixed
    with ``label``.

    Returns:
        A tuple ``(predictions, probabilities, accuracy, auc)`` where
        ``probabilities`` is column 1 of ``model.predict_proba(X_test)``.
    """
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, predictions)
    auc = roc_auc_score(y_test, probabilities)

    print(f"{label} Accuracy: {accuracy}")
    print(f"{label} AUC: {auc}")
    print(classification_report(y_test, predictions))
    return predictions, probabilities, accuracy, auc


# Logistic Regression first, then the tuned Random Forest.
log_reg_pred, log_reg_pred_prob, log_reg_accuracy, log_reg_auc = _evaluate_on_test(
    log_reg, "Logistic Regression"
)
rf_pred, rf_pred_prob, rf_accuracy, rf_auc = _evaluate_on_test(
    best_rf, "Random Forest"
)

# Step 10: Confusion Matrix Visualization
# Heatmap of the Random Forest confusion matrix on the test set, with the
# integer counts annotated in each cell.
conf_matrix = confusion_matrix(y_test, rf_pred)
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='coolwarm')
heatmap_ax.set_title('Random Forest Confusion Matrix')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
plt.show()

# scorecardpy performance plots on the test set: Random Forest first,
# then Logistic Regression.
performance_inputs = (
    (rf_pred_prob, "Random Forest - Test Set"),
    (log_reg_pred_prob, "Logistic Regression - Test Set"),
)
for predicted_scores, plot_title in performance_inputs:
    sc.perf_eva(y_test, predicted_scores, title=plot_title)

# Step 11: Save Models for Task 5
# joblib.dump does not create missing directories. Make sure the output
# folder exists first, so a fresh checkout does not fail with
# FileNotFoundError after all the training above has already run.
import os

os.makedirs('../models', exist_ok=True)

# Save Logistic Regression model
joblib.dump(log_reg, '../models/logistic_regression_model.pkl')

# Save the tuned Random Forest model (best estimator from the grid search)
joblib.dump(best_rf, '../models/random_forest_model.pkl')

# Save the WoE binning used in this pipeline so that new data can be
# transformed with the same bins.
joblib.dump(woe_bins, '../models/woe_bins.pkl')

# Optionally, also persist the WoE-transformed train and test sets as
# (features, target) tuples.
joblib.dump((X_train, y_train), '../models/train_data.pkl')
joblib.dump((X_test, y_test), '../models/test_data.pkl')

print("Models and data have been saved successfully!")
