In [31]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report, mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib

In [2]:
data = pd.read_csv('data_merged.csv')

In [3]:
data.head()

Unnamed: 0,FraudResult,Is_Positive_Amount_woe,ProductCategory_financial_services_woe,Total_Transaction_Amount_woe,ChannelId_ChannelId_3_woe,Transaction_Hour_woe,ProductCategory_Encoded_woe,Value_log_woe,ChannelId_Encoded_woe,Value_woe,...,Amount_woe,PricingStrategy_woe,Std_Transaction_Amount_woe,Avg_Transaction_Amount_woe,Amount_log_woe,CustomerId,Recency,Frequency,Monetary_Total,Monetary_Avg
0,0,0.484515,-1.154666,-3.891521,0.472374,-0.231181,-1.559112,-4.209236,0.454076,-3.742537,...,-3.932398,0.094399,-2.806436,-3.071208,-3.918154,CustomerId_4406,0,119,109921.75,923.712185
1,0,-2.736867,0.565446,-3.891521,-2.162969,-0.231181,0.569263,-4.171778,-2.135528,-4.183281,...,-2.736867,0.094399,-2.806436,-3.071208,-2.736867,CustomerId_4406,0,119,109921.75,923.712185
2,0,0.484515,-1.154666,-2.862343,0.472374,-0.231181,-1.559112,-4.171778,0.454076,-4.183281,...,-3.932398,0.094399,-2.806436,-3.071208,-3.918154,CustomerId_4683,81,2,1000.0,500.0
3,0,0.484515,-1.154666,-3.891521,0.472374,-0.231181,0.569263,1.143627,0.454076,0.807103,...,1.060141,0.094399,-2.806436,-3.4207,1.049062,CustomerId_988,5,38,228727.2,6019.136842
4,0,-2.736867,0.565446,-3.891521,-2.162969,-0.231181,0.569263,-4.171778,-2.135528,-4.183281,...,-2.736867,0.094399,-2.806436,-3.4207,-2.736867,CustomerId_988,5,38,228727.2,6019.136842


In [4]:
# Define features and target variable
X = data.drop(columns=['FraudResult'])
y = data['FraudResult']

# Split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [7]:
# Drop non-numeric or irrelevant columns
X_train = X_train.select_dtypes(include=[np.number])
X_test = X_test.select_dtypes(include=[np.number])

In [8]:
# Train Logistic Regression
lr_model = LogisticRegression(random_state=42, max_iter=1000)
lr_model.fit(X_train, y_train)

In [13]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

In [14]:
# Parameter grid for Logistic Regression
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [100, 500, 1000]
}

# Grid Search for Logistic Regression
lr_grid = GridSearchCV(estimator=LogisticRegression(random_state=42),
                       param_grid=lr_param_grid,
                       cv=5, scoring='roc_auc', n_jobs=-1)
lr_grid.fit(X_train, y_train)
best_lr_model = lr_grid.best_estimator_
print("Best Logistic Regression Parameters:", lr_grid.best_params_)

Best Logistic Regression Parameters: {'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'lbfgs'}


In [16]:
# Parameter distribution for Random Forest
rf_param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Random Search for Random Forest
rf_random = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                               param_distributions=rf_param_dist,
                               n_iter=50, cv=5, scoring='roc_auc', n_jobs=-1, random_state=42)
rf_random.fit(X_train, y_train)
best_rf_model = rf_random.best_estimator_
print("Best Random Forest Parameters:", rf_random.best_params_)

Best Random Forest Parameters: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None}


In [18]:
# Logistic Regression Evaluation
lr_tuned_probs = best_lr_model.predict_proba(X_test)[:, 1]
lr_tuned_preds = best_lr_model.predict(X_test)
print("Tuned Logistic Regression ROC-AUC:", roc_auc_score(y_test, lr_tuned_probs))
print("Tuned Logistic Regression Report:\n", classification_report(y_test, lr_tuned_preds))

Tuned Logistic Regression ROC-AUC: 0.7466965055474535
Tuned Logistic Regression Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19094
           1       0.55      0.15      0.24        39

    accuracy                           1.00     19133
   macro avg       0.77      0.58      0.62     19133
weighted avg       1.00      1.00      1.00     19133



In [19]:
# Random Forest Evaluation
rf_tuned_probs = best_rf_model.predict_proba(X_test)[:, 1]
rf_tuned_preds = best_rf_model.predict(X_test)
print("Tuned Random Forest ROC-AUC:", roc_auc_score(y_test, rf_tuned_probs))
print("Tuned Random Forest Report:\n", classification_report(y_test, rf_tuned_preds))

Tuned Random Forest ROC-AUC: 0.9973269895496774
Tuned Random Forest Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19094
           1       0.67      0.31      0.42        39

    accuracy                           1.00     19133
   macro avg       0.83      0.65      0.71     19133
weighted avg       1.00      1.00      1.00     19133



In [20]:
# Credit Score Calculation
def calculate_credit_score(probabilities, base_score=300, scaling_factor=550):
    return base_score + scaling_factor * (1 - probabilities)

# Generate Credit Scores
lr_credit_scores = calculate_credit_score(lr_tuned_probs)
rf_credit_scores = calculate_credit_score(rf_tuned_probs)

print("Logistic Regression Credit Scores (Example):", lr_credit_scores[:10])
print("Random Forest Credit Scores (Example):", rf_credit_scores[:10])

Logistic Regression Credit Scores (Example): [849.99999765 850.         850.         743.81552072 849.99968533
 849.9964649  849.99999998 840.05523043 849.40312716 849.99523625]
Random Forest Credit Scores (Example): [849.78       850.         850.         850.         849.8625
 850.         850.         850.         850.         849.93888889]


In [22]:
# Targets for Loan Prediction (Replace with actual data if available)
loan_amount = data['Monetary_Total']
loan_duration = data['Frequency']

# Split data for Loan Amount prediction
X_train_reg, X_test_reg, y_train_amount, y_test_amount = train_test_split(X.select_dtypes(include=[np.number]), loan_amount, test_size=0.2, random_state=42)

# Loan Amount Model
loan_amount_model = RandomForestRegressor(random_state=42, n_estimators=100)
loan_amount_model.fit(X_train_reg, y_train_amount)
predicted_loan_amount = loan_amount_model.predict(X_test_reg)

In [24]:
# Split data for Loan Duration prediction
X_train_reg, X_test_reg, y_train_duration, y_test_duration = train_test_split(X.select_dtypes(include=[np.number]), loan_duration, test_size=0.2, random_state=42)

# Loan Duration Model
loan_duration_model = RandomForestRegressor(random_state=42, n_estimators=100)
loan_duration_model.fit(X_train_reg, y_train_duration)
predicted_loan_duration = loan_duration_model.predict(X_test_reg)

In [29]:
# Evaluate Loan Predictions
print("Loan Amount MAE:", mean_absolute_error(y_test_amount, predicted_loan_amount))
print("Loan Amount RMSE:", np.sqrt(mean_squared_error(y_test_amount, predicted_loan_amount)))
print("Loan Amount R^2:", r2_score(y_test_amount, predicted_loan_amount))

print("Loan Duration MAE:", mean_absolute_error(y_test_duration, predicted_loan_duration))
print("Loan Duration RMSE:",  np.sqrt(mean_squared_error(y_test_duration, predicted_loan_duration)))
print("Loan Duration R^2:", r2_score(y_test_duration, predicted_loan_duration))

Loan Amount MAE: 46.70218370370321
Loan Amount RMSE: 3117.6539619236296
Loan Amount R^2: 0.9999999796668172
Loan Duration MAE: 0.0
Loan Duration RMSE: 0.0
Loan Duration R^2: 1.0


In [32]:
# Save models
joblib.dump(best_lr_model, 'logistic_regression_model.pkl')
joblib.dump(best_rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']