In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, cross_val_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Notebook-wide presentation settings: plot style, warning filter, pandas display.

# Apply seaborn's style hook.
# NOTE(review): no style name is passed, so this may leave the matplotlib
# defaults as they are - confirm whether a named style was intended.
sns.set_style()

# Silence every warning category for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Lift all pandas display limits so printed frames are never truncated.
# Passing None to each option means "unlimited" (or auto-detect for the width).
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)


In [None]:
# Read the prepared dataset from disk. Despite the "X_" prefix, this frame still
# carries the 'RiskScore' target column, which is split off in a later cell.
X_processed = pd.read_csv(filepath_or_buffer='saved/feature_engineered_data.csv')


In [None]:
# Preview the first five rows (rendered as the cell's output).
X_processed.head(n=5)

In [None]:
# f_classif stays imported so any other cell that references it keeps working.
from sklearn.feature_selection import f_classif, f_regression

# (disabled) optional removal of the raw categorical / binned columns:
# X_processed = X_processed.drop(columns=['AgeBin', 'CreditScoreBin', 'EmploymentStatus', 'MaritalStatus', 'HomeOwnershipStatus', 'EducationLevel', 'LoanPurpose'])

# Separate the predictors from the target column.
X = X_processed.drop(columns=['RiskScore'])
y = X_processed['RiskScore']

# Univariate relevance of every predictor with respect to RiskScore.
# This notebook fits a regressor to RiskScore and reports MAE / R^2, i.e. the
# target is treated as continuous. f_classif (ANOVA) would treat every distinct
# score as its own class, so the regression F-test is used instead.
f_scores, p_values = f_regression(X, y)

# One row per predictor: F statistic and its p-value.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'F-Score': f_scores,
    'P-Value': p_values
})

# Strongest univariate relationship first (higher F-Score = stronger linear association).
feature_importance = feature_importance.sort_values(by='F-Score', ascending=False)

print(feature_importance)

In [None]:

import json

# Hold out 10% of the rows for evaluation. The split is seeded so that the
# reported metrics and the persisted model can be reproduced after a kernel
# restart (the model below is seeded with the same value).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.1, random_state=42
)
print(len(X_train), len(X_test))

# Sanity checks on the training split: missing values ...
print("NaN in X_train:", X_train.isna().sum().sum())
print("NaN in y_train:", y_train.isna().sum())

# ... and infinities. Features and target are reported separately; previously
# the line labelled "X" actually inspected y_train. Only numeric columns are
# scanned because np.isinf is undefined for object/bool-mixed arrays.
numeric_features = X_train.select_dtypes(include="number")
print("Infinite values in X_train:", np.isinf(numeric_features.to_numpy()).sum())
print("Infinite values in y_train:", np.isinf(y_train.to_numpy()).sum())

# Persist the ordered list of training column names as JSON.
with open('saved/df_train_encoded.json', 'w') as f:
    json.dump(X_train.columns.tolist(), f)

In [None]:
from sklearn.feature_selection import f_regression

# No resampling happens here: the "_rus" names are plain aliases of the
# training split and are what the following cells consume.
X_train_rus, y_train_rus = (X_train, y_train)

# Univariate F-test computed on the training rows only. RiskScore is modelled
# as a continuous target (a regressor is fitted to it further down), so the
# regression F-test replaces the classification ANOVA (f_classif), which would
# treat every distinct score as a separate class.
f_scores, p_values = f_regression(X_train_rus, y_train_rus)

# One row per predictor; the intermediate name `yo` is kept in case other
# cells refer to it.
yo = pd.DataFrame({'Feature': X_train_rus.columns, 'F-Score': f_scores, 'P-Value': p_values})

# Strongest univariate relationship first.
feature_importance = yo.sort_values(by='F-Score', ascending=False)

print(feature_importance)

In [None]:
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd

# Fit a PCA that keeps every component. Leaving n_components unset retains
# min(n_samples, n_features) components, which equals the number of columns in
# the usual case and avoids an error when there are fewer rows than columns.
pca = PCA()
pca.fit(X_train_rus)

# Share of total variance captured by each principal component, in percent.
explained_variance = pca.explained_variance_ratio_ * 100

# Principal components are linear combinations of ALL input columns, so
# component i does not correspond to the i-th feature. Label the rows by
# component rank instead of by original column name.
component_labels = [f"PC{rank}" for rank in range(1, pca.n_components_ + 1)]
variance_df = pd.DataFrame({
    'Component': component_labels,
    'Explained Variance (%)': explained_variance,
})
# No sorting needed: PCA already reports components in descending order of
# explained variance.

# print(variance_df)


In [None]:
# Print the first training row as plain text.
print(X_train_rus.head(n=1))

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Final XGBoost regressor. The hyperparameters are gathered in one mapping so
# they can be read at a glance.
xgb_params = dict(
    max_depth=9,
    learning_rate=0.1,
    n_estimators=200,
    gamma=1,
    min_child_weight=3,
    random_state=42,  # fixed seed for the booster
)
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train_rus, y_train_rus)

# Predict on the held-out split produced by train_test_split.
y_pred_xgb = xgb.predict(X_test)

# Regression error metrics on the held-out split.
mae = mean_absolute_error(y_test, y_pred_xgb)
mse = mean_squared_error(y_test, y_pred_xgb)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_xgb)

# Report all four metrics, one per line, to four decimals.
print(
    f"Mean Absolute Error (MAE): {mae:.4f}",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)


In [None]:
import matplotlib.pyplot as plt
import xgboost

# Bar chart of the fitted model's feature importance, measured by 'gain'
# ('weight' and 'cover' are the other accepted importance types).
fig, ax = plt.subplots(figsize=(10, 8))
xgboost.plot_importance(
    xgb,
    ax=ax,
    importance_type='gain',
)
plt.show()

# Raw importance vector exposed by the fitted estimator.
print(xgb.feature_importances_)

import shap

# SHAP explanation sketch - currently disabled, kept for reference.
# explainer = shap.TreeExplainer(xgb)
# shap_values = explainer.shap_values(X)
#
# Global importance (summary plot):
# shap.summary_plot(shap_values, X)
#
# Local explanation for a single prediction (first row of X):
# shap.initjs()
# shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:])

In [None]:
import pickle

# Serialize the fitted XGBoost model object (`xgb`) to disk with pickle.
# Pickle files should only ever be loaded back from a trusted location.
with open('saved/xgb_model.pkl', mode='wb') as model_file:
    pickle.dump(xgb, model_file)