# Analyze Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score
# from xgboost import XGBClassifier


# Read the credit-card dataset from the data directory.
# The path is relative to the notebook's working directory; adjust it if the
# CSV file has a different name or location.
file_path = "../data/Credit Card Defaulter Prediction.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to sanity-check the load.
df.head(n=5)


In [1]:
# Remove the ID column from the frame.
df = df.drop(columns='ID')

# Print column dtypes / non-null counts, then show summary statistics.
df.info()
df.describe()

NameError: name 'df' is not defined

## Handle missing values

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# If the check above had reported missing values, they could be imputed as follows.
# For numerical features (BILL_AMT*, PAY_AMT*), replace missing values with the column median:
# df.fillna(df.median(), inplace=True)

# For categorical features (SEX, EDUCATION, MARRIAGE), replace missing values with the mode (most frequent value):
# df.fillna(df.mode().iloc[0], inplace=True)

## Standardize Categorical Variables

In [None]:
# Encode the categorical text columns as integer codes.
# Series.map leaves NaN for any label that is not a key of the mapping.
sex_codes = {'M': 1, 'F': 0}  # 1 = Male, 0 = Female
marriage_codes = {'Married': 1, 'Single': 2, 'Others': 3}
education_codes = {
    'Graduate School': 1,
    'University': 2,
    'High School': 3,
    'Others': 4,
}

for column, codes in (
    ('SEX', sex_codes),
    ('MARRIAGE', marriage_codes),
    ('EDUCATION', education_codes),
):
    df[column] = df[column].map(codes)

## Duplicate Check

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Discard duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates()

# Re-count duplicates to confirm none remain.
df.duplicated().sum()

In [None]:
# Columns kept for the analysis, assembled group by group.
profile_cols = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
# The repayment-status columns are PAY_0 and PAY_2..PAY_6 (there is no PAY_1 in this list).
repayment_status_cols = ['PAY_0'] + [f'PAY_{month}' for month in range(2, 7)]
bill_amount_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
payment_amount_cols = [f'PAY_AMT{month}' for month in range(1, 7)]

important_features = (
    profile_cols
    + repayment_status_cols
    + bill_amount_cols
    + payment_amount_cols
)

# Restrict the frame to the selected columns, in the order listed above.
df = df[important_features]
df.head()

## Outliers Check

In [None]:

# Side-by-side box plots of the continuous columns, used to spot extreme values.
continuous_cols = [
    'LIMIT_BAL', 'AGE',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[continuous_cols], ax=ax)
plt.xticks(rotation=45)  # tilt the column names so they do not overlap
ax.set_title("Boxplot of Continuous Features")
plt.show()

In [None]:
# Capping extreme outliers

# Keep only rows whose AGE is below 100 (removes unrealistic ages).
# .copy() makes the filtered result an independent frame, so the column
# assignments below and in later cells write to `df` itself and do not raise
# pandas' SettingWithCopyWarning.
df = df[df['AGE'] < 100].copy()

# Cap PAY_AMT1 at its 99th percentile: values above the cap are set to the cap.
# Only PAY_AMT1 is capped here; the other PAY_AMT columns are left unchanged.
pay_amt1_cap = df['PAY_AMT1'].quantile(0.99)
df['PAY_AMT1'] = df['PAY_AMT1'].clip(upper=pay_amt1_cap)

# Understanding Dataset Structure

In [None]:
# Dimensions of the frame as (number of rows, number of columns).
df.shape

In [None]:
# Column labels currently present in the frame.
df.columns


In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Visualize Correlation Between Features
# Pairwise correlation of the frame's columns, drawn as an annotated heatmap.
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

### Distribution of Credit Limits (LIMIT_BAL)

In [None]:
# Histogram (50 bins) of LIMIT_BAL with a kernel-density curve overlaid.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['LIMIT_BAL'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Credit Limits (LIMIT_BAL)")
ax.set_xlabel("Credit Limit")
ax.set_ylabel("Number of Customers")
plt.show()

### Payment Behavior Over Time (PAY_0 to PAY_6)

In [None]:
# One box per repayment-status column, to compare their distributions.
repayment_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df[repayment_status_cols], ax=ax)
ax.set_title("Repayment Status Over Time")
ax.set_xlabel("Months (PAY_0 = Most Recent)")
ax.set_ylabel("Repayment Status (-1 = On Time, 1+ = Delayed)")
plt.show()

## Feature Data Transformation

In [None]:
# Create a "Maximum Delay" Feature
'''If a customer has delayed payments for multiple months, we can create a single max delay score'''

# Row-wise maximum over the six repayment-status columns.
repayment_status_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
df["Max_Delay"] = df[repayment_status_cols].max(axis="columns")

In [None]:
# Create a Debt Utilization Ratio feature.
# Measures how much of the credit limit a customer is using:
# the BILL_AMT1 amount divided by the credit limit (LIMIT_BAL).
df["Debt_Utilization"] = df["BILL_AMT1"] / df["LIMIT_BAL"]

In [None]:
# Create an average payment ratio.
# Ratio of the mean payment amount to the mean billed amount over the six
# PAY_AMT / BILL_AMT columns, intended to identify customers who only pay minimums.
payment_amount_cols = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
bill_amount_cols = ["BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]

avg_payment_ratio = df[payment_amount_cols].mean(axis=1) / df[bill_amount_cols].mean(axis=1)

# A mean bill of zero gives +/-inf (non-zero payments) or NaN (0/0). fillna only
# catches NaN, so turn the infinities into NaN first; this also keeps them out
# of the median used for imputation.
avg_payment_ratio = avg_payment_ratio.replace([np.inf, -np.inf], np.nan)

# Assign the filled result back to the column. Calling fillna(inplace=True) on
# df["..."] is chained in-place assignment, which pandas deprecates and which
# does not update df under copy-on-write.
df["Avg_Payment_Ratio"] = avg_payment_ratio.fillna(avg_payment_ratio.median())

In [None]:
# Create a High Risk flag.
# A customer is flagged 1 when at least 3 of the six repayment-status columns
# hold a value >= 2, and 0 otherwise.
# NOTE(review): the original description says "3 or more months of delayed
# payments", while the threshold counts values >= 2 - presumably "delayed by two
# or more months"; confirm against the dataset's PAY_* coding.
repayment_status_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]

# Vectorised comparison + row sum; gives the same flags as a per-row lambda
# without running Python code for every row.
months_at_or_above_threshold = (df[repayment_status_cols] >= 2).sum(axis=1)
df["High_Risk"] = (months_at_or_above_threshold >= 3).astype(int)

## Train Model

In [None]:
# Train-Test Split

# Predictor columns and the High_Risk target (whether a customer is high risk).
feature_cols = ["LIMIT_BAL", "AGE", "Max_Delay", "Debt_Utilization", "Avg_Payment_Ratio"]
X = df[feature_cols]
y = df["High_Risk"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# The feature matrices can contain infinite values (or values too large for the
# dtype), which the estimator rejects. Treat +/-inf as missing, then impute.
# Results are reassigned rather than modified in place, because X_train / X_test
# are derived from another frame and in-place edits on them can raise
# SettingWithCopyWarning.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Impute both splits with medians computed on the TRAINING split only, so the
# test rows get exactly the same transformation as the training rows and no
# statistic of the held-out data is used during preprocessing.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [None]:
# Train a logistic regression model

# max_iter is raised from the library default (100) to 1000, matching the
# estimator used for the grid search later in this notebook; with the default
# limit the solver may stop before converging on these unscaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out split and print the per-class metrics report.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# importance feature check

# Label each fitted coefficient with its feature name and list them from the
# most positive to the most negative.
coefficients = model.coef_[0]
feature_importance = pd.Series(data=coefficients, index=X.columns)
feature_importance = feature_importance.sort_values(ascending=False)
print(feature_importance)

## Model Optimization Exploration

In [None]:
# Hyperparameter candidates explored by the search.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],  # inverse regularization strength (smaller = stronger)
    solver=['liblinear', 'lbfgs'],  # solvers to compare
)

# Base estimator whose hyperparameters are tuned.
log_model = LogisticRegression(max_iter=1000)

# 5-fold cross-validated grid search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=log_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning combination.
print("Best Logistic Regression Parameters:", grid_search.best_params_)

# Keep the best estimator found by the search (refit by GridSearchCV on the training data).
best_log_model = grid_search.best_estimator_