# Credit Risk Prediction - Case 9

## 1. Introduction
This notebook implements a machine learning model to predict credit risk based on client data. 
It follows the specific requirements for **Case 9**, which mandates the use of **Gradient Boosting Classifier** with hyperparameter tuning via **GridSearchCV**.

### Dataset
The dataset contains client information including financial and personal indicators to predict credit approval risk.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Apply seaborn's white-grid theme to every plot drawn after this cell
sns.set_theme(style="whitegrid")

## 2. Data Loading

In [None]:
# Read the training and test splits from the data directory next to this notebook's folder
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Report (rows, columns) for both frames, then preview the first training rows
print(f"Train shape: {train_df.shape}", f"Test shape: {test_df.shape}", sep="\n")
train_df.head()

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Class balance of the target, with bars ordered from most to least frequent
risk_order = train_df['Credit_Risk'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train_df, y='Credit_Risk', order=risk_order, ax=ax)
ax.set_title('Distribution of Credit Risk')
plt.show()

In [None]:
# One histogram per numerical feature of the training data
numerical_cols = [
    'Age', 'Income', 'Credit_Amount', 'Loan_Duration',
    'Debt_to_Income', 'Credit_Score', 'Num_Credits', 'Savings_Account_Balance',
]
train_df.hist(column=numerical_cols, bins=15, figsize=(15, 10))
plt.gcf().suptitle('Distribution of Numerical Features')
plt.show()

## 4. Data Preprocessing

In [None]:
# Separate predictors from the target; the 'ID' column is dropped from both feature frames
X = train_df.drop(['ID', 'Credit_Risk'], axis=1)
y = train_df['Credit_Risk']
X_test_raw = test_df.drop(['ID'], axis=1)

# Encode Target: map the Credit_Risk labels to integer codes.
# `le` is kept so predictions can be decoded back to the original labels later.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Define Preprocessing Pipeline
numerical_cols = ['Age', 'Income', 'Credit_Amount', 'Loan_Duration', 'Debt_to_Income', 'Credit_Score', 'Num_Credits', 'Savings_Account_Balance']
categorical_cols = ['Gender', 'Employment_Status', 'Education_Level', 'Marital_Status', 'Housing_Type', 'Loan_Purpose']

# Numerical features: fill missing values with the median, then standardize
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fit are ignored instead of raising
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split Data: hold out 20% for validation. Stratifying on the encoded target keeps
# the class proportions the same in both splits, so the validation metrics are not
# distorted by a split that under- or over-samples a risk class.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Apply Preprocessing: fit on the training split only, then reuse the fitted
# transformers on the validation and test data to avoid leaking their statistics
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test_raw)

print("Preprocessing complete.")

## 5. Modeling (Case 9)
**Algorithm**: Gradient Boosting Classifier

**Hyperparameters**:
- `n_estimators`: [50, 100, 200]
- `learning_rate`: [0.01, 0.1, 0.2]
- `max_depth`: [3, 5, 7]

In [None]:
# Base estimator; the fixed seed keeps repeated runs comparable
gbc = GradientBoostingClassifier(random_state=42)

# Search space: 3 x 3 x 3 = 27 hyperparameter combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation on all available cores
grid_search = GridSearchCV(
    estimator=gbc,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
).fit(X_train_processed, y_train)

# Keep the winning configuration and the estimator refit with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

## 6. Evaluation

In [None]:
# Score the tuned model on the held-out validation split
y_pred = best_model.predict(X_val_processed)
acc = accuracy_score(y_val, y_pred)

print(f"Validation Accuracy: {acc:.4f}")
print("\nClassification Report:")
# LabelEncoder assigns the codes 0..n_classes-1, so passing them explicitly keeps
# `target_names` aligned with the labels even if a class happens to be absent from
# both y_val and y_pred (otherwise classification_report raises on the mismatch).
print(classification_report(
    y_val,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
))

## 7. Submission

In [None]:
# Predict on the preprocessed test set and map the integer codes back to the original labels
predictions = best_model.predict(X_test_processed)
predictions_decoded = le.inverse_transform(predictions)

# A single path variable feeds both the write and the message, so the reported
# location always matches where the file was actually written
submission_path = '../submission.csv'
submission = pd.DataFrame({'ID': test_df['ID'], 'Credit_Risk': predictions_decoded})
submission.to_csv(submission_path, index=False)
print(f"Submission saved to {submission_path}")

## 8. Credit Risk Prediction for a New Client

In [None]:
# Single-client scoring helper
def predict_credit_risk(client_data, model, preprocessor, label_encoder):
    """Predict the credit risk label for one client.

    Args:
        client_data: Mapping of feature name to value for a single client.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        preprocessor: Fitted transformer exposing ``transform``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.

    Returns:
        A ``(risk_label, prediction_proba)`` tuple: the decoded label and the
        first row returned by ``model.predict_proba``.
    """
    # Wrap the record in a one-row frame and run it through the preprocessor
    features = preprocessor.transform(pd.DataFrame([client_data]))

    # Only one row was scored, so take the first entry of each result
    predicted_code = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    # Translate the predicted code back into its label
    decoded_label = label_encoder.inverse_transform([predicted_code])[0]

    return decoded_label, class_probabilities

# Illustrative client record covering the numerical and categorical feature columns
sample_client = {
    # Numerical indicators
    'Age': 35, 'Income': 50000, 'Credit_Amount': 10000, 'Loan_Duration': 12,
    'Debt_to_Income': 0.3, 'Credit_Score': 650, 'Num_Credits': 1,
    'Savings_Account_Balance': 5000,
    # Categorical indicators
    'Gender': 'Male', 'Employment_Status': 'Employed', 'Education_Level': 'Bachelor',
    'Marital_Status': 'Married', 'Housing_Type': 'Own', 'Loan_Purpose': 'Home',
}

print("Sample prediction for a new client:")
try:
    risk, probabilities = predict_credit_risk(sample_client, best_model, preprocessor, le)
    print(f"Predicted Credit Risk: {risk}")
    print(f"Prediction Probabilities: {probabilities}")

    # Choose the advice text for the predicted label; anything other than
    # 'Low' or 'Medium' is treated as high risk
    if risk == 'Low':
        advice = "  OK Credit can be issued - low risk"
    elif risk == 'Medium':
        advice = "  ? Credit can be issued with caution - medium risk"
    else:
        advice = "  X Deny credit - high risk"

    print("Recommendation:")
    print(advice)

except Exception as exc:
    # Demo cell: report the failure instead of stopping the notebook
    print(f"Error during prediction: {exc}")
    print("Note: This might fail if the sample client data format doesn't match the training data.")