# Importing necessary libraries


In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightgbm import LGBMClassifier
from pandas.plotting import parallel_coordinates
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

warnings.filterwarnings('ignore')

# Loading dataset 

The dataset for this competition (both train and test) was generated from a deep learning model trained on the Loan Prediction dataset. Feature distributions are close to, but not exactly the same, as the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.



In [None]:
# Load the competition training set. Only a short preview and the shape are
# shown so the cell output stays small instead of rendering the whole frame.
df = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
print(f"Train shape: {df.shape}")
df.head()

In [None]:
# Load the competition test set. Only a short preview and the shape are shown
# so the cell output stays small instead of rendering the whole frame.
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
print(f"Test shape: {test.shape}")
test.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
df.info()

In [None]:
# Summary statistics for every column; include='all' also covers non-numeric columns.
df.describe(include='all')

# Univariate analysis

In [None]:
# Partition the training columns by dtype: int64/float64 columns are treated
# as numerical, object columns as categorical.
numeric_dtypes = ['int64', 'float64']
num_cols = df.select_dtypes(include=numeric_dtypes).columns
cat_cols = df.select_dtypes(include=['object']).columns

print("Numerical Columns:", list(num_cols))
print("Categorical Columns:", list(cat_cols))


# Numerical column

In [None]:
# One figure per numerical column: histogram with KDE on the left,
# boxplot on the right.
for feature in num_cols:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    sns.histplot(df[feature], kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {feature}')

    sns.boxplot(x=df[feature], ax=box_ax)
    box_ax.set_title(f'Boxplot of {feature}')

    plt.show()


# Categorical column

In [None]:
# One horizontal count plot per categorical column, bars ordered by frequency.
for feature in cat_cols:
    category_order = df[feature].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(y=df[feature], order=category_order, ax=ax)
    ax.set_title(f'Count Plot of {feature}')
    ax.set_xlabel('Count')
    ax.set_ylabel(feature)
    plt.show()


# Bivariate analysis

In [None]:
# Scatterplot for every unordered pair of numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

for pos, x_name in enumerate(num_cols):
    # Only pair with columns that come later, so each pair is drawn once.
    for y_name in num_cols[pos + 1:]:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.scatterplot(x=df[x_name], y=df[y_name], ax=ax)
        ax.set_title(f'{x_name} vs {y_name}')
        plt.show()


In [None]:
# Annotated heatmap of the pairwise correlations between numerical columns
corr_matrix = df[num_cols].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap (Numerical Variables)')
plt.show()


In [None]:
# Boxplot of every numerical column grouped by every categorical column.
cat_cols = df.select_dtypes(include=['object']).columns

for cat_name in cat_cols:
    for num_name in num_cols:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.boxplot(data=df, x=cat_name, y=num_name, ax=ax)
        ax.set_title(f'{num_name} vs {cat_name}')
        plt.xticks(rotation=45)  # acts on the axes just created above
        plt.show()


In [None]:
# Categorical vs Categorical: heatmap of co-occurrence counts for each pair
for pos, row_name in enumerate(cat_cols):
    # Only pair with columns that come later, so each pair is drawn once.
    for col_name in cat_cols[pos + 1:]:
        counts = pd.crosstab(df[row_name], df[col_name])
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.heatmap(counts, annot=False, cmap='YlGnBu', ax=ax)
        ax.set_title(f'{row_name} vs {col_name}')
        ax.set_xlabel(col_name)
        ax.set_ylabel(row_name)
        plt.show()


# Multivariate analysis

In [None]:
# Pairwise scatter matrix of the numerical columns with KDE curves on the diagonal.
pair_grid = sns.pairplot(df[num_cols], diag_kind='kde')
# y > 1 lifts the title above the top row of subplots.
pair_grid.figure.suptitle('Pairplot of All Numeric Variables', y=1.02)
plt.show()


In [None]:
# Annual income against loan amount, coloured by grade/subgrade.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df, x='annual_income', y='loan_amount',
    hue='grade_subgrade', alpha=0.7, ax=ax,
)
ax.set_title('Annual Income vs Loan Amount by Grade')
plt.show()


In [None]:
# Loan amount distribution per education level, split by marital status.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(
    data=df, x='education_level', y='loan_amount',
    hue='marital_status', ax=ax,
)
ax.set_title('Loan Amount by Education Level and Marital Status')
plt.xticks(rotation=45)  # acts on the axes just created above
plt.show()


In [None]:
# Parallel-coordinates view of a random sample of rows, one line per row,
# coloured by grade/subgrade. `parallel_coordinates` is imported in the
# notebook's top import cell.
subset_cols = ['grade_subgrade', 'annual_income', 'credit_score', 'loan_amount', 'interest_rate']

# Cap the sample at the number of available rows so the cell also runs on a
# reduced frame (DataFrame.sample raises when n exceeds the row count).
sample_size = min(500, len(df))
sampled_rows = df[subset_cols].sample(sample_size, random_state=42)

plt.figure(figsize=(12, 6))
parallel_coordinates(sampled_rows, 'grade_subgrade', colormap='viridis')
plt.title('Parallel Coordinates Plot - Multivariate View')
plt.xticks(rotation=30)
plt.show()


# Making pipeline 

In [None]:
# Separate the target from the features.
y = df['loan_paid_back']

# Besides the target, drop the row identifier: `id` is used as the submission
# key for the test set and should not be fed to the model as a feature.
# errors='ignore' keeps the cell working if the training frame has no `id`
# column (presumably it does, like the test frame - confirm).
X = df.drop(columns=['loan_paid_back', 'id'], errors='ignore')



In [None]:
# Recompute the column groups on the feature matrix (target excluded) so the
# preprocessing steps are wired to feature columns only.
numeric_dtypes = ['int64', 'float64']
num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(include=['object']).columns

print("Numerical Columns:", list(num_cols))
print("Categorical Columns:", list(cat_cols))


# 🧩 Data Preprocessing Pipeline

Before training the model, a preprocessing pipeline was built to handle both **numerical** and **categorical** features efficiently.  
This ensures the dataset is clean, consistent, and properly scaled for machine learning algorithms.

### 🔢 Numerical Transformer
For all numerical columns (`num_cols`), we applied:
1. **Imputer (Median Strategy)** – Replaces missing numeric values with the median of the respective column, reducing the effect of outliers.  
2. **Scaler (StandardScaler)** – Standardizes the features to have zero mean and unit variance for better model performance.

### 🔠 Categorical Transformer
For all categorical columns (`cat_cols`), we applied:
1. **Imputer (Most Frequent Strategy)** – Fills missing values with the most frequent category in each column.  
2. **Encoder (OrdinalEncoder)** – Converts categorical features into numerical values, allowing models to interpret them efficiently.

### ⚙️ ColumnTransformer
Both transformations are combined using a `ColumnTransformer` to apply preprocessing steps automatically to their respective feature types.

```python
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])
```


In [None]:
# Preprocessing
# Numerical columns: fill missing values with the column median, then
# standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then map each category to an integer code. Categories that were not seen
# during fit (e.g. values that only occur in the test set) are encoded as -1
# instead of raising at transform time.
# `OrdinalEncoder` is imported in the notebook's top import cell.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Route each column group to its transformer; columns not listed are dropped
# (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ])


# model 

In [None]:
# End-to-end pipeline: preprocessing followed by a LightGBM classifier.
lgbm_params = {
    'n_estimators': 500,
    'learning_rate': 0.05,
    'max_depth': 12,
    'random_state': 42,
    'n_jobs': -1,
}

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LGBMClassifier(**lgbm_params)),
])

In [None]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both parts.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts

# Fit the full pipeline (preprocessing + classifier) on the training part
model.fit(X_train, y_train)

In [None]:
# Evaluate the pipeline as a classifier. Regression metrics (R², MAE, RMSE)
# on hard class labels are not meaningful here, so report ROC AUC on the
# predicted probabilities and accuracy on the predicted labels instead.
# NOTE(review): ROC AUC is believed to be the competition metric - confirm
# against the competition page.
y_pred = model.predict(X_valid)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_; assumes a binary 0/1 target - confirm.
y_proba = model.predict_proba(X_valid)[:, 1]

auc = roc_auc_score(y_valid, y_proba)
accuracy = accuracy_score(y_valid, y_pred)

print(f"ROC AUC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")


# Making prediction 

In [None]:
# Predict the probability of the positive class for every test row.
# Column 1 of predict_proba is the second class in model.classes_; assumes a
# binary 0/1 target - confirm. Probabilities (rather than hard labels) are
# what a ranking metric such as ROC AUC needs.
test_pred = model.predict_proba(test)[:, 1]

# Create submission DataFrame. The prediction column carries the name of the
# target the model was trained on (`loan_paid_back`), not `loan_default`.
# NOTE(review): verify the expected header against sample_submission.csv.
submission = pd.DataFrame({
    'id': test['id'],
    'loan_paid_back': test_pred
})

# Preview first 5 rows
print(submission.head())

# Save to CSV
submission.to_csv('submission.csv', index=False)
print("✅ submission.csv saved successfully")


In [None]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
# `joblib` is imported in the notebook's top import cell.
joblib.dump(model, 'loan_default_model.pkl')


# If you found this notebook helpful, please consider giving it an upvote! 🙏
