# Credit Card Limit Prediction

This notebook trains and compares several regression models (linear, polynomial, regularized, tree-based, and neural-network) to predict credit card limits from customer data.

## Dataset
Download the dataset from: https://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers
Place the `BankChurners.csv` file in the same directory as this notebook.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

# Give every figure in the notebook the same seaborn-flavoured look and colour cycle
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")


## Step 1: Load and Explore the Dataset


In [None]:
# Read the Kaggle export that is expected to sit next to this notebook
df = pd.read_csv('BankChurners.csv')

# First look at the raw table: size, column names and the leading records
print(f"Dataset shape: {df.shape}")
print("\nColumn names:", list(df.columns), sep="\n")
print("\nFirst 5 rows:", df.head(), sep="\n")


In [None]:
# Dataset info
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare here: wrapping it in print() appends a stray "None" line.
print("Dataset info:")
df.info()

# Number of missing cells in each column of the raw table
print("\nMissing values:")
print(df.isnull().sum())


## Step 2: Data Preprocessing


In [None]:
# Regression target: the credit limit column of the raw table
y = df['Credit_Limit']

# Predictor columns fed to every model below; this list also fixes the column order
features_to_use = [
    'Customer_Age',
    'Dependent_count',
    'Education_Level',
    'Income_Category',
    'Months_on_book',
    'Total_Relationship_Count',
    'Total_Trans_Amt',
    'Marital_Status',
]

# Independent copy, so the encoding steps that follow never write into `df`
X = df.loc[:, features_to_use].copy()

print("Selected features:", features_to_use, sep="\n")
print(f"\nTarget variable (Credit_Limit) range: ${y.min():,.2f} - ${y.max():,.2f}")


In [None]:
# Null counts for the chosen predictor columns
print("Missing values per column:")
print(X.isna().sum())

# Label frequencies of the text-valued predictors, to see what needs encoding
print("\nUnique values in categorical columns:")
categorical_cols = ('Education_Level', 'Income_Category', 'Marital_Status')
for col in categorical_cols:
    label_counts = X[col].value_counts()
    print(f"\n{col}:")
    print(label_counts)


In [None]:
# Handle categorical variables with ordinal encoding
#
# Every text column is encoded from the untouched raw frame `df`, not from `X`
# itself. Mapping `X` in place made this cell non-idempotent: on a second run
# the values are already integers, `.map()` turns each of them into NaN, and
# the `fillna(0)` further down silently rewrites the whole column to 0.

# Education Level encoding
education_mapping = {
    'Unknown': 0,
    'Uneducated': 1,
    'High School': 2,
    'College': 3,
    'Graduate': 4,
    'Post-Graduate': 5,
    'Doctorate': 6
}

# Income Category encoding
income_mapping = {
    'Unknown': 0,
    'Less than $40K': 1,
    '$40K - $60K': 2,
    '$60K - $80K': 3,
    '$80K - $120K': 4,
    '$120K +': 5
}

# Marital Status encoding
# NOTE(review): these labels have no natural order; the integer codes are
# arbitrary identifiers -- consider one-hot encoding if that matters.
marital_mapping = {
    'Unknown': 0,
    'Single': 1,
    'Married': 2,
    'Divorced': 3
}

categorical_mappings = {
    'Education_Level': education_mapping,
    'Income_Category': income_mapping,
    'Marital_Status': marital_mapping,
}
for col, mapping in categorical_mappings.items():
    # Assignment aligns on the index, which `X` shares with `df`
    X[col] = df[col].map(mapping)

# Labels absent from a mapping come back as NaN; report them before they are
# folded into code 0 so a typo or a new category does not pass unnoticed.
unmapped_counts = X[list(categorical_mappings)].isnull().sum()
if unmapped_counts.any():
    print("Warning: values without a mapping (they will be encoded as 0):")
    print(unmapped_counts[unmapped_counts > 0])

# Fill any NaN values that resulted from mapping
X = X.fillna(0)

print("Categorical encoding complete!")
print("\nEncoded features preview:")
print(X.head())


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that same
# transformation to both partitions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data split complete!")
print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Features: {X_train.shape[1]}")


## Step 3: Implement Multiple Regression Models

We'll implement all the regression methods from the reference table:
1. Linear Regression and Polynomial Regression
2. Ridge and Lasso Regression (with regularization)
3. Decision Tree and Random Forest Regression
4. Multi-layer Perceptron (Neural Network) Regression


In [None]:
# Import all necessary regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
# Silence every warning category for the rest of the session
warnings.filterwarnings(action='ignore')

# Registries filled by the training cells below:
#   models  -> fitted estimator (or tuple of objects) keyed by display name
#   results -> dict of test-set metrics keyed by the same display name
models, results = {}, {}


### 1. Linear Regression and Polynomial Regression


In [None]:
# 1. Linear Regression -- ordinary least squares on the standardised features
print("Training Linear Regression...")
lr_model = LinearRegression().fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
lr_mse = mean_squared_error(y_test, lr_pred)  # reused for the RMSE entry

models['Linear Regression'] = lr_model
results['Linear Regression'] = dict(
    R2=r2_score(y_test, lr_pred),
    MSE=lr_mse,
    RMSE=np.sqrt(lr_mse),
    MAE=mean_absolute_error(y_test, lr_pred),
)

print(f"Linear Regression - R²: {results['Linear Regression']['R2']:.4f}")

# 2. Polynomial Regression (degree 2) -- the same linear model fitted on the
#    degree-2 expansion of the standardised features (no constant column)
print("\nTraining Polynomial Regression (degree 2)...")
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train_scaled)
X_test_poly = poly_features.transform(X_test_scaled)

poly_model = LinearRegression().fit(X_train_poly, y_train)
poly_pred = poly_model.predict(X_test_poly)
poly_mse = mean_squared_error(y_test, poly_pred)

# The expander is stored next to the regressor because predictions need both
models['Polynomial Regression'] = (poly_model, poly_features)
results['Polynomial Regression'] = dict(
    R2=r2_score(y_test, poly_pred),
    MSE=poly_mse,
    RMSE=np.sqrt(poly_mse),
    MAE=mean_absolute_error(y_test, poly_pred),
)

print(f"Polynomial Regression - R²: {results['Polynomial Regression']['R2']:.4f}")


### 2. Ridge and Lasso Regression (with regularization)


In [None]:
# 3. Ridge Regression (L2 regularization)
print("Training Ridge Regression...")
ridge_model = Ridge(alpha=1.0, random_state=42).fit(X_train_scaled, y_train)
ridge_pred = ridge_model.predict(X_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_pred)  # reused for the RMSE entry

models['Ridge Regression'] = ridge_model
results['Ridge Regression'] = dict(
    R2=r2_score(y_test, ridge_pred),
    MSE=ridge_mse,
    RMSE=np.sqrt(ridge_mse),
    MAE=mean_absolute_error(y_test, ridge_pred),
)

print(f"Ridge Regression - R²: {results['Ridge Regression']['R2']:.4f}")

# 4. Lasso Regression (L1 regularization)
print("\nTraining Lasso Regression...")
lasso_model = Lasso(alpha=0.1, random_state=42, max_iter=1000).fit(X_train_scaled, y_train)
lasso_pred = lasso_model.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_pred)

models['Lasso Regression'] = lasso_model
results['Lasso Regression'] = dict(
    R2=r2_score(y_test, lasso_pred),
    MSE=lasso_mse,
    RMSE=np.sqrt(lasso_mse),
    MAE=mean_absolute_error(y_test, lasso_pred),
)

print(f"Lasso Regression - R²: {results['Lasso Regression']['R2']:.4f}")


### 3. Decision Tree and Random Forest Regression


In [None]:
# 5. Decision Tree Regression -- a single tree, depth capped at 10
print("Training Decision Tree Regression...")
dt_model = DecisionTreeRegressor(random_state=42, max_depth=10).fit(X_train_scaled, y_train)
dt_pred = dt_model.predict(X_test_scaled)
dt_mse = mean_squared_error(y_test, dt_pred)  # reused for the RMSE entry

models['Decision Tree'] = dt_model
results['Decision Tree'] = dict(
    R2=r2_score(y_test, dt_pred),
    MSE=dt_mse,
    RMSE=np.sqrt(dt_mse),
    MAE=mean_absolute_error(y_test, dt_pred),
)

print(f"Decision Tree - R²: {results['Decision Tree']['R2']:.4f}")

# 6. Random Forest Regression -- 100 trees with the same depth cap
print("\nTraining Random Forest Regression...")
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    max_depth=10,
).fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)
rf_mse = mean_squared_error(y_test, rf_pred)

models['Random Forest'] = rf_model
results['Random Forest'] = dict(
    R2=r2_score(y_test, rf_pred),
    MSE=rf_mse,
    RMSE=np.sqrt(rf_mse),
    MAE=mean_absolute_error(y_test, rf_pred),
)

print(f"Random Forest - R²: {results['Random Forest']['R2']:.4f}")


### 4. Multi-layer Perceptron (Neural Network) Regression


In [None]:
# 7. Multi-layer Perceptron (Neural Network) Regression
print("Training Multi-layer Perceptron Regression...")
mlp_settings = dict(
    hidden_layer_sizes=(100, 50),  # two hidden layers: 100 units, then 50
    activation='relu',
    solver='adam',
    alpha=0.001,                   # L2 penalty strength
    learning_rate='adaptive',
    max_iter=500,
    random_state=42,
)
mlp_model = MLPRegressor(**mlp_settings).fit(X_train_scaled, y_train)
mlp_pred = mlp_model.predict(X_test_scaled)
mlp_mse = mean_squared_error(y_test, mlp_pred)  # reused for the RMSE entry

models['MLP Regression'] = mlp_model
results['MLP Regression'] = dict(
    R2=r2_score(y_test, mlp_pred),
    MSE=mlp_mse,
    RMSE=np.sqrt(mlp_mse),
    MAE=mean_absolute_error(y_test, mlp_pred),
)

print(f"MLP Regression - R²: {results['MLP Regression']['R2']:.4f}")


## Step 4: Model Comparison and Evaluation


In [None]:
# One row per model, one column per metric, ordered from highest to lowest R²
results_df = pd.DataFrame(results).T.sort_values('R2', ascending=False)

banner = "=" * 50
print("Model Performance Comparison:")
print(banner)
print(results_df.round(4))

# After the sort, the first row belongs to the best-scoring model
best_model_name = results_df.index[0]
best_r2 = results_df.at[best_model_name, 'R2']

print(f"\n🏆 Best Model: {best_model_name}")
print(f"   R² Score: {best_r2:.4f}")
print(f"   RMSE: ${results_df.loc[best_model_name, 'RMSE']:,.2f}")
print(f"   MAE: ${results_df.loc[best_model_name, 'MAE']:,.2f}")


In [None]:
# Create visualizations: one bar chart per headline metric plus a summary heatmap
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
model_positions = range(len(results_df))

# 1-3. R², RMSE and MAE bar charts share one layout, so they are drawn in a loop
bar_panels = [
    (axes[0, 0], 'R2', 'R² Score Comparison', 'R² Score', 'skyblue'),
    (axes[0, 1], 'RMSE', 'RMSE Comparison', 'RMSE ($)', 'lightcoral'),
    (axes[1, 0], 'MAE', 'MAE Comparison', 'MAE ($)', 'lightgreen'),
]
for ax, metric, title, ylabel, color in bar_panels:
    ax.bar(model_positions, results_df[metric], color=color, alpha=0.7)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Models')
    ax.set_ylabel(ylabel)
    ax.set_xticks(model_positions)
    ax.set_xticklabels(results_df.index, rotation=45, ha='right')
    ax.grid(True, alpha=0.3)

# 4. Model Performance Heatmap
# The raw metrics live on very different scales (R² is a small fraction, MSE is
# in squared dollars), so plotting them on one colour scale lets MSE swamp
# everything else. Each metric is therefore min-max normalised across models,
# and the error metrics are flipped so that 1 (green) always means "best model
# for this metric" and 0 (red) means "worst".
metrics = ['R2', 'MSE', 'RMSE', 'MAE']
error_metrics = ['MSE', 'RMSE', 'MAE']  # lower is better for these
metric_values = results_df[metrics].astype(float)
col_min = metric_values.min()
col_span = metric_values.max() - col_min
# replace(0, 1.0) avoids 0/0 when every model ties on a metric
heatmap_scores = (metric_values - col_min) / col_span.replace(0, 1.0)
heatmap_scores[error_metrics] = 1.0 - heatmap_scores[error_metrics]
heatmap_scores.loc[:, col_span == 0] = 1.0  # a tie: nobody is worse than anybody
heatmap_data = heatmap_scores.values

im = axes[1, 1].imshow(heatmap_data, cmap='RdYlGn', aspect='auto', vmin=0, vmax=1)
axes[1, 1].set_title('Performance Heatmap', fontsize=14, fontweight='bold')
axes[1, 1].set_xticks(range(len(metrics)))
axes[1, 1].set_xticklabels(metrics)
axes[1, 1].set_yticks(model_positions)
axes[1, 1].set_yticklabels(results_df.index)
axes[1, 1].set_xlabel('Metrics')

# Add colorbar
plt.colorbar(im, ax=axes[1, 1], label='Relative score per metric (1 = best)')

plt.tight_layout()
plt.show()


In [None]:
# Feature Importance Analysis (for tree-based models)
# Runs only when the winning model is one of the two tree-based estimators; for
# any other winner this cell produces no output.
if best_model_name in ['Decision Tree', 'Random Forest']:
    print(f"\nFeature Importance Analysis for {best_model_name}:")
    print("=" * 50)

    # Both estimators are read through the same attribute, so a single lookup
    # replaces the former if/else whose two branches were identical.
    feature_importance = models[best_model_name].feature_importances_

    # Pair each importance with its column name and rank from most to least important
    feature_names = features_to_use
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importance
    }).sort_values('Importance', ascending=False)

    print(importance_df)

    # Plot feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(range(len(importance_df)), importance_df['Importance'], color='lightblue')
    plt.yticks(range(len(importance_df)), importance_df['Feature'])
    plt.xlabel('Feature Importance')
    plt.title(f'Feature Importance - {best_model_name}')
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.tight_layout()
    plt.show()


## Step 5: Model Predictions and Insights


In [None]:
# Re-create the winning model's test-set predictions. The polynomial entry is
# stored as a (regressor, feature expander) pair, so it needs the extra
# expansion step before predicting; every other entry is a bare estimator.
if best_model_name != 'Polynomial Regression':
    best_model = models[best_model_name]
    best_predictions = best_model.predict(X_test_scaled)
else:
    best_model, poly_features = models[best_model_name]
    X_test_best = poly_features.transform(X_test_scaled)
    best_predictions = best_model.predict(X_test_best)

# Row-by-row comparison of truth and prediction (error = actual - predicted)
actual_values = y_test.values
prediction_error = actual_values - best_predictions
comparison_df = pd.DataFrame({
    'Actual': actual_values,
    'Predicted': best_predictions,
    'Error': prediction_error,
    'Error_Percentage': (prediction_error / actual_values) * 100
})

print(f"Sample Predictions using {best_model_name}:")
print("=" * 60)
print(comparison_df.head(10).round(2))

# Summary of the signed and absolute errors over the whole test set
print(f"\nPrediction Statistics:")
print(f"Mean Absolute Error: ${comparison_df['Error'].abs().mean():,.2f}")
print(f"Mean Absolute Percentage Error: {comparison_df['Error_Percentage'].abs().mean():.2f}%")
print(f"Max Error: ${comparison_df['Error'].max():,.2f}")
print(f"Min Error: ${comparison_df['Error'].min():,.2f}")


In [None]:
# Two diagnostic panels for the winning model, drawn side by side
pred_fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left: actual vs predicted, with the dashed diagonal marking a perfect prediction
limit_range = [y_test.min(), y_test.max()]
ax_fit.scatter(y_test, best_predictions, alpha=0.6, color='blue')
ax_fit.plot(limit_range, limit_range, 'r--', lw=2)
ax_fit.set_xlabel('Actual Credit Limit ($)')
ax_fit.set_ylabel('Predicted Credit Limit ($)')
ax_fit.set_title(f'Actual vs Predicted - {best_model_name}')
ax_fit.grid(True, alpha=0.3)

# Right: residuals (actual - predicted) against the predicted value
residuals = y_test - best_predictions
ax_resid.scatter(best_predictions, residuals, alpha=0.6, color='green')
ax_resid.axhline(y=0, color='r', linestyle='--')
ax_resid.set_xlabel('Predicted Credit Limit ($)')
ax_resid.set_ylabel('Residuals ($)')
ax_resid.set_title(f'Residuals Plot - {best_model_name}')
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Step 6: Model Deployment and Usage

The best-performing model can now be used to predict credit card limits for new customers.


In [None]:
# Function to predict credit limit for new customers
def predict_credit_limit(customer_data, model_name=None):
    """
    Predict credit limit for a new customer

    Relies on objects created by earlier cells: `features_to_use`, `scaler`,
    `models` and `best_model_name`.

    Parameters:
    customer_data: dict with one entry for every name in features_to_use.
        Key order does not matter and extra keys are ignored. The three
        categorical entries are given as their text labels; a label that is
        not recognised is encoded as 0, the same code as 'Unknown'.
    model_name: specific model to use (if None, uses best model)

    Returns:
    predicted_credit_limit: float, never negative

    Raises:
    KeyError: if model_name is not a key of `models`, or if customer_data
        lacks one of the required features
    """
    if model_name is None:
        model_name = best_model_name
    if model_name not in models:
        raise KeyError(
            f"Unknown model {model_name!r}; available models: {sorted(models)}"
        )

    missing = [name for name in features_to_use if name not in customer_data]
    if missing:
        raise KeyError(f"customer_data is missing required features: {missing}")

    # Build the single-row frame in the training column order. Taking the
    # columns straight from the dict would follow the caller's key order, and
    # the scaler would then receive the features in the wrong positions.
    customer_df = pd.DataFrame([customer_data])[features_to_use].copy()

    # Apply the same preprocessing as at training time; these tables must stay
    # identical to the ones used to encode the training data.
    category_codes = {
        'Education_Level': {
            'Unknown': 0, 'Uneducated': 1, 'High School': 2, 'College': 3,
            'Graduate': 4, 'Post-Graduate': 5, 'Doctorate': 6
        },
        'Income_Category': {
            'Unknown': 0, 'Less than $40K': 1, '$40K - $60K': 2, '$60K - $80K': 3,
            '$80K - $120K': 4, '$120K +': 5
        },
        'Marital_Status': {
            'Unknown': 0, 'Single': 1, 'Married': 2, 'Divorced': 3
        },
    }
    for column, codes in category_codes.items():
        customer_df[column] = customer_df[column].map(codes)

    # Unrecognised labels (and any missing values) become 0, as during training
    customer_df = customer_df.fillna(0)

    # Scale the features
    customer_scaled = scaler.transform(customer_df)

    # Make prediction; the polynomial entry is a (regressor, expander) pair
    if model_name == 'Polynomial Regression':
        model, expander = models[model_name]
        prediction = model.predict(expander.transform(customer_scaled))[0]
    else:
        model = models[model_name]
        prediction = model.predict(customer_scaled)[0]

    # A credit limit cannot be negative; also hand back a plain Python float
    return float(max(0, prediction))

# Example usage: score one hand-written customer with the best model
print("Example: Predicting credit limit for a new customer")
print("=" * 50)

example_customer = dict(
    Customer_Age=35,
    Dependent_count=2,
    Education_Level='Graduate',
    Income_Category='$60K - $80K',
    Months_on_book=24,
    Total_Relationship_Count=4,
    Total_Trans_Amt=2000,
    Marital_Status='Married',
)

predicted_limit = predict_credit_limit(example_customer)
print(f"Predicted Credit Limit: ${predicted_limit:,.2f}")

# Remind the reader which model produced the number and how well it scored
print(f"\nUsing the best model: {best_model_name}")
print(f"Model R² Score: {best_r2:.4f}")
