Title: Popular Classification Algorithms

Random Forest

Task 1: Predict employee attrition based on job satisfaction and salary.

In [None]:
# Write your code here

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Generate Sample Data
# Synthetic stand-in for a real HR extract. With actual data you would load
# it instead, e.g.: df = pd.read_csv('hr_data.csv')

np.random.seed(42)  # fixed seed -> the same dataset on every run

num_employees = 1000

# Features
# NOTE: the order of the random draws below determines the generated values;
# keep it unchanged if the dataset has to stay reproducible.
job_satisfaction = np.random.randint(1, 6, size=num_employees)  # 1 (low) .. 5 (high)
monthly_salary = np.clip(
    np.random.normal(loc=5000, scale=1500, size=num_employees),
    2500,
    10000,
).astype(int)  # bounded to 2500..10000 and truncated to whole units
years_at_company = np.random.randint(1, 10, size=num_employees)  # 1 .. 9 years
performance_rating = np.random.randint(1, 4, size=num_employees)  # 1 (low) .. 3 (high)
overtime = np.random.choice([0, 1], size=num_employees, p=[0.7, 0.3])  # 1 = yes
promotion_last_5years = np.random.choice([0, 1], size=num_employees, p=[0.8, 0.2])  # 1 = yes

# Simulate attrition (target variable)
# Start from a baseline and apply one signed term per driver, in a fixed order.
attrition_probability = 0.15 - job_satisfaction * 0.03  # baseline, lowered by satisfaction
attrition_probability -= (monthly_salary / 10000) * 0.05  # higher salary -> lower risk
attrition_probability += years_at_company * 0.01  # longer tenure -> slightly higher risk
attrition_probability -= performance_rating * 0.02  # better rating -> lower risk
attrition_probability += overtime * 0.10  # overtime -> higher risk
attrition_probability -= promotion_last_5years * 0.10  # recent promotion -> lower risk
attrition_probability += np.random.normal(0, 0.03, size=num_employees)  # noise

# Bound the probabilities to [0.02, 0.50] so attrition stays the minority class
attrition_probability = attrition_probability.clip(0.02, 0.50)

# Bernoulli draw per employee: 1 = left the company, 0 = stayed
attrition = np.less(np.random.rand(num_employees), attrition_probability).astype(int)

# Assemble everything into one DataFrame (target column last)
data = pd.DataFrame(dict(
    job_satisfaction=job_satisfaction,
    monthly_salary=monthly_salary,
    years_at_company=years_at_company,
    performance_rating=performance_rating,
    overtime=overtime,
    promotion_last_5years=promotion_last_5years,
    attrition=attrition,
))

print("Sample of the generated dataset:")
print(data.head())
print("\nAttrition distribution:")
print(data['attrition'].value_counts())
print("-" * 50)

# 2. Define Features (X) and Target (y)
y = data['attrition']
X = data.drop(columns='attrition')

# 3. Split Data into Training and Testing Sets
# Stratifying on y keeps the attrition share the same in both partitions,
# which matters because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print("-" * 50)

# --- Hyperparameter Tuning with Cross-Validation (GridSearchCV) ---
print("Starting Hyperparameter Tuning for Random Forest...")

# Search space: 3 * 4 * 3 * 3 * 2 = 216 candidate settings.
param_grid = dict(
    n_estimators=[50, 100, 200],      # trees in the forest
    max_depth=[5, 10, 15, None],      # depth cap per tree (None = unlimited)
    min_samples_leaf=[1, 5, 10],      # smallest allowed leaf
    min_samples_split=[2, 5, 10],     # smallest node that may still be split
    criterion=['gini', 'entropy'],    # split-quality measure
)

# Stratified folds keep the class ratio stable inside every fold.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by ROC AUC on each fold.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=cv_strategy,
    n_jobs=-1,   # use every available CPU core
    verbose=1,   # report progress
)

# Run the search on the training partition only.
grid_search.fit(X_train, y_train)

print("\nHyperparameter Tuning Complete!")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best ROC AUC score (cross-validated): {grid_search.best_score_:.4f}")
print("-" * 50)

# 4. Train the Best Random Forest Model
# best_estimator_ is already fitted by the search (GridSearchCV refits the
# winning configuration by default), so no further fit call is made here.
model = grid_search.best_estimator_
print(f"Using best model with parameters: {model.get_params()}")

print("Best Random Forest Model Trained Successfully!")
print("-" * 50)

# 5. Make Predictions on the Test Set
y_pred = model.predict(X_test)                    # hard 0/1 labels
y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1 (attrition)

print("Sample Predictions on Test Set:")
# Build the comparison table on y_test so its original row index is kept.
results = pd.DataFrame({'Actual Attrition': y_test})
results['Predicted Attrition'] = y_pred
results['Attrition Probability'] = y_pred_proba
print(results.head(10))  # first 10 test rows
print("-" * 50)

# 6. Evaluate Model Performance
# Label-based metrics use y_pred; ROC AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(
    y_test,
    y_pred,
    target_names=['No Attrition', 'Attrition'],
)

print(f"Model Accuracy on Test Set: {accuracy:.4f}")
print(f"Model ROC AUC Score on Test Set: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print("-" * 50)

# Visualize Confusion Matrix
# Rows are the true classes, columns the predicted classes.
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted No Attrition', 'Predicted Attrition'],
    yticklabels=['Actual No Attrition', 'Actual Attrition'],
)
plt.title('Confusion Matrix for Employee Attrition Prediction')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot ROC Curve
# The dashed diagonal is the reference line of a random classifier.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label=f'ROC curve (area = {roc_auc:.2f})',
)
plt.plot(
    [0, 1],
    [0, 1],
    color='navy',
    lw=2,
    linestyle='--',
    label='Random Classifier',
)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
print("-" * 50)

# 7. Feature Importance
# Importances reported by the fitted forest, labelled with the feature names
# and ordered from most to least important.
print("Feature Importances (how much each feature contributes to prediction):")
feature_importances = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(feature_importances)

plt.figure(figsize=(10, 6))
sns.barplot(
    x=feature_importances.values,
    y=feature_importances.index,
    palette='viridis',
)
plt.title('Feature Importances for Attrition Prediction')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
print("-" * 50)

# 8. Predict for a New Employee with Input Validation
def predict_new_employee_attrition(job_satisfaction, monthly_salary, years_at_company,
                                   performance_rating, overtime, promotion_last_5years,
                                   model, feature_columns):
    """Score a single employee with a fitted classifier.

    Inputs outside their expected range only produce a printed warning; the
    prediction is still made with the values exactly as given.

    Args:
        job_satisfaction: Expected in 1-5.
        monthly_salary: Expected to be non-negative.
        years_at_company: Expected to be non-negative.
        performance_rating: Expected in 1-3.
        overtime: Expected to be 0 (No) or 1 (Yes).
        promotion_last_5years: Expected to be 0 (No) or 1 (Yes).
        model: Object exposing ``predict`` and ``predict_proba``.
        feature_columns: Column labels for the one-row frame passed to the
            model, in the same order as the six feature arguments above.

    Returns:
        ``(status, probability)`` where ``status`` is "HIGH Risk of Attrition"
        when the predicted label equals 1 and "LOW Risk of Attrition"
        otherwise, and ``probability`` is the second entry of the first
        ``predict_proba`` row (class 1).
    """
    def warn_unless(ok, message):
        # Print the warning only when the check failed.
        if not ok:
            print(message)

    # Basic sanity checks on the logical ranges (warnings only, never fatal).
    warn_unless(1 <= job_satisfaction <= 5,
                f"Warning: Job Satisfaction {job_satisfaction} is outside the typical range (1-5).")
    warn_unless(monthly_salary >= 0,
                f"Warning: Monthly Salary {monthly_salary} should be non-negative.")
    warn_unless(years_at_company >= 0,
                f"Warning: Years at Company {years_at_company} should be non-negative.")
    warn_unless(1 <= performance_rating <= 3,
                f"Warning: Performance Rating {performance_rating} is outside the typical range (1-3).")
    warn_unless(overtime in (0, 1),
                f"Warning: Overtime {overtime} should be 0 (No) or 1 (Yes).")
    warn_unless(promotion_last_5years in (0, 1),
                f"Warning: Promotion Last 5 Years {promotion_last_5years} should be 0 (No) or 1 (Yes).")

    # One-row frame labelled with the caller-supplied column names.
    row = [
        job_satisfaction, monthly_salary, years_at_company,
        performance_rating, overtime, promotion_last_5years,
    ]
    new_employee_data = pd.DataFrame([row], columns=feature_columns)

    prediction = model.predict(new_employee_data)[0]
    probability = model.predict_proba(new_employee_data)[0][1]  # class 1 = attrition

    if prediction == 1:
        status = "HIGH Risk of Attrition"
    else:
        status = "LOW Risk of Attrition"
    return status, probability

# Try the prediction helper on a few hand-made employees
print("\nPredicting for new employees with input validation:")

# Example 1: High risk employee (low satisfaction, high overtime, no promotion)
status1, prob1 = predict_new_employee_attrition(
    job_satisfaction=2,
    monthly_salary=4000,
    years_at_company=3,
    performance_rating=2,
    overtime=1,
    promotion_last_5years=0,
    model=model,
    feature_columns=X.columns,
)
print(f"Employee 1 - Predicted Status: {status1}, Probability of Attrition: {prob1:.4f}")

# Example 2: Low risk employee (high satisfaction, good salary, promotion)
status2, prob2 = predict_new_employee_attrition(
    job_satisfaction=5,
    monthly_salary=8000,
    years_at_company=5,
    performance_rating=3,
    overtime=0,
    promotion_last_5years=1,
    model=model,
    feature_columns=X.columns,
)
print(f"Employee 2 - Predicted Status: {status2}, Probability of Attrition: {prob2:.4f}")

# Example 3: Borderline case (moderate satisfaction, average salary, no overtime)
status3, prob3 = predict_new_employee_attrition(
    job_satisfaction=3,
    monthly_salary=5500,
    years_at_company=4,
    performance_rating=2,
    overtime=0,
    promotion_last_5years=0,
    model=model,
    feature_columns=X.columns,
)
print(f"Employee 3 - Predicted Status: {status3}, Probability of Attrition: {prob3:.4f}")

# Example 4: Deliberately out-of-range values; the helper prints a warning for
# each failed check and still returns a prediction.
status4, prob4 = predict_new_employee_attrition(
    job_satisfaction=6,
    monthly_salary=-1000,
    years_at_company=12,
    performance_rating=0,
    overtime=2,
    promotion_last_5years=-1,
    model=model,
    feature_columns=X.columns,
)
print(f"Employee 4 - Predicted Status: {status4}, Probability of Attrition: {prob4:.4f}")

print("-" * 50)

# Closing notes, emitted one per line.
print(
    "\n--- Summary of Random Forest Features and Best Practices ---",
    "1. **Ensemble Learning**: Random Forest is an ensemble method, building multiple decision trees and combining their predictions. This reduces overfitting and improves generalization compared to a single decision tree.",
    "2. **Robustness to Overfitting**: By averaging multiple trees, Random Forest is less prone to overfitting than individual decision trees.",
    "3. **Feature Importance**: It provides a reliable measure of feature importance, helping identify which factors are most influential in predicting attrition.",
    "4. **Hyperparameter Tuning**: `GridSearchCV` with `StratifiedKFold` is used for systematic hyperparameter tuning, finding optimal `n_estimators`, `max_depth`, `min_samples_leaf`, etc., to get the best model performance.",
    "5. **Evaluation Metrics**: ROC AUC is used as the primary scoring metric during tuning, as it's more robust for imbalanced datasets typical in attrition prediction.",
    "6. **Input Validation**: Added basic input validation to the prediction function to ensure new data conforms to expected ranges, making the model more robust in a practical setting.",
    "7. **Scalability**: Random Forests are computationally efficient and can handle large datasets well.",
    sep="\n",
)


Sample of the generated dataset:
   job_satisfaction  monthly_salary  years_at_company  performance_rating  \
0                 4            5164                 6                   2   
1                 5            6088                 7                   3   
2                 3            5721                 4                   3   
3                 5            5335                 7                   3   
4                 5            3814                 9                   2   

   overtime  promotion_last_5years  attrition  
0         0                      0          0  
1         0                      1          1  
2         1                      1          0  
3         0                      0          0  
4         0                      0          0  

Attrition distribution:
attrition
0    926
1     74
Name: count, dtype: int64
--------------------------------------------------
Training set size: 700 samples
Testing set size: 300 samples
-------------------------

Task 2: Classify types of wine based on chemical analysis.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Generate Sample Data
# In a real scenario, you would load a dataset containing loan application details.
# Example: df = pd.read_csv('loan_applications.csv')

np.random.seed(42) # for reproducibility

num_applicants = 1200 # You can temporarily reduce this for faster testing, e.g., 500

# Features (one value per applicant; every draw below uses size=num_applicants)
credit_score = np.random.normal(loc=680, scale=70, size=num_applicants).astype(int)
credit_score = np.clip(credit_score, 300, 850) # Ensure scores are within valid FICO range

annual_income_k = np.random.normal(loc=80, scale=40, size=num_applicants) # in thousands
annual_income_k = np.clip(annual_income_k, 30, 250) # Min 30k, Max 250k

loan_amount_k = np.random.normal(loc=250, scale=100, size=num_applicants) # in thousands
loan_amount_k = np.clip(loan_amount_k, 50, 700) # Min 50k, Max 700k

employment_status = np.random.choice(['Employed', 'Self-Employed', 'Unemployed', 'Retired'],
                                     size=num_applicants, p=[0.7, 0.15, 0.1, 0.05])

property_value_k = np.random.normal(loc=300, scale=150, size=num_applicants) # in thousands
property_value_k = np.clip(property_value_k, 100, 1000) # Min 100k, Max 1000k

# FIX: size= was missing here, so a single scalar was drawn and broadcast to
# every applicant, turning the DTI column into a constant with no predictive
# signal. Drawing one value per applicant also shifts the random stream used
# by the noise and approval draws below, so the generated labels differ from
# those produced by the earlier (buggy) version.
debt_to_income_ratio = np.random.normal(loc=0.36, scale=0.1, size=num_applicants)
debt_to_income_ratio = np.clip(debt_to_income_ratio, 0.05, 0.6) # Realistic DTI range

# Simulate loan approval (target variable)
# Logic: Approval more likely with high credit score, high income, low loan amount relative to property value,
# stable employment, and low DTI.
approval_probability = (
    0.6 # Baseline approval probability
    + (credit_score / 1000) * 0.2 # Higher credit score, higher chance
    + (annual_income_k / 200) * 0.15 # Higher income, higher chance
    - (loan_amount_k / property_value_k) * 0.2 # Higher loan/property ratio, lower chance
    - (debt_to_income_ratio * 0.4) # Higher DTI, lower chance
    + np.where(employment_status == 'Employed', 0.1, 0) # Employed has higher chance
    + np.where(employment_status == 'Self-Employed', 0.05, 0) # Self-employed slightly higher
    - np.where(employment_status == 'Unemployed', 0.3, 0) # Unemployed much lower
    + np.random.normal(0, 0.05, size=num_applicants) # Add some noise
)

# Keep every probability inside [0.05, 0.95] so no outcome is certain
approval_probability = np.clip(approval_probability, 0.05, 0.95)

# Generate 'loan_approved' (1 for approved, 0 for denied)
loan_approved = (np.random.rand(num_applicants) < approval_probability).astype(int)

# Create a Pandas DataFrame (target column last)
data = pd.DataFrame({
    'credit_score': credit_score,
    'annual_income_k': annual_income_k,
    'loan_amount_k': loan_amount_k,
    'employment_status': employment_status,
    'property_value_k': property_value_k,
    'debt_to_income_ratio': debt_to_income_ratio,
    'loan_approved': loan_approved
})

print("Sample of the generated dataset:")
print(data.head())
print("\nLoan Approval distribution:")
print(data['loan_approved'].value_counts())
print("-" * 50)

# 2. Preprocess Data: One-hot encode categorical features
# drop_first=False keeps one indicator column per employment status.
data_encoded = pd.get_dummies(data, columns=['employment_status'], drop_first=False)

# Define Features (X) and Target (y)
y = data_encoded['loan_approved']
X = data_encoded.drop(columns='loan_approved')

# Column names after encoding, kept for reference
feature_names = list(X.columns)

print("\nSample of the encoded dataset (features only):")
print(X.head())
print("-" * 50)

# 3. Split Data into Training and Testing Sets
# Stratifying on y keeps the approved/denied ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print("-" * 50)

# --- Hyperparameter Tuning with Cross-Validation (GridSearchCV) ---
print("Starting Hyperparameter Tuning for Random Forest...")

# Search space: 3 * 4 * 3 * 3 * 2 = 216 candidates, i.e. 1080 fits with 5 folds.
# SUGGESTION: if the search takes too long, shrink the grid, for example to
#   n_estimators=[100, 200], max_depth=[5, 10],
#   min_samples_leaf=[1, 5], criterion=['gini']
param_grid = dict(
    n_estimators=[100, 200, 300],     # trees in the forest
    max_depth=[5, 10, 15, None],      # depth cap per tree (None = unlimited)
    min_samples_leaf=[1, 5, 10],      # smallest allowed leaf
    min_samples_split=[2, 5, 10],     # smallest node that may still be split
    criterion=['gini', 'entropy'],    # split-quality measure
)

# Stratified folds keep the class ratio stable inside every fold.
# SUGGESTION: if it is still too slow, lower n_splits (e.g. n_splits=3).
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search over param_grid, scored by ROC AUC on each fold.
# n_jobs=-1 uses every CPU core; if memory becomes a problem, try 1 or 2.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=1,  # report progress
)

# The long-running step: this is the call that gets interrupted when the
# search is too large for the available time.
grid_search.fit(X_train, y_train)

print("\nHyperparameter Tuning Complete!")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best ROC AUC score (cross-validated): {grid_search.best_score_:.4f}")
print("-" * 50)

# 4. Train the Best Random Forest Model
# best_estimator_ is already fitted by the search, so no extra fit is needed.
model = grid_search.best_estimator_
print(f"Using best model with parameters: {model.get_params()}")

print("Best Random Forest Model Trained Successfully!")
print("-" * 50)

# 5. Make Predictions on the Test Set
y_pred = model.predict(X_test)                    # hard 0/1 labels
y_pred_proba = model.predict_proba(X_test)[:, 1]  # probability of class 1 (approved)

print("Sample Predictions on Test Set:")
# Build the comparison table on y_test so its original row index is kept.
results = pd.DataFrame({'Actual Approval': y_test})
results['Predicted Approval'] = y_pred
results['Approval Probability'] = y_pred_proba
print(results.head(10))  # first 10 test rows
print("-" * 50)

# 6. Evaluate Model Performance
# Label-based metrics use y_pred; ROC AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(
    y_test,
    y_pred,
    target_names=['Denied', 'Approved'],
)

print(f"Model Accuracy on Test Set: {accuracy:.4f}")
print(f"Model ROC AUC Score on Test Set: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print("-" * 50)

# Visualize Confusion Matrix
# Rows are the true classes, columns the predicted classes.
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Denied', 'Predicted Approved'],
    yticklabels=['Actual Denied', 'Actual Approved'],
)
plt.title('Confusion Matrix for Housing Loan Approval Prediction')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

# Plot ROC Curve
# The dashed diagonal is the reference line of a random classifier.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label=f'ROC curve (area = {roc_auc:.2f})',
)
plt.plot(
    [0, 1],
    [0, 1],
    color='navy',
    lw=2,
    linestyle='--',
    label='Random Classifier',
)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
print("-" * 50)

# 7. Feature Importance
# Importances reported by the fitted forest, labelled with the encoded feature
# names and ordered from most to least important.
print("Feature Importances (how much each feature contributes to prediction):")
feature_importances = pd.Series(
    model.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)
print(feature_importances)

plt.figure(figsize=(12, 7))
sns.barplot(
    x=feature_importances.values,
    y=feature_importances.index,
    palette='crest',
)
plt.title('Feature Importances for Housing Loan Approval Prediction')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
print("-" * 50)

# 8. Predict for a New Applicant with Input Validation
def predict_loan_approval(credit_score, annual_income_k, loan_amount_k, employment_status,
                          property_value_k, debt_to_income_ratio,
                          model, feature_columns):
    """Predict approval for a single loan applicant with a fitted classifier.

    Every input is checked first and a warning is printed for each value
    outside its expected range. Out-of-range numeric values are still passed
    to the model. An unknown employment status cannot be encoded, so in that
    case no prediction is made.

    Args:
        credit_score: Expected in 300-850.
        annual_income_k: Expected to be non-negative.
        loan_amount_k: Expected to be non-negative.
        employment_status: One of 'Employed', 'Self-Employed', 'Unemployed',
            'Retired'.
        property_value_k: Expected to be non-negative.
        debt_to_income_ratio: Expected in 0-1.
        model: Object exposing ``predict`` and ``predict_proba``.
        feature_columns: Column labels (after one-hot encoding) that the
            model was trained on; the applicant row is reindexed to exactly
            these columns.

    Returns:
        ``("Invalid Input", 0.0)`` when ``employment_status`` is not one of
        the accepted values; otherwise ``(status, probability)`` where
        ``status`` is "APPROVED" when the predicted label equals 1 and
        "DENIED" otherwise, and ``probability`` is the second entry of the
        first ``predict_proba`` row (class 1).
    """
    # Input Validation (basic checks for logical ranges)
    if not (300 <= credit_score <= 850):
        print(f"Warning: Credit Score {credit_score} is outside the typical range (300-850).")
    if not (annual_income_k >= 0):
        print(f"Warning: Annual Income {annual_income_k} should be non-negative.")
    if not (loan_amount_k >= 0):
        print(f"Warning: Loan Amount {loan_amount_k} should be non-negative.")
    status_is_valid = employment_status in ['Employed', 'Self-Employed', 'Unemployed', 'Retired']
    if not status_is_valid:
        print(f"Warning: Invalid Employment Status '{employment_status}'. Must be one of: Employed, Self-Employed, Unemployed, Retired.")
    if not (property_value_k >= 0):
        print(f"Warning: Property Value {property_value_k} should be non-negative.")
    if not (0 <= debt_to_income_ratio <= 1):
        print(f"Warning: Debt-to-Income Ratio {debt_to_income_ratio} is outside the typical range (0-1).")

    # Bail out only after ALL checks have run, so an invalid employment status
    # no longer hides the property-value and DTI warnings.
    if not status_is_valid:
        return "Invalid Input", 0.0

    # Create a DataFrame for the new applicant
    new_applicant_df = pd.DataFrame([{
        'credit_score': credit_score,
        'annual_income_k': annual_income_k,
        'loan_amount_k': loan_amount_k,
        'employment_status': employment_status,
        'property_value_k': property_value_k,
        'debt_to_income_ratio': debt_to_income_ratio
    }])

    # One-hot encode the new applicant's categorical features
    new_applicant_encoded = pd.get_dummies(new_applicant_df, columns=['employment_status'], drop_first=False)

    # A single row only yields the indicator column for its own status.
    # Reindexing adds the missing indicator columns as 0 and puts every
    # column in the order given by feature_columns.
    new_applicant_processed = new_applicant_encoded.reindex(columns=feature_columns, fill_value=0)

    # Predict
    prediction = model.predict(new_applicant_processed)[0]
    probability = model.predict_proba(new_applicant_processed)[0][1] # Probability of approval (class 1)

    status = "APPROVED" if prediction == 1 else "DENIED"
    return status, probability

# Try the prediction helper on a few hand-made applicants
print("\nPredicting for new loan applicants with input validation:")

# Example 1: High chance of approval
status1, prob1 = predict_loan_approval(
    credit_score=750,
    annual_income_k=120,
    loan_amount_k=200,
    employment_status='Employed',
    property_value_k=400,
    debt_to_income_ratio=0.25,
    model=model,
    feature_columns=X.columns,
)
print(f"Applicant 1 - Predicted Status: {status1}, Probability of Approval: {prob1:.4f}")

# Example 2: High chance of denial
status2, prob2 = predict_loan_approval(
    credit_score=550,
    annual_income_k=40,
    loan_amount_k=300,
    employment_status='Unemployed',
    property_value_k=350,
    debt_to_income_ratio=0.55,
    model=model,
    feature_columns=X.columns,
)
print(f"Applicant 2 - Predicted Status: {status2}, Probability of Approval: {prob2:.4f}")

# Example 3: Borderline case
status3, prob3 = predict_loan_approval(
    credit_score=680,
    annual_income_k=70,
    loan_amount_k=280,
    employment_status='Self-Employed',
    property_value_k=300,
    debt_to_income_ratio=0.40,
    model=model,
    feature_columns=X.columns,
)
print(f"Applicant 3 - Predicted Status: {status3}, Probability of Approval: {prob3:.4f}")

# Example 4: Deliberately problematic input ('Student' is not an accepted
# employment status, and several numeric values are out of range)
status4, prob4 = predict_loan_approval(
    credit_score=900,
    annual_income_k=-50,
    loan_amount_k=1000,
    employment_status='Student',
    property_value_k=0,
    debt_to_income_ratio=1.5,
    model=model,
    feature_columns=X.columns,
)
print(f"Applicant 4 - Predicted Status: {status4}, Probability of Approval: {prob4:.4f}")

print("-" * 50)

# Closing notes, emitted one per line.
print(
    "\n--- Summary of Housing Loan Approval Prediction with Random Forest ---",
    "1. **Data Generation**: Simulated realistic financial and personal data for loan applicants, including categorical employment status.",
    "2. **Preprocessing**: Used one-hot encoding for the `employment_status` categorical feature, which is essential for Random Forest.",
    "3. **Hyperparameter Tuning**: Applied `GridSearchCV` with `StratifiedKFold` to find the optimal Random Forest parameters, focusing on `roc_auc` for robust evaluation in binary classification.",
    "   - *Note on KeyboardInterrupt*: If this step takes too long, reduce the complexity of the `param_grid` or `n_splits` as suggested in the comments.",
    "4. **Model Evaluation**: Provided comprehensive evaluation metrics including accuracy, confusion matrix, classification report, and ROC curve, which are critical for assessing model performance in a sensitive application like loan approval.",
    "5. **Feature Importance**: Identified the most influential factors (e.g., credit score, DTI, income) in determining loan approval, offering valuable insights for lenders.",
    "6. **Prediction Function with Validation**: Developed a `predict_loan_approval` function that includes input validation and correctly handles one-hot encoding for new, unseen data, making it more robust for deployment.",
    sep="\n",
)

Original Wine Dataset Info:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  p

KeyboardInterrupt: 

Task 3: Predict housing loan approval based on financial and personal data.

In [None]:
# Write your code here

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Generate Sample Data
# In a real scenario, you would load a dataset containing loan application details.
# Example: df = pd.read_csv('loan_applications.csv')
#
# Every feature below is drawn as a 1-D array with one entry per applicant, so that
# the DataFrame built at the end has genuinely varying values in every column.

np.random.seed(42) # for reproducibility

num_applicants = 1200

# Features
credit_score = np.random.normal(loc=680, scale=70, size=num_applicants).astype(int)
credit_score = np.clip(credit_score, 300, 850) # Ensure scores are within valid FICO range

annual_income_k = np.random.normal(loc=80, scale=40, size=num_applicants) # in thousands
annual_income_k = np.clip(annual_income_k, 30, 250) # Min 30k, Max 250k

loan_amount_k = np.random.normal(loc=250, scale=100, size=num_applicants) # in thousands
loan_amount_k = np.clip(loan_amount_k, 50, 700) # Min 50k, Max 700k

employment_status = np.random.choice(['Employed', 'Self-Employed', 'Unemployed', 'Retired'],
                                     size=num_applicants, p=[0.7, 0.15, 0.1, 0.05])

property_value_k = np.random.normal(loc=300, scale=150, size=num_applicants) # in thousands
property_value_k = np.clip(property_value_k, 100, 1000) # Min 100k, Max 1000k

# `size=num_applicants` is required here: without it np.random.normal returns a single
# scalar, which the DataFrame constructor would broadcast to every row, leaving the
# debt-to-income column constant and therefore useless as a feature.
debt_to_income_ratio = np.random.normal(loc=0.36, scale=0.1, size=num_applicants)
debt_to_income_ratio = np.clip(debt_to_income_ratio, 0.05, 0.6) # Realistic DTI range

# Simulate loan approval (target variable)
# Logic: Approval more likely with high credit score, high income, low loan amount relative to property value,
# stable employment, and low DTI.
approval_probability = (
    0.6 # Baseline approval probability
    + (credit_score / 1000) * 0.2 # Higher credit score, higher chance
    + (annual_income_k / 200) * 0.15 # Higher income, higher chance
    - (loan_amount_k / property_value_k) * 0.2 # Higher loan/property ratio, lower chance
    - (debt_to_income_ratio * 0.4) # Higher DTI, lower chance
    + np.where(employment_status == 'Employed', 0.1, 0) # Employed has higher chance
    + np.where(employment_status == 'Self-Employed', 0.05, 0) # Self-employed slightly higher
    - np.where(employment_status == 'Unemployed', 0.3, 0) # Unemployed much lower
    + np.random.normal(0, 0.05, size=num_applicants) # Add some noise
)

# Clip probabilities to [0.05, 0.95] so that no applicant is a guaranteed approval or denial
approval_probability = np.clip(approval_probability, 0.05, 0.95)

# Generate 'loan_approved' (1 for approved, 0 for denied) by comparing a uniform draw
# against each applicant's approval probability
loan_approved = (np.random.rand(num_applicants) < approval_probability).astype(int)

# Create a Pandas DataFrame
data = pd.DataFrame({
    'credit_score': credit_score,
    'annual_income_k': annual_income_k,
    'loan_amount_k': loan_amount_k,
    'employment_status': employment_status,
    'property_value_k': property_value_k,
    'debt_to_income_ratio': debt_to_income_ratio,
    'loan_approved': loan_approved
})

print("Sample of the generated dataset:")
print(data.head())
print("\nLoan Approval distribution:")
print(data['loan_approved'].value_counts())
print("-" * 50)

# 2. Preprocess Data
# Expand the categorical `employment_status` column into indicator columns,
# keeping one indicator per category (no category is dropped).
data_encoded = pd.get_dummies(
    data,
    columns=['employment_status'],
    drop_first=False,
)

# Target (y) is the approval flag; features (X) are every remaining column.
y = data_encoded['loan_approved']
X = data_encoded.drop(columns=['loan_approved'])

# Column labels after encoding, kept as a plain list for later reference
feature_names = list(X.columns)

print("\nSample of the encoded dataset (features only):")
print(X.head())
print("-" * 50)

# 3. Split Data into Training and Testing Sets
# 70/30 split; stratify=y keeps the approved/denied proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")
print("-" * 50)

# --- Hyperparameter Tuning with Cross-Validation (GridSearchCV) ---
print("Starting Hyperparameter Tuning for Random Forest...")

# Candidate values for each Random Forest hyperparameter; GridSearchCV evaluates
# every combination (3 * 4 * 3 * 3 * 2 = 216 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],      # trees in the forest
    'max_depth': [5, 10, 15, None],       # depth limit per tree (None = unlimited)
    'min_samples_leaf': [1, 5, 10],       # minimum samples allowed in a leaf
    'min_samples_split': [2, 5, 10],      # minimum samples needed to split a node
    'criterion': ['gini', 'entropy'],
}

# Shuffled, stratified 5-fold CV so each fold preserves the class proportions
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Exhaustive search scored by ROC AUC, run in parallel on all available cores
base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training portion only
grid_search.fit(X_train, y_train)

print("\nHyperparameter Tuning Complete!")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best ROC AUC score (cross-validated): {grid_search.best_score_:.4f}")
print("-" * 50)

# 4. Train the Best Random Forest Model
# No separate fit is needed: best_estimator_ is the winning configuration as
# already fitted by the grid_search.fit() call above.
model = grid_search.best_estimator_
print(f"Using best model with parameters: {model.get_params()}")

print("Best Random Forest Model Trained Successfully!")
print("-" * 50)

# 5. Make Predictions on the Test Set
# Column 1 of predict_proba is the probability of class 1 (loan approved).
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

print("Sample Predictions on Test Set:")
results = pd.DataFrame({
    'Actual Approval': y_test,
    'Predicted Approval': y_pred,
    'Approval Probability': y_pred_proba,
})
print(results.head(10))  # first 10 test rows
print("-" * 50)

# 6. Evaluate Model Performance
# Label-based metrics use the hard predictions; ROC AUC uses the probabilities.
class_labels = ['Denied', 'Approved']
roc_auc = roc_auc_score(y_test, y_pred_proba)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=class_labels)

print(f"Model Accuracy on Test Set: {accuracy:.4f}")
print(f"Model ROC AUC Score on Test Set: {roc_auc:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)
print("-" * 50)

# Visualize Confusion Matrix
# Rows are the true classes, columns the predicted classes; cells show raw counts.
cm_fig, cm_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted Denied', 'Predicted Approved'],
    yticklabels=['Actual Denied', 'Actual Approved'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix for Housing Loan Approval Prediction')
plt.show()

# Plot ROC Curve
# The dashed diagonal is the reference line of a classifier that guesses at random.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc="lower right")
plt.show()
print("-" * 50)

# 7. Feature Importance
# Pair each importance score with its column label and list them from most to
# least important.
print("Feature Importances (how much each feature contributes to prediction):")
importance_by_feature = pd.Series(model.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)
print(feature_importances)

# Horizontal bar chart: one bar per feature, in the same descending order
fi_fig, fi_ax = plt.subplots(figsize=(12, 7))
sns.barplot(
    x=feature_importances.values,
    y=feature_importances.index,
    palette='crest',
    ax=fi_ax,
)
fi_ax.set_title('Feature Importances for Housing Loan Approval Prediction')
fi_ax.set_xlabel('Importance')
fi_ax.set_ylabel('Feature')
plt.show()
print("-" * 50)

# 8. Predict for a New Applicant with Input Validation
def predict_loan_approval(credit_score, annual_income_k, loan_amount_k, employment_status,
                          property_value_k, debt_to_income_ratio,
                          model, feature_columns):
    """Predict whether a single loan application is approved.

    All inputs are validated first. Out-of-range numeric values only produce a
    printed warning and the prediction still runs; an unknown employment status
    cannot be encoded, so it aborts the prediction. Every check is run before
    the function returns, so a caller sees all problems with the input at once.

    Args:
        credit_score: Applicant's credit score; 300-850 is treated as typical.
        annual_income_k: Annual income in thousands; expected non-negative.
        loan_amount_k: Requested loan amount in thousands; expected non-negative.
        employment_status: One of 'Employed', 'Self-Employed', 'Unemployed',
            'Retired'.
        property_value_k: Property value in thousands; expected non-negative.
        debt_to_income_ratio: Debt-to-income ratio; 0-1 is treated as typical.
        model: Fitted classifier exposing `predict` and `predict_proba`.
        feature_columns: Column labels, in order, that `model` expects (the
            one-hot encoded training columns).

    Returns:
        A `(status, probability)` tuple. `status` is "APPROVED" or "DENIED"
        and `probability` is the model's probability for class 1, or
        `("Invalid Input", 0.0)` when the employment status is not recognised.
    """
    valid_statuses = ('Employed', 'Self-Employed', 'Unemployed', 'Retired')

    # Input Validation (basic checks for logical ranges).
    # The `not (x >= 0)` form also triggers the warning for NaN inputs.
    if not (300 <= credit_score <= 850):
        print(f"Warning: Credit Score {credit_score} is outside the typical range (300-850).")
    if not (annual_income_k >= 0):
        print(f"Warning: Annual Income {annual_income_k} should be non-negative.")
    if not (loan_amount_k >= 0):
        print(f"Warning: Loan Amount {loan_amount_k} should be non-negative.")
    status_is_valid = employment_status in valid_statuses
    if not status_is_valid:
        print(f"Warning: Invalid Employment Status '{employment_status}'. Must be one of: Employed, Self-Employed, Unemployed, Retired.")
    if not (property_value_k >= 0):
        print(f"Warning: Property Value {property_value_k} should be non-negative.")
    if not (0 <= debt_to_income_ratio <= 1):
        print(f"Warning: Debt-to-Income Ratio {debt_to_income_ratio} is outside the typical range (0-1).")

    # Bail out only after every check has reported: an unknown status has no
    # indicator column, so the row could not be encoded meaningfully.
    if not status_is_valid:
        return "Invalid Input", 0.0

    # Create a DataFrame for the new applicant
    new_applicant_df = pd.DataFrame([{
        'credit_score': credit_score,
        'annual_income_k': annual_income_k,
        'loan_amount_k': loan_amount_k,
        'employment_status': employment_status,
        'property_value_k': property_value_k,
        'debt_to_income_ratio': debt_to_income_ratio
    }])

    # One-hot encode the new applicant's categorical features
    new_applicant_encoded = pd.get_dummies(new_applicant_df, columns=['employment_status'], drop_first=False)

    # A single row yields only the indicator for its own status; reindex adds the
    # other indicator columns as 0 and puts everything in the order the model expects.
    new_applicant_processed = new_applicant_encoded.reindex(columns=feature_columns, fill_value=0)

    # Predict
    prediction = model.predict(new_applicant_processed)[0]
    probability = model.predict_proba(new_applicant_processed)[0][1] # Probability of approval (class 1)

    status = "APPROVED" if prediction == 1 else "DENIED"
    return status, probability

# Test with some new applicant examples
print("\nPredicting for new loan applicants with input validation:")

# The fitted model and the training column layout are the same for every call
shared_args = {'model': model, 'feature_columns': X.columns}

# Example 1: High chance of approval
applicant_1 = {
    'credit_score': 750, 'annual_income_k': 120, 'loan_amount_k': 200,
    'employment_status': 'Employed',
    'property_value_k': 400, 'debt_to_income_ratio': 0.25,
}
status1, prob1 = predict_loan_approval(**applicant_1, **shared_args)
print(f"Applicant 1 - Predicted Status: {status1}, Probability of Approval: {prob1:.4f}")

# Example 2: High chance of denial
applicant_2 = {
    'credit_score': 550, 'annual_income_k': 40, 'loan_amount_k': 300,
    'employment_status': 'Unemployed',
    'property_value_k': 350, 'debt_to_income_ratio': 0.55,
}
status2, prob2 = predict_loan_approval(**applicant_2, **shared_args)
print(f"Applicant 2 - Predicted Status: {status2}, Probability of Approval: {prob2:.4f}")

# Example 3: Borderline case
applicant_3 = {
    'credit_score': 680, 'annual_income_k': 70, 'loan_amount_k': 280,
    'employment_status': 'Self-Employed',
    'property_value_k': 300, 'debt_to_income_ratio': 0.40,
}
status3, prob3 = predict_loan_approval(**applicant_3, **shared_args)
print(f"Applicant 3 - Predicted Status: {status3}, Probability of Approval: {prob3:.4f}")

# Example 4: With problematic input (will trigger warnings); 'Student' is not a
# recognised employment status, so no prediction is made for this applicant
applicant_4 = {
    'credit_score': 900, 'annual_income_k': -50, 'loan_amount_k': 1000,
    'employment_status': 'Student',
    'property_value_k': 0, 'debt_to_income_ratio': 1.5,
}
status4, prob4 = predict_loan_approval(**applicant_4, **shared_args)
print(f"Applicant 4 - Predicted Status: {status4}, Probability of Approval: {prob4:.4f}")

print("-" * 50)

# Closing recap of the workflow above, printed one line per step.
summary_lines = (
    "\n--- Summary of Housing Loan Approval Prediction with Random Forest ---",
    "1. **Data Generation**: Simulated realistic financial and personal data for loan applicants, including categorical employment status.",
    "2. **Preprocessing**: Used one-hot encoding for the `employment_status` categorical feature, which is essential for Random Forest.",
    "3. **Hyperparameter Tuning**: Applied `GridSearchCV` with `StratifiedKFold` to find the optimal Random Forest parameters, focusing on `roc_auc` for robust evaluation in binary classification.",
    "4. **Model Evaluation**: Provided comprehensive evaluation metrics including accuracy, confusion matrix, classification report, and ROC curve, which are critical for assessing model performance in a sensitive application like loan approval.",
    "5. **Feature Importance**: Identified the most influential factors (e.g., credit score, DTI, income) in determining loan approval, offering valuable insights for lenders.",
    "6. **Prediction Function with Validation**: Developed a `predict_loan_approval` function that includes input validation and correctly handles one-hot encoding for new, unseen data, making it more robust for deployment.",
)
for summary_line in summary_lines:
    print(summary_line)