<a href="https://colab.research.google.com/github/SaiTejaPortfolioDS/SaiTejaPortfolioDS/blob/main/Project_2_Applied_ML(iteration_1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import average_precision_score, precision_recall_curve, confusion_matrix
from scipy.stats import uniform, randint
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# ======================
# 1. DATA LOADING
# ======================
def load_data(filepath, chunksize=None):
    """Read a CSV file into a single DataFrame.

    Args:
        filepath: Path passed to ``pd.read_csv``.
        chunksize: Rows per chunk. Chunked reading is used only when this is
            truthy and ``filepath`` contains the substring ``'csv'``;
            otherwise the file is read in one call.

    Returns:
        A ``pd.DataFrame`` with the file contents.

    Raises:
        Exception: Whatever the read raised; the error is printed first and
            then re-raised unchanged.
    """
    try:
        stream_in_chunks = chunksize and ('csv' in filepath)
        if not stream_in_chunks:
            return pd.read_csv(filepath)
        # Materialise every chunk from the reader, then stitch them row-wise.
        reader = pd.read_csv(filepath, chunksize=chunksize)
        pieces = [piece for piece in reader]
        return pd.concat(pieces, axis=0)
    except Exception as e:
        print(f"Error loading {filepath}: {str(e)}")
        raise

print("Loading data...")
# The training file is read in 10,000-row chunks; the holdout in one call.
data = load_data(filepath='SBA_loans_project_2.csv', chunksize=10_000)
holdout_data = load_data(filepath='SBA_loans_project_2_holdout_students_valid.csv')

# ======================
# 2. DATA CLEANING (Original Working Version)
# ======================
def clean_data(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, each applied only to columns that are present:

    * drop the bookkeeping columns ``'index'`` and ``'Unnamed: 0'``;
    * convert ``DisbursementGross``, ``GrAppv`` and ``SBA_Appv`` to
      ``float32``;
    * fill missing ``RevLineCr`` / ``LowDoc`` values with ``'MISSING'`` and
      cast them to ``category``;
    * parse ``ApprovalDate`` / ``DisbursementDate`` as datetimes, turning
      unparseable values into ``NaT``.

    Args:
        df: Raw loans DataFrame.

    Returns:
        The cleaned ``pd.DataFrame``.
    """
    df = df.copy()

    # Drop unnecessary columns (silently skipping any that are absent)
    df = df.drop(columns=['index', 'Unnamed: 0'], errors='ignore')

    # Clean monetary columns
    money_cols = ['DisbursementGross', 'GrAppv', 'SBA_Appv']
    for col in money_cols:
        if col not in df.columns:
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already numeric: a string round-trip would drop the sign and
            # garble scientific notation ('1e+16' -> '116'), so just cast.
            df[col] = df[col].astype('float32')
            continue
        digits_only = df[col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        # Missing or non-numeric entries end up as '' after stripping; coerce
        # them to NaN instead of letting the float conversion raise.
        df[col] = pd.to_numeric(digits_only, errors='coerce').astype('float32')

    # Handle categorical missing values
    cat_cols = ['RevLineCr', 'LowDoc']
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].fillna('MISSING').astype('category')

    # Convert dates
    date_cols = ['ApprovalDate', 'DisbursementDate']
    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    return df

print("Cleaning data...")
# Training and holdout frames go through the same cleaning routine.
data = clean_data(df=data)
holdout_data = clean_data(df=holdout_data)

# ======================
# 3. TARGET PROCESSING (Fixed Version)
# ======================
def prepare_target(df):
    """Extract a binary target from ``df['MIS_Status']``.

    Labels are compared after upper-casing, removing all whitespace and
    stripping a trailing ``.0``; so ``'P I F'`` matches ``'PIF'`` and a
    float-coded ``1.0`` matches ``'1'``. Rows whose label does not map to 0
    or 1 (including missing values) are reported and dropped.

    Args:
        df: DataFrame containing a ``MIS_Status`` column.

    Returns:
        A tuple ``(y, rows)``: ``y`` is an ``int8`` Series of 0/1 labels and
        ``rows`` is ``df`` restricted to the index labels kept in ``y``.

    Raises:
        ValueError: If ``MIS_Status`` is not a column of ``df``.
    """
    if 'MIS_Status' not in df.columns:
        raise ValueError("Target column 'MIS_Status' not found")

    # Convert to string first to ensure str accessor works, then normalise:
    # without this a float column stringifies to '0.0'/'1.0' and every row
    # would be treated as unmapped.
    y_series = (df['MIS_Status'].astype(str).str.upper()
                .str.replace(r'\s+', '', regex=True)
                .str.replace(r'\.0+$', '', regex=True))

    # Handle all possible cases
    mapping = {
        'PIF': 0, '0': 0,
        'PCHGB': 1, '1': 1,
        'CHGOFF': 1,  # Additional variant seen in some datasets
        'EXEMPT': np.nan, 'NAN': np.nan, '': np.nan
    }

    y = y_series.map(mapping).astype('float32')

    # Check for unmapped values
    if y.isna().any():
        unmapped = y_series[y.isna()].unique()
        print(f"Warning: Unmapped values in target: {unmapped}")
        print("These will be dropped from the dataset")
        y = y.dropna()

    return y.astype('int8'), df.loc[y.index]

print("Preparing target...")
# prepare_target returns (labels, filtered rows) in that order.
y, data = prepare_target(df=data)

# ======================
# 4. FEATURE ENCODING (UPDATED)
# ======================
def encode_features(train_df, test_df=None, cat_threshold=100):
    """Encode categorical columns, deriving every encoding from ``train_df``.

    Columns of dtype ``category`` or ``object`` (except ``MIS_Status``) are
    handled one by one:

    * more than ``cat_threshold`` distinct training values: the column is
      replaced by ``<col>_freq``, the relative frequency of each value in the
      training data (values unseen in training get 0 in the test frame);
    * otherwise: the column is one-hot encoded with the first level dropped.
      The test column is cast to the training categories first, so both
      frames drop the same reference level and produce the same dummy
      columns; unseen test values become all-zero rows.

    Neither input frame is modified.

    Args:
        train_df: Frame the encodings are learned from.
        test_df: Optional frame encoded with the training encodings. Columns
            it lacks are zero-filled and its columns are reordered to match
            the encoded training frame.
        cat_threshold: Cardinality above which frequency encoding is used.

    Returns:
        The encoded training frame, or ``(train, test)`` when ``test_df`` is
        given.
    """
    train_df = train_df.copy()
    if test_df is not None:
        test_df = test_df.copy()

    # Identify categoricals (including string columns)
    cat_cols = [col for col in train_df.select_dtypes(include=['category', 'object']).columns
                if col != 'MIS_Status']
    print(f"Found {len(cat_cols)} categorical features")

    for col in cat_cols:
        # A categorical column may legitimately be absent from the test frame;
        # its encoded columns are then zero-filled by the final alignment.
        in_test = test_df is not None and col in test_df.columns

        if train_df[col].nunique() > cat_threshold:
            # Frequency encoding for high-cardinality columns. Mapping is done
            # on plain objects so the result is always a float column rather
            # than a categorical one.
            train_values = train_df[col].astype(object)
            freq_map = train_values.value_counts(normalize=True)
            train_df[f"{col}_freq"] = train_values.map(freq_map).astype('float64')
            train_df = train_df.drop(columns=[col])
            if in_test:
                test_freq = test_df[col].astype(object).map(freq_map)
                test_df[f"{col}_freq"] = test_freq.astype('float64').fillna(0)
                test_df = test_df.drop(columns=[col])
        else:
            # One-hot for low-cardinality columns, with the training
            # categories imposed on the test column.
            train_df[col] = train_df[col].astype('category')
            train_dtype = train_df[col].dtype
            train_df = pd.get_dummies(train_df, columns=[col], prefix=col, drop_first=True)
            if in_test:
                test_df[col] = test_df[col].astype(train_dtype)
                test_df = pd.get_dummies(test_df, columns=[col], prefix=col, drop_first=True)

    if test_df is None:
        return train_df

    # Align test features: add missing columns as 0, drop extras, match order.
    test_df = test_df.reindex(columns=train_df.columns, fill_value=0)
    return train_df, test_df

print("Encoding features...")
# Encode the holdout together with the training frame. Encoding it on its own
# would derive frequency maps and the one-hot/frequency decision from the
# holdout's own value distribution, so its columns would not mean the same
# thing as the ones the model is trained on.
X, holdout_encoded = encode_features(data.drop(columns=['MIS_Status']), holdout_data)

# ======================
# 5. FEATURE ENGINEERING
# ======================
def create_features(df):
    """Add derived columns to a copy of ``df``.

    Each feature is added only when all of its source columns exist:

    * ``DisbursementRatio``  = ``DisbursementGross / GrAppv``
    * ``SBAGuaranteeRatio``  = ``SBA_Appv / GrAppv``
    * ``ProcessingDays``     = whole days from ``ApprovalDate`` to
      ``DisbursementDate``
    * ``LogEmployees``       = ``log1p(NoEmp)``

    A ``GrAppv`` of 0 is replaced by ``1e-6`` before dividing.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the extra columns appended.
    """
    enriched = df.copy()
    available = set(enriched.columns)

    # Financial ratios
    if {'DisbursementGross', 'GrAppv'} <= available:
        approved = enriched['GrAppv'].replace(0, 1e-6)
        enriched['DisbursementRatio'] = enriched['DisbursementGross'] / approved

    if {'GrAppv', 'SBA_Appv'} <= available:
        approved = enriched['GrAppv'].replace(0, 1e-6)
        enriched['SBAGuaranteeRatio'] = enriched['SBA_Appv'] / approved

    # Temporal features
    if {'ApprovalDate', 'DisbursementDate'} <= available:
        elapsed = enriched['DisbursementDate'] - enriched['ApprovalDate']
        enriched['ProcessingDays'] = elapsed.dt.days

    # Business features
    if 'NoEmp' in available:
        enriched['LogEmployees'] = np.log1p(enriched['NoEmp'])

    return enriched

print("Engineering features...")
# The same derived columns are added to the training and holdout frames.
X = create_features(df=X)
holdout_encoded = create_features(df=holdout_encoded)

# ======================
# 6. TRAIN-TEST SPLIT
# ======================
print("Splitting data...")
# First hold back 30% of the rows, stratified on the target ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
# ... then cut the held-back part in half: validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# ======================
# 7. MODEL TRAINING (UPDATED)
# ======================
def train_model(X_train, y_train, X_val, y_val):
    """Tune an XGBoost classifier with a randomized search and return the best.

    The search draws 15 candidates, scores them by average precision with
    3-fold cross-validation, and passes ``(X_val, y_val)`` to ``fit`` as the
    evaluation set for the classifier's early stopping.

    Side effect: any ``category`` columns still present in ``X_train`` are
    overwritten in place with their integer codes, in both ``X_train`` and
    ``X_val``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features used as ``eval_set``.
        y_val: Validation labels used as ``eval_set``.

    Returns:
        ``best_estimator_`` of the fitted ``RandomizedSearchCV``.
    """
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    search_space = dict(
        learning_rate=uniform(0.01, 0.3),
        max_depth=randint(3, 7),
        subsample=uniform(0.6, 0.4),
        colsample_bytree=uniform(0.6, 0.4),
        gamma=uniform(0, 0.3),
        reg_lambda=uniform(1, 100),
    )

    base_classifier = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',
        early_stopping_rounds=20,
        tree_method='hist',
        enable_categorical=True,  # Enable categorical support
        n_estimators=300,
    )

    tuner = RandomizedSearchCV(
        estimator=base_classifier,
        param_distributions=search_space,
        n_iter=15,
        scoring='average_precision',
        cv=3,
        n_jobs=1,
        verbose=1,
    )

    # Convert remaining categoricals to integer codes
    leftover_categoricals = X_train.select_dtypes(include=['category']).columns
    for name in leftover_categoricals:
        X_train[name] = X_train[name].cat.codes
        X_val[name] = X_val[name].cat.codes

    tuner.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=0)
    return tuner.best_estimator_

print("Training model...")
model = train_model(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val)

# ======================
# 8. EVALUATION
# ======================
def evaluate(model, X, y):
    """Print AUCPR, the F1-maximising threshold and its confusion matrix.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y: True binary labels for ``X``.

    Returns:
        A tuple ``(aucpr, threshold)``: the average-precision score and the
        probability cut-off with the highest F1 on this data.
    """
    positive_scores = model.predict_proba(X)[:, 1]

    # AUCPR
    aucpr = average_precision_score(y, positive_scores)
    print(f"AUCPR: {aucpr:.4f}")

    # Optimal threshold: F1 at every point of the precision-recall curve; the
    # small constant keeps the denominator away from zero.
    prec, rec, cutoffs = precision_recall_curve(y, positive_scores)
    f1_numerator = 2 * (prec * rec)
    f1_denominator = prec + rec + 1e-10
    f1_by_cutoff = f1_numerator / f1_denominator
    threshold = cutoffs[np.argmax(f1_by_cutoff)]

    # Confusion matrix
    predicted_labels = (positive_scores >= threshold).astype(int)
    matrix = confusion_matrix(y, predicted_labels)
    print(f"Threshold: {threshold:.4f}")
    print("Confusion Matrix:")
    print(matrix)

    return aucpr, threshold

print("\nValidation Evaluation:")
# The threshold chosen on the validation split is the one reused for scoring.
val_aucpr, threshold = evaluate(model=model, X=X_val, y=y_val)
print("\nTest Evaluation:")
test_aucpr, _ = evaluate(model=model, X=X_test, y=y_test)

# ======================
# 9. INTERPRETATION
# ======================
def interpret(model, X_train, X_test, top_n=15):
    """Print the top feature importances and save a SHAP summary plot.

    Args:
        model: Fitted model exposing ``feature_importances_``.
        X_train: Training features; supplies the feature names and the rows
            sampled (at most 1000) for SHAP.
        X_test: Accepted but not used by this function.
        top_n: Number of features shown in the importance table.

    Returns:
        None. The table is printed and, when SHAP succeeds, the plot is
        written to ``shap_summary.png``. Any SHAP failure (including the
        package being unavailable) is printed instead of raised.
    """
    print("\nFeature Importance:")
    table = pd.DataFrame(
        {'Feature': X_train.columns, 'Importance': model.feature_importances_}
    )
    ranked = table.sort_values('Importance', ascending=False)
    print(ranked.head(top_n).to_markdown())

    # SHAP summary (sampled)
    try:
        import shap
        n_rows = min(1000, len(X_train))
        sample = X_train.sample(n=n_rows, random_state=42)
        shap_values = shap.TreeExplainer(model).shap_values(sample)

        plt.figure(figsize=(10, 6))
        shap.summary_plot(shap_values, sample, show=False)
        plt.tight_layout()
        plt.savefig('shap_summary.png')
        plt.close()
        print("Saved SHAP summary plot to shap_summary.png")
    except Exception as e:
        print(f"Could not generate SHAP: {str(e)}")

interpret(model=model, X_train=X_train, X_test=X_test)

# ======================
# 10. HOLDOUT SCORING (UPDATED)
# ======================
def predict_holdout(model, holdout, threshold, chunk_size=5000):
    """Score the holdout frame and return labels plus class probabilities.

    The holdout is aligned to the columns of the module-level ``X_train``:
    columns it lacks are added as 0, extra columns are dropped, and the order
    is matched. The caller's ``holdout`` frame is left untouched.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        holdout: Encoded holdout features.
        threshold: Probability cut-off; rows with ``probability_1`` at or
            above it are labelled 1.
        chunk_size: Number of rows scored per ``predict_proba`` call.

    Returns:
        A DataFrame with columns ``index`` (the holdout's index labels),
        ``label``, ``probability_0`` and ``probability_1``, one row per
        holdout row and a fresh 0..n-1 index. Empty when the holdout has no
        rows.
    """
    output_columns = ['index', 'label', 'probability_0', 'probability_1']

    # Align columns on a new frame rather than adding columns to the argument.
    aligned = holdout.reindex(columns=X_train.columns, fill_value=0)

    # pd.concat refuses an empty list, so handle "nothing to score" up front.
    if len(aligned) == 0:
        return pd.DataFrame(columns=output_columns)

    # Predict in chunks
    chunks = []
    for start in range(0, len(aligned), chunk_size):
        chunk = aligned.iloc[start:start + chunk_size]
        proba = model.predict_proba(chunk)
        chunks.append(pd.DataFrame({
            'index': chunk.index,
            'label': (proba[:, 1] >= threshold).astype(int),
            'probability_0': proba[:, 0],
            'probability_1': proba[:, 1]
        }))

    # Combine chunks (without repeating per-chunk row labels) and ensure the
    # correct column order
    result = pd.concat(chunks, ignore_index=True)
    return result[output_columns]

print("\nScoring holdout data...")
holdout_preds = predict_holdout(model=model, holdout=holdout_encoded, threshold=threshold)
print(holdout_preds.head())

# Save predictions
predictions_path = 'holdout_predictions.csv'
holdout_preds.to_csv(predictions_path, index=False)
print("Predictions saved to holdout_predictions.csv")

# ======================
# KAGGLE SUBMISSION FORMAT
# ======================
def create_kaggle_submission(model, holdout, filename='submission.csv', chunk_size=5000):
    """Write an ``ID`` / ``probability_1`` submission file for the holdout.

    The holdout is aligned to the columns of the module-level ``X_train``:
    columns it lacks are added as 0, extra columns are dropped, and the order
    is matched. The caller's ``holdout`` frame is left untouched.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        holdout: Encoded holdout features.
        filename: Path of the CSV file to write.
        chunk_size: Number of rows scored per ``predict_proba`` call.

    Returns:
        The submission DataFrame that was written: ``ID`` holds the holdout's
        index labels and ``probability_1`` the positive-class probability.
        It has a fresh 0..n-1 index and is empty when the holdout has no rows.
    """
    # Align columns on a new frame rather than adding columns to the argument.
    aligned = holdout.reindex(columns=X_train.columns, fill_value=0)

    # Predict in chunks
    chunks = []
    for start in range(0, len(aligned), chunk_size):
        chunk = aligned.iloc[start:start + chunk_size]
        proba = model.predict_proba(chunk)[:, 1]  # Only probability_1
        chunks.append(pd.DataFrame({
            'ID': chunk.index,
            'probability_1': proba
        }))

    # Combine and save; pd.concat refuses an empty list, so an empty holdout
    # yields a header-only file instead of an error.
    if chunks:
        submission = pd.concat(chunks, ignore_index=True)
    else:
        submission = pd.DataFrame(columns=['ID', 'probability_1'])
    submission.to_csv(filename, index=False)
    print(f"Kaggle submission saved to {filename}")
    return submission

# Build the Kaggle submission file and show its first rows
print("\nCreating Kaggle submission...")
kaggle_submission = create_kaggle_submission(model=model, holdout=holdout_encoded)
print(kaggle_submission.head())

# Persist the fitted model to disk
import joblib
model_path = 'sba_loan_model.pkl'
joblib.dump(model, model_path)
print("Model saved to sba_loan_model.pkl")

Loading data...
Cleaning data...
Preparing target...
Encoding features...
Found 6 categorical features
Found 6 categorical features
Engineering features...
Splitting data...
Training model...
Fitting 3 folds for each of 15 candidates, totalling 45 fits

Validation Evaluation:
AUCPR: 0.5848
Threshold: 0.2765
Confusion Matrix:
[[86785 12117]
 [ 8234 12767]]

Test Evaluation:
AUCPR: 0.5841
Threshold: 0.2693
Confusion Matrix:
[[86015 12888]
 [ 7992 13009]]

Feature Importance:
|     | Feature           |   Importance |
|----:|:------------------|-------------:|
|   7 | UrbanRural        |    0.0758303 |
|   6 | FranchiseCode     |    0.0612762 |
| 112 | BankState_VA      |    0.0521293 |
|  68 | BankState_CA      |    0.0515097 |
| 146 | SBAGuaranteeRatio |    0.0335086 |
| 134 | RevLineCr_T       |    0.0310501 |
| 135 | RevLineCr_Y       |    0.029907  |
| 101 | BankState_OH      |    0.0297523 |
|  93 | BankState_NC      |    0.0272023 |
| 106 | BankState_RI      |    0.0228571 |
|  38 