In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix


In [None]:
# -----------------------------
# 1. Load training data
# -----------------------------
# index_col=0: the first CSV column is used as the DataFrame index.
train_csv_path = '/content/drive/MyDrive/Give Me Some Credit Dataset/cs-training.csv'
train_df = pd.read_csv(train_csv_path, index_col=0)


In [None]:
# -----------------------------
# 2. Preprocess training data
# -----------------------------
# Fill every missing value with the median of its column. The imputer's output
# is wrapped in a new DataFrame that reuses the original column labels (no
# index is passed, so rows get a fresh default index).
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(train_df)
train_imputed = pd.DataFrame(imputed_values, columns=train_df.columns)


In [None]:
# Feature engineering
train_imputed['DebtRatioPerIncome'] = train_imputed['DebtRatio'] / (train_imputed['MonthlyIncome'] + 1)
train_imputed['AgeBucket'] = pd.cut(train_imputed['age'], bins=[0, 30, 50, 100], labels=[0, 1, 2])


In [None]:
# Features and target
X = train_imputed.drop('SeriousDlqin2yrs', axis=1)
y = train_imputed['SeriousDlqin2yrs']

In [None]:
# Check for missing values after imputation
if X.isnull().any().any():
    print("There are still missing values in X. Let's impute again.")
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

There are still missing values in X. Let's impute again.


In [None]:
# Report which feature columns (if any) still contain NaN
null_mask = X.isna()
missing_columns = X.columns[null_mask.any(axis=0)].tolist()
if not missing_columns:
    print("No missing values in X.")
else:
    print("These columns still have missing values:", missing_columns)
    # Per-column NaN counts for the affected columns
    print(null_mask[missing_columns].sum())

# Fallback: median-impute X once more when any NaN is present
if null_mask.any().any():
    print("There are still missing values in X. Let's impute again.")
    imputer = SimpleImputer(strategy='median')
    refilled_values = imputer.fit_transform(X)
    X = pd.DataFrame(refilled_values, columns=X.columns)

# Final verification on the (possibly re-imputed) feature matrix
missing_columns_after_imputation = X.columns[X.isna().any(axis=0)].tolist()
if not missing_columns_after_imputation:
    print("No missing values in X after re-imputation.")
else:
    print("Still missing values in the following columns after imputation:", missing_columns_after_imputation)


No missing values in X.
No missing values in X after re-imputation.


In [None]:
# Show the dtype of every feature column
print("Data types of X:")
print(X.dtypes)

# Collect the columns whose dtype is not numeric
non_numeric_frame = X.select_dtypes(exclude=['number'])
non_numeric_columns = list(non_numeric_frame.columns)
if not non_numeric_columns:
    print("All columns in X are numeric.")
else:
    print("These columns are non-numeric:", non_numeric_columns)

# Any non-numeric column reported above would need to be encoded before modelling


Data types of X:
RevolvingUtilizationOfUnsecuredLines    float64
age                                     float64
NumberOfTime30-59DaysPastDueNotWorse    float64
DebtRatio                               float64
MonthlyIncome                           float64
NumberOfOpenCreditLinesAndLoans         float64
NumberOfTimes90DaysLate                 float64
NumberRealEstateLoansOrLines            float64
NumberOfTime60-89DaysPastDueNotWorse    float64
NumberOfDependents                      float64
DebtRatioPerIncome                      float64
AgeBucket                               float64
dtype: object
All columns in X are numeric.


In [None]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [None]:
# Split data for validation
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [None]:
# -----------------------------
# 3. Train model
# -----------------------------
# `use_label_encoder` is not passed: the installed XGBoost reports it as an
# unused parameter and only emits a warning for it.
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_train, y_train)


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Evaluate on validation set
# predict_proba returns one column per class; column index 1 is kept as the score
val_proba = model.predict_proba(X_val)
val_preds = val_proba[:, 1]
roc_score = roc_auc_score(y_val, val_preds)
print(f"Validation ROC AUC Score: {roc_score:.4f}")

Validation ROC AUC Score: 0.9895


In [None]:
# -----------------------------
# 4. Load and preprocess test data
# -----------------------------
test_df = pd.read_csv('/content/drive/MyDrive/Give Me Some Credit Dataset/cs-test.csv', index_col=0)

# Drop the target column (SeriousDlqin2yrs) from the test frame; only features are kept
raw_test_features = test_df.drop('SeriousDlqin2yrs', axis=1)

# Fill missing RAW values first, using the medians learned from the training
# features. Training imputes before it engineers features, so the test data
# must follow the same order; otherwise rows with a missing MonthlyIncome get a
# median-imputed DebtRatioPerIncome instead of one computed from their own
# DebtRatio. `imputer` is expected to have been fitted on the columns of X.
train_medians = pd.Series(imputer.statistics_, index=X.columns)
raw_test_features = raw_test_features.fillna(train_medians)

# Feature engineering for test data (same formulas as the training data).
# The AgeBucket edges are inclusive at 0 and open-ended at the top so ages of
# exactly 0 or above 100 get a bucket instead of NaN.
test_features = raw_test_features.assign(
    DebtRatioPerIncome=raw_test_features['DebtRatio'] / (raw_test_features['MonthlyIncome'] + 1),
    AgeBucket=pd.cut(
        raw_test_features['age'],
        bins=[0, 30, 50, float('inf')],
        labels=[0, 1, 2],
        include_lowest=True,
    ).astype(float),
)

# Check that names AND order of the features match the training data before
# handing them to the imputer/model. Comparing lists avoids the shape error an
# element-wise Index comparison raises when the column counts differ.
assert list(test_features.columns) == list(X.columns), "Feature mismatch between training and test data!"

# Safety net: impute anything still missing with the imputer fitted on the training features
test_imputed = pd.DataFrame(
    imputer.transform(test_features),
    columns=test_features.columns,
    index=test_features.index,
)

# Predict probabilities for the test set (column index 1 of predict_proba)
test_preds = model.predict_proba(test_imputed)[:, 1]

In [None]:
# -----------------------------
# 5. Create final submission file
# -----------------------------
# Start from the sample entry file and overwrite its Probability column with
# the test-set predictions (assigned by position).
sample_entry_path = '/content/drive/MyDrive/Give Me Some Credit Dataset/sampleEntry.csv'
sample_submission = pd.read_csv(sample_entry_path)
sample_submission = sample_submission.assign(Probability=test_preds)
sample_submission.to_csv('final_submission.csv', index=False)
print("✅ Final predictions saved to 'final_submission.csv'")

✅ Final predictions saved to 'final_submission.csv'


In [None]:
# -----------------------------
# 6. Additional evaluation on validation set
# -----------------------------
# Hard class predictions feed the confusion matrix and the per-class report;
# the AUC reuses the probability scores computed earlier (val_preds).
y_val_pred = model.predict(X_val)
val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)
val_auc = roc_auc_score(y_val, val_preds)

print("\nConfusion Matrix:")
print(val_confusion)

print("\nClassification Report:")
print(val_report)

print(f"ROC AUC Score: {val_auc:.4f}")


Confusion Matrix:
[[27567   358]
 [ 1758 26307]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.94      0.99      0.96     27925
         1.0       0.99      0.94      0.96     28065

    accuracy                           0.96     55990
   macro avg       0.96      0.96      0.96     55990
weighted avg       0.96      0.96      0.96     55990

ROC AUC Score: 0.9895


In [None]:
# Persist the trained model to a JSON file in the working directory
model_output_path = 'xgb_model.json'
model.save_model(model_output_path)
