In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb


In [20]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [21]:
# Load the training and test data
train_data = pd.read_csv('/kaggle/input/credit-worthiness/Train.csv')
test_data = pd.read_csv('/kaggle/input/credit-worthiness/Test.csv')

# Preview the data to understand its structure
print(train_data.head())


      ID   Loan_ID  Gender  Married Dependents  Education  Self_Employed  \
0  74768  LP002231       1        1          0          1              0   
1  79428  LP001448       1        1          0          0              0   
2  70497  LP002231       0        0          0          0              0   
3  87480  LP001385       1        1          0          0              0   
4  33964  LP002231       1        1          1          0              0   

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             8328           0.000000          17               363   
1              150        3857.458782         188               370   
2             4989         314.472511          17               348   
3              150           0.000000         232               359   
4             8059           0.000000          17               372   

   Credit_History  Property_Area  Loan_Status  Total_Income  
0               1              2            1         

In [22]:
# Identify the target and features
X = train_data.drop(columns=['Loan_Status'])
y = train_data['Loan_Status']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessing for numerical data (Imputation + Scaling)
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing values
    ('scaler', StandardScaler())  # Standardize the data
])

# Preprocessing for categorical data (Imputation + OneHotEncoding)
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical variables
])

# Combine the numerical and categorical transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [23]:
# Split the data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [17]:
# Define the model pipeline with preprocessing and classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))  # Classifier
])

# Train the model
model.fit(X_train, y_train)


In [24]:
# Define the model pipeline with preprocessing and XGBoost classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocessing
    ('classifier', xgb.XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False))  # XGBoost classifier
])

# Train the model
model.fit(X_train, y_train)

In [27]:
# Make predictions on the validation set
y_val_pred = model.predict(X_val)
y_val_pred_prob = model.predict_proba(X_val)[:, 1]  # Probability for the positive class

# Calculate accuracy and ROC-AUC score
accuracy = accuracy_score(y_val, y_val_pred)
roc_auc = roc_auc_score(y_val, y_val_pred_prob)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Validation ROC-AUC: {roc_auc:.4f}")


Validation Accuracy: 0.8110
Validation ROC-AUC: 0.4588


In [30]:
# Prepare the test set and predict probabilities
X_test = test_data.drop(columns=['Loan_I'])  # Assuming 'ID' is the unique identifier column
test_pred_prob = model.predict_proba(X_test)[:, 1]  # Predict the probability of the positive class

# Prepare the submission file in the required format
submission = pd.DataFrame({
    'ID': test_data['ID'],  # Assuming 'ID' is the identifier column
    'Loan_Status': test_pred_prob  # Predicted probabilities for the positive class
})

# Save the submission file
submission.to_csv('submission.csv', index=False)


KeyError: "['Loan_ID'] not in index"