In [3]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.ensemble import VotingClassifier

In [None]:
application_train = pd.read_csv(r'C:\Users\sanja\OneDrive\Desktop\CreditRiskAnalysis\datasets\application_train.csv')
application_test = pd.read_csv(r'C:\Users\sanja\OneDrive\Desktop\CreditRiskAnalysis\datasets\application_test.csv')

print("Training set shape:", application_train.shape)
print("Testing set shape:", application_test.shape)

application_train.head()

Training set shape: (307511, 122)
Testing set shape: (48744, 121)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
application_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()


numeric_columns = application_train.select_dtypes(include=['number']).columns.drop('TARGET')

application_train_imputed = pd.DataFrame(imputer.fit_transform(application_train[numeric_columns]),
                                         columns=numeric_columns)
application_test_imputed = pd.DataFrame(imputer.transform(application_test[numeric_columns]),
                                        columns=numeric_columns)
scaled_train = scaler.fit_transform(application_train_imputed)

X = scaled_train
y = application_train['TARGET']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

random_forest = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
xgboost = XGBClassifier(n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
logistic_regression = LogisticRegression(max_iter=500, random_state=42, class_weight='balanced')

In [None]:
ensemble = VotingClassifier(
    estimators=[
        ('rf', random_forest),
        ('xgb', xgboost),
        ('lr', logistic_regression)
    ],
    voting='soft'  
)

ensemble.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [16]:
# Predict probabilities
y_val_pred_prob = ensemble.predict_proba(X_val)[:, 1]

# Metrics
roc_auc = roc_auc_score(y_val, y_val_pred_prob)
print("ROC AUC Score:", roc_auc)
print(classification_report(y_val, ensemble.predict(X_val)))

ROC AUC Score: 0.7480530287526682
              precision    recall  f1-score   support

           0       0.92      0.99      0.96     56554
           1       0.47      0.07      0.12      4949

    accuracy                           0.92     61503
   macro avg       0.70      0.53      0.54     61503
weighted avg       0.89      0.92      0.89     61503



In [17]:
scaled_test = scaler.transform(application_test_imputed)
test_predictions = ensemble.predict_proba(scaled_test)[:, 1]

submission = pd.DataFrame({
    'SK_ID_CURR': application_test['SK_ID_CURR'],
    'TARGET': test_predictions
})

submission.to_csv('submission.csv', index=False)
print("Ensemble submission file created.")

Ensemble submission file created.


In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler

with open('ensemble_model.pkl', 'wb') as model_file:
    pickle.dump(ensemble, model_file)


with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Ensemble model and scaler saved successfully!")


Ensemble model and scaler saved successfully!


In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  

df_train = pd.DataFrame({
    'SK_ID_CURR': [100002, 100003, 100004, 100006, 100007],
    'TARGET': [1, 0, 0, 0, 0],
    'NAME_CONTRACT_TYPE': ['Cash loans', 'Cash loans', 'Revolving loans', 'Cash loans', 'Cash loans'],
    'CODE_GENDER': ['M', 'F', 'M', 'F', 'M'],
    'FLAG_OWN_CAR': ['Y', 'N', 'Y', 'N', 'N'],
    'FLAG_OWN_REALTY': ['Y', 'N', 'Y', 'Y', 'Y'],
    'CNT_CHILDREN': [0, 0, 0, 0, 0],
    'AMT_INCOME_TOTAL': [202500.0, 270000.0, 67500.0, 135000.0, 121500.0],
    'AMT_CREDIT': [406597.5, 1293502.5, 135000.0, 312682.5, 513000.0],
    'AMT_ANNUITY': [24700.5, 35698.5, 6750.0, 29686.5, 21865.5]
})


df_encoded = pd.get_dummies(df_train, drop_first=True)


df_encoded = df_encoded.fillna(df_encoded.mean())


scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_encoded.drop(columns='TARGET'))  

X = scaled_data
y = df_train['TARGET']
model = RandomForestClassifier()
model.fit(X, y)

with open('ensemble_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler saved successfully!")


Model and scaler saved successfully!


In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler

with open('ensemble_model.pkl', 'rb') as model_file:
    ensemble_model = pickle.load(model_file)

with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
    
own_data = {
    'SK_ID_CURR': [100002, 100003, 100004, 100006, 100007],
    'TARGET': [1, 0, 0, 0, 0],  # This column is just for reference and won't be used in prediction
    'NAME_CONTRACT_TYPE': ['Cash loans', 'Cash loans', 'Revolving loans', 'Cash loans', 'Cash loans'],
    'CODE_GENDER': ['M', 'F', 'M', 'F', 'M'],
    'FLAG_OWN_CAR': ['Y', 'N', 'Y', 'N', 'N'],
    'FLAG_OWN_REALTY': ['Y', 'N', 'Y', 'Y', 'Y'],
    'CNT_CHILDREN': [0, 0, 0, 0, 0],
    'AMT_INCOME_TOTAL': [202500.0, 270000.0, 67500.0, 135000.0, 121500.0],
    'AMT_CREDIT': [406597.5, 1293502.5, 135000.0, 312682.5, 513000.0],
    'AMT_ANNUITY': [24700.5, 35698.5, 6750.0, 29686.5, 21865.5],
   
}


df = pd.DataFrame(own_data)

df_encoded = pd.get_dummies(df, drop_first=True)

df_encoded = df_encoded.fillna(df_encoded.mean())


scaled_data = scaler.transform(df_encoded.drop(columns='TARGET'))  

predictions = ensemble_model.predict_proba(scaled_data)[:, 1] 
df['PREDICTION'] = predictions

print("Predictions for the provided data:")
print(df[['SK_ID_CURR', 'PREDICTION']])


Predictions for the provided data:
   SK_ID_CURR  PREDICTION
0      100002        0.66
1      100003        0.17
2      100004        0.14
3      100006        0.09
4      100007        0.11
