In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer
import gc
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('processed_train.csv')
test_df = pd.read_csv('processed_test.csv')

In [3]:
train_df

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,WALLSMATERIAL_MODE_nan.1,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan,EMERGENCYSTATE_MODE_nan.1
0,100002,1.0,0,202500.0,406597.5,24700.5,351000.0,0.018801,-9461,-637,...,False,False,True,False,False,False,True,False,False,False
1,100003,0.0,0,270000.0,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,...,False,False,False,False,False,False,True,False,False,False
2,100004,0.0,0,67500.0,135000.0,6750.0,135000.0,0.010032,-19046,-225,...,False,False,False,False,True,False,False,False,True,False
3,100006,0.0,0,135000.0,312682.5,29686.5,297000.0,0.008019,-19005,-3039,...,False,False,False,False,True,False,False,False,True,False
4,100007,0.0,0,121500.0,513000.0,21865.5,513000.0,0.028663,-19932,-3038,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.0,0,157500.0,254700.0,27558.0,225000.0,0.032561,-9327,-236,...,False,False,True,False,False,False,True,False,False,False
307507,456252,0.0,0,72000.0,269550.0,12001.5,225000.0,0.025164,-20775,365243,...,False,False,True,False,False,False,True,False,False,False
307508,456253,0.0,0,153000.0,677664.0,29979.0,585000.0,0.005002,-14966,-7921,...,False,True,False,False,False,False,True,False,False,False
307509,456254,1.0,0,171000.0,370107.0,20205.0,319500.0,0.005313,-11961,-4786,...,False,False,True,False,False,False,True,False,False,False


In [4]:
test_df

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,...,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WALLSMATERIAL_MODE_nan,WALLSMATERIAL_MODE_nan.1,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,EMERGENCYSTATE_MODE_nan,EMERGENCYSTATE_MODE_nan.1
0,100001,0,135000.0,568800.0,20560.5,450000.0,0.018850,-19241,-2329,-5170.0,...,False,False,True,False,False,False,True,False,False,False
1,100005,0,99000.0,222768.0,17370.0,180000.0,0.035792,-18064,-4469,-9118.0,...,False,False,False,False,True,False,False,False,True,False
2,100013,0,202500.0,663264.0,69777.0,630000.0,0.019101,-20038,-4458,-2175.0,...,False,False,False,False,True,False,False,False,True,False
3,100028,2,315000.0,1575000.0,49018.5,1575000.0,0.026392,-13976,-1866,-2000.0,...,False,True,False,False,False,False,True,False,False,False
4,100038,1,180000.0,625500.0,32067.0,625500.0,0.010032,-13040,-2191,-4000.0,...,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,456221,0,121500.0,412560.0,17473.5,270000.0,0.002042,-19970,-5169,-9094.0,...,False,False,False,False,True,False,False,False,True,False
48740,456222,2,157500.0,622413.0,31909.5,495000.0,0.035792,-11186,-1149,-3015.0,...,False,False,False,False,True,False,False,False,True,False
48741,456223,1,202500.0,315000.0,33205.5,315000.0,0.026392,-15922,-3037,-2681.0,...,False,False,True,False,False,False,True,False,False,False
48742,456224,0,225000.0,450000.0,25128.0,450000.0,0.018850,-13968,-2731,-1461.0,...,False,True,False,False,False,False,True,False,False,False


In [5]:
train_df.shape

(307511, 386)

In [6]:
test_df.shape

(48744, 385)

### Model Training

In [7]:
train_labels = train_df['TARGET']
train_features = train_df.drop(columns=['TARGET', 'SK_ID_CURR'])

test_ids = test_df['SK_ID_CURR']
test_features = test_df.drop(columns=['SK_ID_CURR'])

In [8]:
train_features, test_features = train_features.align(test_features, join='inner', axis=1)

In [9]:
train_features.replace([np.inf, -np.inf], np.nan, inplace=True)
test_features.replace([np.inf, -np.inf], np.nan, inplace=True)

In [10]:
imputer = SimpleImputer(strategy='median')
train_features = pd.DataFrame(imputer.fit_transform(train_features), columns=train_features.columns)
test_features = pd.DataFrame(imputer.transform(test_features), columns=test_features.columns)

In [11]:
train_features.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train_features.columns]
test_features.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test_features.columns]

In [12]:
train_features.isnull().sum().sort_values(ascending=False)

HOUSETYPE_MODE_nan                          0
HOUSETYPE_MODE_block_of_flats               0
FONDKAPREMONT_MODE_nan_1                    0
FONDKAPREMONT_MODE_reg_oper_spec_account    0
FONDKAPREMONT_MODE_reg_oper_account         0
                                           ..
AMT_GOODS_PRICE                             0
AMT_ANNUITY                                 0
AMT_CREDIT                                  0
AMT_INCOME_TOTAL                            0
CNT_CHILDREN                                0
Length: 384, dtype: int64

### Split the training data into a training set and a validation set

In [13]:
X_train, X_val, y_train, y_val = train_test_split(
    train_features, 
    train_labels, 
    test_size=0.2, 
    random_state=42, 
    stratify=train_labels # Ensures similar target distribution in train and val sets
)

In [14]:
log_reg = LogisticRegression(C=0.001, max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train)

In [15]:
val_preds = log_reg.predict_proba(X_val)[:, 1]

In [18]:
auc_score = roc_auc_score(y_val, val_preds)
auc_score

np.float64(0.6495920593398545)

In [19]:
test_preds = log_reg.predict_proba(test_features)[:, 1]
test_preds

array([0.0834636 , 0.08854193, 0.06529637, ..., 0.0656988 , 0.09690306,
       0.0931015 ])