In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import KFold, cross_val_score,train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [4]:
df = pd.read_csv(r"application_record.csv")

In [5]:
df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


In [6]:
df.shape

(438557, 18)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   ID                   438557 non-null  int64  
 1   CODE_GENDER          438557 non-null  object 
 2   FLAG_OWN_CAR         438557 non-null  object 
 3   FLAG_OWN_REALTY      438557 non-null  object 
 4   CNT_CHILDREN         438557 non-null  int64  
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object 
 7   NAME_EDUCATION_TYPE  438557 non-null  object 
 8   NAME_FAMILY_STATUS   438557 non-null  object 
 9   NAME_HOUSING_TYPE    438557 non-null  object 
 10  DAYS_BIRTH           438557 non-null  int64  
 11  DAYS_EMPLOYED        438557 non-null  int64  
 12  FLAG_MOBIL           438557 non-null  int64  
 13  FLAG_WORK_PHONE      438557 non-null  int64  
 14  FLAG_PHONE           438557 non-null  int64  
 15  FLAG_EMAIL       

In [9]:
df['ID'].duplicated().sum()

47

In [10]:
df=df.drop_duplicates(subset='ID', keep='first')

In [11]:
df.shape

(438510, 18)

In [12]:
df.columns[1:]

Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS'],
      dtype='object')

In [13]:
df.isnull().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134193
CNT_FAM_MEMBERS             0
dtype: int64

In [14]:
df['CODE_GENDER'].unique()

array(['M', 'F'], dtype=object)

In [15]:
df['FLAG_OWN_CAR'].unique()

array(['Y', 'N'], dtype=object)

In [17]:
df['FLAG_OWN_REALTY'].unique()

array(['Y', 'N'], dtype=object)

In [18]:
df['NAME_INCOME_TYPE'].unique()

array(['Working', 'Commercial associate', 'Pensioner', 'State servant',
       'Student'], dtype=object)

In [19]:
df['NAME_EDUCATION_TYPE'].unique()

array(['Higher education', 'Secondary / secondary special',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [20]:
df['NAME_FAMILY_STATUS'].unique()

array(['Civil marriage', 'Married', 'Single / not married', 'Separated',
       'Widow'], dtype=object)

In [21]:
df['NAME_HOUSING_TYPE'].unique()

array(['Rented apartment', 'House / apartment', 'Municipal apartment',
       'With parents', 'Co-op apartment', 'Office apartment'],
      dtype=object)

In [22]:
df['FLAG_MOBIL'].unique()

array([1], dtype=int64)

In [23]:
df['FLAG_WORK_PHONE'].unique()

array([1, 0], dtype=int64)

In [24]:
df['FLAG_EMAIL'].unique()

array([0, 1], dtype=int64)

In [26]:
df['OCCUPATION_TYPE'].value_counts(dropna=False)

NaN                      134193
Laborers                  78231
Core staff                43000
Sales staff               41094
Managers                  35481
Drivers                   26090
High skill tech staff     17285
Accountants               15983
Medicine staff            13518
Cooking staff              8076
Security staff             7993
Cleaning staff             5843
Private service staff      3455
Low-skill Laborers         2140
Secretaries                2044
Waiters/barmen staff       1665
Realty agents              1041
HR staff                    774
IT staff                    604
Name: OCCUPATION_TYPE, dtype: int64

In [27]:
df['OCCUPATION_TYPE'].fillna('not_specified',inplace=True)

In [28]:
df['OCCUPATION_TYPE'].value_counts(dropna=False)

not_specified            134193
Laborers                  78231
Core staff                43000
Sales staff               41094
Managers                  35481
Drivers                   26090
High skill tech staff     17285
Accountants               15983
Medicine staff            13518
Cooking staff              8076
Security staff             7993
Cleaning staff             5843
Private service staff      3455
Low-skill Laborers         2140
Secretaries                2044
Waiters/barmen staff       1665
Realty agents              1041
HR staff                    774
IT staff                    604
Name: OCCUPATION_TYPE, dtype: int64

In [29]:
df.describe(percentiles=[.01,.02,.03,.04,.05,.1,.25,.5,.75,.9,.95,.96,.97,.98,.99]).T

Unnamed: 0,count,mean,std,min,1%,2%,3%,4%,5%,10%,25%,50%,75%,90%,95%,96%,97%,98%,99%,max
ID,438510.0,6022035.0,571496.239776,5008804.0,5024429.09,5041533.18,5054216.27,5068686.36,5091807.45,5181098.9,5609362.25,6047719.5,6454160.75,6722537.1,6800362.55,6830288.64,7009897.1,7271099.44,7636888.19,7999952.0
CNT_CHILDREN,438510.0,0.4273814,0.724874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,19.0
AMT_INCOME_TOTAL,438510.0,187525.4,110089.279583,26100.0,54000.0,67500.0,67500.0,72000.0,76500.0,90000.0,121500.0,160940.25,225000.0,315000.0,360000.0,382500.0,405000.0,450000.0,540000.0,6750000.0
DAYS_BIRTH,438510.0,-15998.02,4185.016222,-25201.0,-24013.0,-23642.0,-23413.0,-23181.0,-22972.0,-21982.0,-19484.0,-15630.0,-12514.0,-10519.0,-9889.0,-9655.0,-9343.0,-9011.0,-8575.0,-7489.0
DAYS_EMPLOYED,438510.0,60566.19,138770.072835,-17531.0,-10913.0,-9533.0,-8486.0,-7792.0,-7205.0,-5295.0,-3103.0,-1467.0,-371.0,365243.0,365243.0,365243.0,365243.0,365243.0,365243.0,365243.0
FLAG_MOBIL,438510.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FLAG_WORK_PHONE,438510.0,0.2061276,0.404524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FLAG_PHONE,438510.0,0.2877699,0.452724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FLAG_EMAIL,438510.0,0.1082005,0.310634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CNT_FAM_MEMBERS,438510.0,2.194463,0.897192,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,20.0


In [30]:
df[df['DAYS_EMPLOYED']>=0]['DAYS_EMPLOYED'].value_counts()

365243    75324
Name: DAYS_EMPLOYED, dtype: int64

In [31]:
df['DAYS_EMPLOYED'].replace(365243,0,inplace=True)

In [32]:
df[df['DAYS_EMPLOYED']>=0]['DAYS_EMPLOYED'].value_counts()

0    75324
Name: DAYS_EMPLOYED, dtype: int64

In [34]:
df['AGE_YEARS']=round(-df['DAYS_BIRTH']/365.2425,0)

In [35]:
df['YEARS_EMPLOYED']=round(-df['DAYS_EMPLOYED']/365.2425)
df.loc[df['YEARS_EMPLOYED']<0,'YEARS_EMPLOYED']=0

In [36]:
df.drop(columns=["DAYS_BIRTH","DAYS_EMPLOYED"],inplace=True)

In [37]:
df.describe(percentiles=[.01,.02,.03,.04,.05,.1,.25,.5,.75,.9,.95,.96,.97,.98,.99]).T

Unnamed: 0,count,mean,std,min,1%,2%,3%,4%,5%,10%,25%,50%,75%,90%,95%,96%,97%,98%,99%,max
ID,438510.0,6022035.0,571496.239776,5008804.0,5024429.09,5041533.18,5054216.27,5068686.36,5091807.45,5181098.9,5609362.25,6047719.5,6454160.75,6722537.1,6800362.55,6830288.64,7009897.1,7271099.44,7636888.19,7999952.0
CNT_CHILDREN,438510.0,0.4273814,0.724874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,3.0,19.0
AMT_INCOME_TOTAL,438510.0,187525.4,110089.279583,26100.0,54000.0,67500.0,67500.0,72000.0,76500.0,90000.0,121500.0,160940.25,225000.0,315000.0,360000.0,382500.0,405000.0,450000.0,540000.0,6750000.0
FLAG_MOBIL,438510.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FLAG_WORK_PHONE,438510.0,0.2061276,0.404524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FLAG_PHONE,438510.0,0.2877699,0.452724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FLAG_EMAIL,438510.0,0.1082005,0.310634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
CNT_FAM_MEMBERS,438510.0,2.194463,0.897192,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,5.0,20.0
AGE_YEARS,438510.0,43.80453,11.465521,21.0,23.0,25.0,26.0,26.0,27.0,29.0,34.0,43.0,53.0,60.0,63.0,63.0,64.0,65.0,66.0,69.0
YEARS_EMPLOYED,438510.0,5.956069,6.574244,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0,8.0,14.0,20.0,21.0,23.0,26.0,30.0,48.0


In [38]:
df['ID'].duplicated().sum()

0

In [40]:
df[df['AMT_INCOME_TOTAL']>54000]

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,AGE_YEARS,YEARS_EMPLOYED
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,not_specified,2.0,33.0,12.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,1,1,0,0,not_specified,2.0,33.0,12.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,1,0,0,0,Security staff,2.0,59.0,3.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52.0,8.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,1,0,1,1,Sales staff,1.0,52.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438551,6840102,F,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,1,0,0,0,not_specified,1.0,62.0,0.0
438552,6840104,M,N,Y,0,135000.0,Pensioner,Secondary / secondary special,Separated,House / apartment,1,0,0,0,not_specified,1.0,62.0,0.0
438553,6840222,F,N,N,0,103500.0,Working,Secondary / secondary special,Single / not married,House / apartment,1,0,0,0,Laborers,1.0,44.0,8.0
438555,6842765,F,N,Y,0,72000.0,Pensioner,Secondary / secondary special,Married,House / apartment,1,0,0,0,not_specified,2.0,59.0,0.0


In [44]:
df.drop(columns=["FLAG_MOBIL"],inplace=True)

In [45]:
columns_to_scale = ['CNT_CHILDREN','AMT_INCOME_TOTAL', 'AGE_YEARS','YEARS_EMPLOYED', 'CNT_FAM_MEMBERS']

In [46]:
st=StandardScaler()
df[columns_to_scale] = st.fit_transform(df[columns_to_scale]) 

In [49]:
columns_to_encode = ['CODE_GENDER','FLAG_OWN_CAR','NAME_EDUCATION_TYPE', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE']

In [96]:
df=pd.get_dummies(df,columns=columns_to_encode,dtype='int')

KeyError: "None of [Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'NAME_EDUCATION_TYPE', 'FLAG_OWN_REALTY',\n       'NAME_INCOME_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',\n       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE'],\n      dtype='object')] are in the [columns]"

In [51]:
df

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,AGE_YEARS,YEARS_EMPLOYED,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,...,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_not_specified
0,5008804,-0.589595,2.179820,-0.216747,-0.942351,0.919336,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
1,5008805,-0.589595,2.179820,-0.216747,-0.942351,0.919336,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
2,5008806,-0.589595,-0.681497,-0.216747,1.325320,-0.449645,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
3,5008808,-0.589595,0.749162,-1.331336,0.714793,0.310900,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5008809,-0.589595,0.749162,-1.331336,0.714793,0.310900,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438552,6840104,-0.589595,-0.477117,-1.331336,1.586974,-0.905971,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
438553,6840222,-0.589595,-0.763249,-1.331336,0.017048,0.310900,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
438554,6841878,-0.589595,-1.212884,-1.331336,-1.901750,-0.753862,1,0,1,0,...,0,0,0,0,0,1,0,0,0,0
438555,6842765,-0.589595,-1.049381,-0.216747,1.325320,-0.905971,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [52]:
df1 = pd.read_csv(r"credit_record.csv")

In [53]:
df1.head()

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [54]:
df1.shape

(1048575, 3)

In [55]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1048575 non-null  int64 
 1   MONTHS_BALANCE  1048575 non-null  int64 
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB


In [56]:
df1.duplicated().sum()

0

In [57]:
df1['MONTHS_BALANCE'].unique()

array([  0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9, -10, -11, -12,
       -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25,
       -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38,
       -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51,
       -52, -53, -54, -55, -56, -57, -58, -59, -60], dtype=int64)

In [58]:
df1['STATUS'].unique()

array(['X', '0', 'C', '1', '2', '3', '4', '5'], dtype=object)

In [60]:
df1[df1['STATUS'].isin(['X','C'])]

Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
4,5001712,0,C
5,5001712,-1,C
6,5001712,-2,C
7,5001712,-3,C
...,...,...,...
1048570,5150487,-25,C
1048571,5150487,-26,C
1048572,5150487,-27,C
1048573,5150487,-28,C


In [61]:
df1['ID'].nunique()

45985

In [62]:
df1['target']=df1['STATUS']
df1['target'].replace('X', 0, inplace=True)
df1['target'].replace('C', 0, inplace=True)
df1['target']=df1['target'].astype(int)
df1.loc[df1['target']>=1,'target']=1

In [63]:
df2=pd.DataFrame(df1.groupby(['ID'])['target'].agg("max")).reset_index()

In [64]:
df2.sample(10)

Unnamed: 0,ID,target
38351,5120972,0
9025,5022271,0
30183,5094929,0
17011,5047745,0
35397,5115936,0
34690,5114298,0
8796,5021994,0
20767,5060320,1
23258,5066610,0
23516,5066973,0


In [65]:
df2["target"].value_counts()

0    40635
1     5350
Name: target, dtype: int64

In [66]:
new_df=pd.merge(df, df2, how='inner', on=['ID'])

In [67]:
new_df

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,AGE_YEARS,YEARS_EMPLOYED,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,...,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_not_specified,target
0,5008804,-0.589595,2.179820,-0.216747,-0.942351,0.919336,0,1,0,1,...,0,0,0,0,0,0,0,0,1,1
1,5008805,-0.589595,2.179820,-0.216747,-0.942351,0.919336,0,1,0,1,...,0,0,0,0,0,0,0,0,1,1
2,5008806,-0.589595,-0.681497,-0.216747,1.325320,-0.449645,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0
3,5008808,-0.589595,0.749162,-1.331336,0.714793,0.310900,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,5008809,-0.589595,0.749162,-1.331336,0.714793,0.310900,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,-0.589595,1.157921,-0.216747,0.278703,0.158791,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
36453,5149834,-0.589595,-0.272737,-0.216747,-0.855133,-0.297536,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
36454,5149838,-0.589595,-0.272737,-0.216747,-0.855133,-0.297536,1,0,1,0,...,0,1,0,0,0,0,0,0,0,1
36455,5150049,-0.589595,0.871790,-0.216747,0.453139,-0.601753,1,0,1,0,...,0,0,0,0,1,0,0,0,0,1


In [76]:
# Extract how many months account has been open for
start_df=pd.DataFrame(df1.groupby(['ID'])['MONTHS_BALANCE'].agg(min)).reset_index()
start_df.rename(columns={'MONTHS_BALANCE':'ACCOUNT_LENGTH'}, inplace=True)
start_df['ACCOUNT_LENGTH']=-start_df['ACCOUNT_LENGTH']

In [75]:
start_df

Unnamed: 0,ID,ACCOUNT_LENGTH
0,5001711,3
1,5001712,18
2,5001713,21
3,5001714,14
4,5001715,59
...,...,...
45980,5150482,28
45981,5150483,17
45982,5150484,12
45983,5150485,1


In [77]:
new_df=pd.merge(new_df, start_df, how='inner', on=['ID'])

In [78]:
new_df

Unnamed: 0,ID,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,AGE_YEARS,YEARS_EMPLOYED,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,...,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,OCCUPATION_TYPE_not_specified,target,ACCOUNT_LENGTH
0,5008804,-0.589595,2.179820,-0.216747,-0.942351,0.919336,0,1,0,1,...,0,0,0,0,0,0,0,1,1,15
1,5008805,-0.589595,2.179820,-0.216747,-0.942351,0.919336,0,1,0,1,...,0,0,0,0,0,0,0,1,1,14
2,5008806,-0.589595,-0.681497,-0.216747,1.325320,-0.449645,0,1,0,1,...,0,0,0,0,0,1,0,0,0,29
3,5008808,-0.589595,0.749162,-1.331336,0.714793,0.310900,1,0,1,0,...,0,0,0,1,0,0,0,0,0,4
4,5008809,-0.589595,0.749162,-1.331336,0.714793,0.310900,1,0,1,0,...,0,0,0,1,0,0,0,0,0,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36452,5149828,-0.589595,1.157921,-0.216747,0.278703,0.158791,0,1,0,1,...,0,0,0,0,0,0,0,0,1,11
36453,5149834,-0.589595,-0.272737,-0.216747,-0.855133,-0.297536,1,0,1,0,...,1,0,0,0,0,0,0,0,1,23
36454,5149838,-0.589595,-0.272737,-0.216747,-0.855133,-0.297536,1,0,1,0,...,1,0,0,0,0,0,0,0,1,32
36455,5150049,-0.589595,0.871790,-0.216747,0.453139,-0.601753,1,0,1,0,...,0,0,0,1,0,0,0,0,1,9


In [80]:
new_df.columns

Index(['ID', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS',
       'AGE_YEARS', 'YEARS_EMPLOYED', 'CODE_GENDER_F', 'CODE_GENDER_M',
       'FLAG_OWN_CAR_N', 'FLAG_OWN_CAR_Y',
       'NAME_EDUCATION_TYPE_Academic degree',
       'NAME_EDUCATION_TYPE_Higher education',
       'NAME_EDUCATION_TYPE_Incomplete higher',
       'NAME_EDUCATION_TYPE_Lower secondary',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'FLAG_OWN_REALTY_N', 'FLAG_OWN_REALTY_Y',
       'NAME_INCOME_TYPE_Commercial associate', 'NAME_INCOME_TYPE_Pensioner',
       'NAME_INCOME_TYPE_State servant', 'NAME_INCOME_TYPE_Student',
       'NAME_INCOME_TYPE_Working', 'NAME_FAMILY_STATUS_Civil marriage',
       'NAME_FAMILY_STATUS_Married', 'NAME_FAMILY_STATUS_Separated',
       'NAME_FAMILY_STATUS_Single / not married', 'NAME_FAMILY_STATUS_Widow',
       'NAME_HOUSING_TYPE_Co-op apartment',
       'NAME_HOUSING_TYPE_House / apartment',
       'NAME_HOUSING_TYPE_Municipal apartment',
       'NAME_HOUS

In [81]:
new_df.drop(columns=["ID"],inplace=True)

In [82]:
x = new_df.drop(columns=['target'])
y = new_df['target']

In [84]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.25,stratify=y,random_state=0)

## Applying Classifier Models

In [85]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [86]:
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'Random Forest': RandomForestClassifier(random_state=0),
    'KNN': KNeighborsClassifier()
}

In [95]:
# Evaluate each classifier
for clf_name, clf in classifiers.items():
    print(f"\n\033[1m{clf_name} Model:\033[0m")  # \033[1m for bold, \033[0m to reset
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print(f"Model Score: {score:.4f}")
    test_preds = clf.predict(x_test)
    print("\nClassification Report:\n", classification_report(y_test, test_preds))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, test_preds))
    accuracy = accuracy_score(y_test, test_preds)
    print(f"\nTesting Accuracy: {accuracy:.4f}")


Logistic Regression Model:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Score: 0.8823


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94      8042
           1       0.00      0.00      0.00      1073

    accuracy                           0.88      9115
   macro avg       0.44      0.50      0.47      9115
weighted avg       0.78      0.88      0.83      9115


Confusion Matrix:
 [[8042    0]
 [1073    0]]

Testing Accuracy: 0.8823

Naive Bayes Model:
Model Score: 0.8081

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.89      8042
           1       0.15      0.14      0.15      1073

    accuracy                           0.81      9115
   macro avg       0.52      0.52      0.52      9115
weighted avg       0.80      0.81      0.80      9115


Confusion Matrix:
 [[7217  825]
 [ 924  149]]

Testing Accuracy: 0.8081

Decision Tree Model:
Model Score: 0.8366

Classification Report:
               precision    recall  f1-score