In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train = df_train.drop('Id', axis = 1)
df_test = df_test.drop('Id', axis = 1)

In [4]:
df_train.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 5943 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [6]:
df_train.isna().sum()

Home Ownership                     0
Annual Income                   1557
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [7]:
df_train['Annual Income'] = df_train['Annual Income'].fillna(df_train['Annual Income'].median())
df_test['Annual Income'] = df_test['Annual Income'].fillna(df_test['Annual Income'].median())

In [8]:
df_train.isna().sum()

Home Ownership                     0
Annual Income                      0
Years in current job             371
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent    4081
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7129 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [10]:
df_train['Years in current job'].isna()

0        True
1       False
2       False
3       False
4       False
        ...  
7495    False
7496    False
7497    False
7498     True
7499    False
Name: Years in current job, Length: 7500, dtype: bool

In [11]:
df_train['Years in current job']

0             NaN
1       10+ years
2         8 years
3         6 years
4         8 years
          ...    
7495     < 1 year
7496       1 year
7497      6 years
7498          NaN
7499      4 years
Name: Years in current job, Length: 7500, dtype: object

In [12]:
years = list(df_train['Years in current job'].unique()[1:])

In [13]:
df_train['Years in current job'] = df_train['Years in current job'].apply(lambda x:np.random.choice(years) if type(x) == float else x)
df_test['Years in current job'] = df_test['Years in current job'].apply(lambda x:np.random.choice(years) if type(x) == float else x)

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [15]:
label = LabelEncoder()

In [16]:
df_train['Years in current job'] = label.fit_transform(df_train['Years in current job'])
df_test['Years in current job'] = label.transform(df_test['Years in current job'])

In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   int64  
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  3419 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [18]:
df_train['Months since last delinquent'] = df_train['Months since last delinquent'].fillna(df_train['Months since last delinquent'].median())
df_test['Months since last delinquent'] = df_test['Months since last delinquent'].fillna(df_test['Months since last delinquent'].median())

In [19]:
df_train.isna().sum()

Home Ownership                     0
Annual Income                      0
Years in current job               0
Tax Liens                          0
Number of Open Accounts            0
Years of Credit History            0
Maximum Open Credit                0
Number of Credit Problems          0
Months since last delinquent       0
Bankruptcies                      14
Purpose                            0
Term                               0
Current Loan Amount                0
Current Credit Balance             0
Monthly Debt                       0
Credit Score                    1557
Credit Default                     0
dtype: int64

In [20]:
df_train['Credit Score'] = df_train['Credit Score'].fillna(df_train['Credit Score'].median())
df_test['Credit Score'] = df_test['Credit Score'].fillna(df_test['Credit Score'].median())

In [21]:
df_train.isna().sum()

Home Ownership                   0
Annual Income                    0
Years in current job             0
Tax Liens                        0
Number of Open Accounts          0
Years of Credit History          0
Maximum Open Credit              0
Number of Credit Problems        0
Months since last delinquent     0
Bankruptcies                    14
Purpose                          0
Term                             0
Current Loan Amount              0
Current Credit Balance           0
Monthly Debt                     0
Credit Score                     0
Credit Default                   0
dtype: int64

In [22]:
df_train['Credit Score'].value_counts()

Credit Score
731.0     1651
740.0      169
747.0      168
748.0      157
745.0      152
          ... 
6750.0       1
6880.0       1
6770.0       1
6410.0       1
6060.0       1
Name: count, Length: 268, dtype: int64

In [23]:
df_train['Bankruptcies'].value_counts()

Bankruptcies
0.0    6660
1.0     786
2.0      31
3.0       7
4.0       2
Name: count, dtype: int64

In [24]:
df_train['Bankruptcies'] = df_train['Bankruptcies'].fillna(0.0)
df_test['Bankruptcies'] = df_test['Bankruptcies'].fillna(0.0)

In [25]:
df_train.isna().sum()

Home Ownership                  0
Annual Income                   0
Years in current job            0
Tax Liens                       0
Number of Open Accounts         0
Years of Credit History         0
Maximum Open Credit             0
Number of Credit Problems       0
Months since last delinquent    0
Bankruptcies                    0
Purpose                         0
Term                            0
Current Loan Amount             0
Current Credit Balance          0
Monthly Debt                    0
Credit Score                    0
Credit Default                  0
dtype: int64

In [26]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   int64  
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  7500 non-null   float64
 9   Bankruptcies                  7500 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [27]:
df_train['Home Ownership'].value_counts()

Home Ownership
Home Mortgage    3637
Rent             3204
Own Home          647
Have Mortgage      12
Name: count, dtype: int64

In [28]:
df_train['Home Ownership'] = df_train['Home Ownership'].replace(to_replace = 'Have Mortgage', value = 'Have Mortgage' )
df_test['Home Ownership'] = df_test['Home Ownership'].replace(to_replace = 'Have Mortgage', value = 'Have Mortgage' )

In [29]:
df_train['Home Ownership'] = label.fit_transform(df_train['Home Ownership'])
df_test['Home Ownership'] = label.transform(df_test['Home Ownership'])


In [30]:
df_train['Term'] = label.fit_transform(df_train['Term'])
df_test['Term'] = label.transform(df_test['Term'])

In [31]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   int64  
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   int64  
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  7500 non-null   float64
 9   Bankruptcies                  7500 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   int64  
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [2]:
df_train.head()

NameError: name 'df_train' is not defined

In [32]:
df_train['Purpose'] = label.fit_transform(df_train['Purpose'])
df_test['Purpose'] = label.transform(df_test['Purpose'])


In [33]:
X_train = df_train.drop('Credit Default', axis = 1)
y_train = df_train['Credit Default']
x_test = df_test.copy()

In [34]:
model = RandomForestClassifier()

In [75]:
model_params = {
    'n_estimators':[60,70,80,90],
    'criterion':['gini','entropy','log_loss'],
    'max_depth':[None,1,3,5,7],
    'max_features':[None,'sqrt','log2']
}

In [76]:
final_model = GridSearchCV(model,param_grid = model_params, cv = 3, verbose = 2)

In [77]:
final_model.fit(X_train,y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=60; total time=   2.7s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=60; total time=   2.7s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=60; total time=   2.7s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=70; total time=   3.6s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=70; total time=   3.2s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=70; total time=   3.2s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=80; total time=   3.7s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=80; total time=   3.6s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=80; total time=   3.8s
[CV] END criterion=gini, max_depth=None, max_features=None, n_estimators=

In [78]:
best_model = final_model.best_estimator_

In [79]:
y_pred = best_model.predict(x_test)

In [80]:
submission = pd.read_csv('sampleSubmission.csv')

In [81]:
submission['Credit Default'] = y_pred

In [82]:
submission.to_csv('submission.csv', index=False, index_label=False)