In [10]:
# For Preprocessing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from xgboost import plot_importance

In [11]:
df_train = pd.read_csv("application_train.csv")
df_test = pd.read_csv("application_test.csv")

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in df_train:
    if df_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(df_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(df_train[col])
            # Transform both training and testing data
            df_train[col] = le.transform(df_train[col])
            df_test[col] = le.transform(df_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [15]:
# one-hot encoding of categorical variables
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)
list1=df_train.columns

print('Training Features shape: ', df_train.shape)
print('Testing Features shape: ', df_test.shape)
print(list1)

Training Features shape:  (307511, 243)
Testing Features shape:  (48744, 239)
Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'AMT_GOODS_PRICE',
       ...
       'HOUSETYPE_MODE_terraced house', 'WALLSMATERIAL_MODE_Block',
       'WALLSMATERIAL_MODE_Mixed', 'WALLSMATERIAL_MODE_Monolithic',
       'WALLSMATERIAL_MODE_Others', 'WALLSMATERIAL_MODE_Panel',
       'WALLSMATERIAL_MODE_Stone, brick', 'WALLSMATERIAL_MODE_Wooden',
       'EMERGENCYSTATE_MODE_No', 'EMERGENCYSTATE_MODE_Yes'],
      dtype='object', length=243)


In [16]:

# Function to calculate missing values by column# Funct 
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [17]:
# Missing values statistics
missing_values = missing_values_table(df_train)
missing_values.index[:45]

Your selected dataframe has 243 columns.
There are 61 columns that have missing values.


Index(['COMMONAREA_MODE', 'COMMONAREA_MEDI', 'COMMONAREA_AVG',
       'NONLIVINGAPARTMENTS_MODE', 'NONLIVINGAPARTMENTS_AVG',
       'NONLIVINGAPARTMENTS_MEDI', 'LIVINGAPARTMENTS_MEDI',
       'LIVINGAPARTMENTS_AVG', 'LIVINGAPARTMENTS_MODE', 'FLOORSMIN_MEDI',
       'FLOORSMIN_MODE', 'FLOORSMIN_AVG', 'YEARS_BUILD_AVG',
       'YEARS_BUILD_MODE', 'YEARS_BUILD_MEDI', 'OWN_CAR_AGE', 'LANDAREA_MEDI',
       'LANDAREA_MODE', 'LANDAREA_AVG', 'BASEMENTAREA_MODE',
       'BASEMENTAREA_AVG', 'BASEMENTAREA_MEDI', 'EXT_SOURCE_1',
       'NONLIVINGAREA_MODE', 'NONLIVINGAREA_AVG', 'NONLIVINGAREA_MEDI',
       'ELEVATORS_MEDI', 'ELEVATORS_MODE', 'ELEVATORS_AVG', 'APARTMENTS_AVG',
       'APARTMENTS_MEDI', 'APARTMENTS_MODE', 'ENTRANCES_AVG', 'ENTRANCES_MEDI',
       'ENTRANCES_MODE', 'LIVINGAREA_MEDI', 'LIVINGAREA_AVG',
       'LIVINGAREA_MODE', 'FLOORSMAX_MODE', 'FLOORSMAX_MEDI', 'FLOORSMAX_AVG',
       'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BEGINEXPLUATATION_AVG',
       'YEARS_BEGINEXPLUATATION_MOD

In [22]:
from sklearn.model_selection import train_test_split
newX=df_train.drop(missing_values.index[:15],axis=1)
newX.fillna(-1000, inplace=True)
X_train, X_test, y_train, y_test = train_test_split( newX.drop('TARGET',axis=1) , df_train['TARGET'] , test_size=0.2, random_state=42)


In [23]:
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)
print("Logistic Regr. Score = ", clf1.score(X_test, y_test))



Logistic Regr. Score =  0.9194998617953596


In [24]:
from sklearn.metrics import roc_auc_score
y_train_predicted = clf1.predict_proba(X_train)[:,1]
y_test_predicted = clf1.predict_proba(X_test)[:,1]
print("Train AUC %.4f"%roc_auc_score(y_train,y_train_predicted))
print("Test AUC %.4f"%roc_auc_score(y_test,y_test_predicted))

Train AUC 0.6286
Test AUC 0.6317


In [27]:
y_train_pred = clf1.predict(X_train)
y_test_pred = clf1.predict(X_test)
print("Train Accuracy %.4f" % accuracy_score(y_train,y_train_pred))
print("Test Accuracy %.4f" % accuracy_score(y_test,y_test_pred))

Train Accuracy 0.9192
Test Accuracy 0.9195


In [28]:
from sklearn.metrics import confusion_matrix

confmat=confusion_matrix(y_test,y_test_pred)
print(confmat)
# plt.imshow(confmat, cmap='binary')

[[56552     2]
 [ 4949     0]]


In [30]:
clf2 = XGBClassifier()
clf2.fit(X_train, y_train)
print("XGBoost Score = ", clf2.score(X_test, y_test))

XGBoost Score =  0.919662455489976


In [31]:
from sklearn.metrics import roc_auc_score
y_train_predicted = clf2.predict_proba(X_train)[:,1]
y_test_predicted = clf2.predict_proba(X_test)[:,1]
print("Train AUC %.4f"%roc_auc_score(y_train,y_train_predicted))
print("Test AUC %.4f"%roc_auc_score(y_test,y_test_predicted))

Train AUC 0.7581
Test AUC 0.7504


In [32]:
y_train_pred = clf2.predict(X_train)
y_test_pred = clf2.predict(X_test)
print("Train Accuracy %.4f" % accuracy_score(y_train,y_train_pred))
print("Test Accuracy %.4f" % accuracy_score(y_test,y_test_pred))

Train Accuracy 0.9194
Test Accuracy 0.9197


In [33]:
from sklearn.metrics import confusion_matrix

confmat=confusion_matrix(y_test,y_test_pred)
print(confmat)
# plt.imshow(confmat, cmap='binary')

[[56543    11]
 [ 4930    19]]


In [34]:
clf3 = KNeighborsClassifier()
clf3.fit(X_train, y_train)
print("KNN Score = ", clf3.score(X_test, y_test))

KNN Score =  0.9139879355478595


In [35]:
from sklearn.metrics import roc_auc_score
y_train_predicted = clf3.predict_proba(X_train)[:,1]
y_test_predicted = clf3.predict_proba(X_test)[:,1]
print("Train AUC %.4f"%roc_auc_score(y_train,y_train_predicted))
print("Test AUC %.4f"%roc_auc_score(y_test,y_test_predicted))

Train AUC 0.8880
Test AUC 0.5387


In [36]:
y_train_pred = clf3.predict(X_train)
y_test_pred = clf3.predict(X_test)
print("Train Accuracy %.4f" % accuracy_score(y_train,y_train_pred))
print("Test Accuracy %.4f" % accuracy_score(y_test,y_test_pred))

Train Accuracy 0.9208
Test Accuracy 0.9140


In [37]:
from sklearn.metrics import confusion_matrix

confmat=confusion_matrix(y_test,y_test_pred)
print(confmat)
# plt.imshow(confmat, cmap='binary')

[[56153   401]
 [ 4889    60]]


In [38]:
clf4 = RandomForestClassifier()
clf4.fit(X_train, y_train)
print("Random Forest Score = ", clf4.score(X_test, y_test))



Random Forest Score =  0.9186543745833536


In [39]:
from sklearn.metrics import roc_auc_score
y_train_predicted = clf4.predict_proba(X_train)[:,1]
y_test_predicted = clf4.predict_proba(X_test)[:,1]
print("Train AUC %.4f"%roc_auc_score(y_train,y_train_predicted))
print("Test AUC %.4f"%roc_auc_score(y_test,y_test_predicted))

Train AUC 0.9998
Test AUC 0.6184


In [40]:
y_train_pred = clf3.predict(X_train)
y_test_pred = clf3.predict(X_test)
print("Train Accuracy %.4f" % accuracy_score(y_train,y_train_pred))
print("Test Accuracy %.4f" % accuracy_score(y_test,y_test_pred))

Train Accuracy 0.9208
Test Accuracy 0.9140


In [41]:
from sklearn.metrics import confusion_matrix

confmat=confusion_matrix(y_test,y_test_pred)
print(confmat)
# plt.imshow(confmat, cmap='binary')

[[56153   401]
 [ 4889    60]]
