### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report

### Load data

In [None]:
# Read the two application tables. `train` supplies the TARGET label used
# for fitting; `test` is the table that gets scored for the submission.
train_csv_path = "../input/home-credit-default-risk/application_train.csv"
test_csv_path = "../input/home-credit-default-risk/application_test.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

### Explore data

In [None]:
# Report (rows, columns) of the training table.
train_rows, train_cols = train.shape
print('Training set shape:', (train_rows, train_cols))


In [None]:
# Report (rows, columns) of the test table.
test_rows, test_cols = test.shape
print('Test set shape:    ', (test_rows, test_cols))

In [None]:
# Preview the first 20 rows of the training table.
train.head(n=20)

In [None]:
# Preview the first 20 rows of the test table.
test.head(n=20)

### Check missing data 

In [None]:
# Per-column missing-value summary of the training table:
#   Total   - number of null entries in the column
#   Percent - null entries as a percentage of the column's row count
# Both series are sorted from most to least missing before being joined.
null_mask = train.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
miss_train = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

miss_train

### Determine the number of columns that do not have missing values.

In [None]:
# Columns without missing values are the rows of the summary table whose
# missing-value count is exactly zero. (Filtering on `Total != 0` would
# instead count the columns that DO contain missing values.)
No_miss_train = miss_train.loc[miss_train['Total'] == 0, :]
print(len(No_miss_train))

### Display a histogram of the percentage of missing values per column of the training set.

In [None]:
# Histogram of the per-column missing percentages, using bin edges at
# 0, 10, ..., 100.
percent_bin_edges = np.arange(0, 110, 10)
plt.hist(
    miss_train['Percent'],
    bins=percent_bin_edges,
    edgecolor='k',
    color='cornflowerblue',
)
plt.show()

### Check number of unique classes in each object column

In [None]:
# Number of distinct (non-null) values in each object-dtype column.
train.select_dtypes(include='object').nunique()

### Drop label and ID

In [None]:
# Feature-only view of the training table: remove the row identifier and
# the label column. `train` itself is left untouched.
train_drop = train.drop(columns=['SK_ID_CURR', 'TARGET'])

### Check distribution of label

In [None]:
# Histogram of the label column to show how its values are distributed.
target_values = train['TARGET']
plt.hist(target_values, edgecolor='k', color='cornflowerblue')
plt.show()

### Preprocessing

In [None]:
# Columns handled as categorical. Every other column of `train_drop` is
# handled as numeric.
cat_features = [
    'NAME_CONTRACT_TYPE',
    'CODE_GENDER',
    'FLAG_OWN_CAR',
    'FLAG_OWN_REALTY',
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
    'EMERGENCYSTATE_MODE',
]
num_features = [col for col in train_drop.columns if col not in cat_features]

# Column order fed to the preprocessor: numeric columns first, then
# categorical ones.
features = num_features + cat_features

# Numeric branch: fill gaps with the column mean, then min-max scale.
# NOTE: the step name 'impeter' is kept verbatim; step names are part of the
# pipeline's parameter keys, so renaming it would change those keys.
num_transformer = Pipeline([
    ('impeter', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: fill gaps with the literal category 'missing', then
# one-hot encode. Categories not seen during fitting are ignored at
# transform time instead of raising.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])




In [None]:
# Fit the preprocessor on the training features and transform them in the
# same call. A separate fit() followed by transform() would run the training
# data through the transformers twice for the same result.
X_train = preprocessor.fit_transform(train[features])

# Apply the already-fitted preprocessor to the test features. Nothing is
# re-fitted here, so the test rows are transformed with the statistics
# learned from the training rows.
X_test = preprocessor.transform(test[features])

In [None]:
# Label vector as a plain array, taken from the TARGET column of `train`.
y_train = train['TARGET'].values

# Report the shapes of the model inputs.
for shape_label, array in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test),
):
    print(shape_label, array.shape)

### Logistic Regression

In [None]:
# Baseline logistic regression with default regularisation, fitted on the
# full training matrix. fit() returns the estimator, so it can be chained.
lr_mod = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Accuracy on the same data the model was fitted on (not a held-out score).
baseline_train_accuracy = lr_mod.score(X_train, y_train)
print(baseline_train_accuracy)

### Scoring with Cross-Validation

In [None]:
%%time

lr_cv_results = cross_val_score(lr_mod, X_train, y_train, cv=10, scoring='roc_auc')

print('Validation AUC by fold: ', lr_cv_results)
print('Average Validation AUC: ', np.mean(lr_cv_results))

In [None]:
%%time
lr_mod = LogisticRegression(C=0.001, solver='liblinear')
lr_mod.fit(X_train, y_train)

print('Training Acc:', lr_mod.score(X_train, y_train))
print('Training AUC:', roc_auc_score(y_train, lr_mod.predict_proba(X_train)[:,1]))

### Generate Test Predictions

In [None]:
# Load the sample submission as the template for the output file.
# It is bound to `app_submission` only: the earlier chained assignment
# (`app_submission = test = ...`) also overwrote the `test` DataFrame, which
# broke any later or re-run cell that reads feature columns from `test`.
app_submission = pd.read_csv("../input/home-credit-default-risk/sample_submission.csv")
app_submission.head()

In [None]:
# Score the test matrix and store the second probability column as TARGET.
# NOTE(review): this assumes the rows of the sample submission are in the
# same order as the rows of the test table - confirm.
test_scores = lr_mod.predict_proba(X_test)[:, 1]
app_submission['TARGET'] = test_scores
app_submission.head()

In [None]:
# Column-wise totals of the submission table, as a quick sanity check.
app_submission.sum(axis=0)

In [None]:
# Write the submission file without the DataFrame index column.
app_submission.to_csv(path_or_buf='my_submission.csv', index=False)