## Kaggle home_credit_default_risk challenge
1. first look into the basic data and preparation
2. logistic regression 
3. random forest classifier
4. logistic regression on principal components

In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
import random
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

from sklearn.decomposition import PCA

from sklearn import preprocessing
#from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
% matplotlib inline
from matplotlib import mlab

In [None]:
def train_test_fn(df , proportion):
    """Randomly partition the rows of ``df`` into a train and a test frame.

    One uniform draw from [0, 1) is made per row; rows whose draw is below
    ``proportion`` go to the train frame, all others to the test frame, so
    ``proportion`` is the expected (not exact) share of training rows.

    Returns
    -------
    (train, test) : tuple of DataFrames selected from ``df`` by boolean mask.
    """
    in_train = np.random.rand(len(df)) < proportion
    return df[in_train], df[~in_train]

def confusion_matrix_fn(truth,result):
    """Build the 2x2 confusion matrix of two 0/1 label vectors.

    ``truth`` and ``result`` must support element-wise ``*`` and ``1 - x``
    (for example NumPy arrays or pandas Series of 0/1 values).

    Layout of the returned ``np.matrix``::

        [[truth=1 & result=1, truth=1 & result=0],
         [truth=0 & result=1, truth=0 & result=0]]
    """
    # Complements are computed once and reused for the four cell counts.
    not_truth = 1 - truth
    not_result = 1 - result
    first_row = [sum(truth*result), sum(truth*not_result)]
    second_row = [sum(not_truth*result), sum(not_truth*not_result)]
    return np.matrix([first_row, second_row])

def precision_fn(truth,result, zero_division=float('nan')):
    """Return the precision of 0/1 predictions: TP / (TP + FP).

    Parameters
    ----------
    truth : array-like of 0/1
        Ground-truth labels; must support element-wise ``*`` and ``1 - x``.
    result : array-like of 0/1
        Predicted labels, same length as ``truth``.
    zero_division : float, optional
        Value returned when no sample is predicted positive, in which case
        precision is undefined. The default NaN matches what the bare 0/0
        division produced for NumPy inputs, but without a RuntimeWarning.

    Returns
    -------
    float
    """
    true_pos = sum(truth*result)
    false_pos = sum((1-truth)*result)
    predicted_pos = true_pos + false_pos
    if predicted_pos == 0:
        # Nothing was flagged positive: avoid dividing zero by zero.
        return zero_division
    return true_pos/predicted_pos

def recall_fn(truth,result, zero_division=float('nan')):
    """Return the recall of 0/1 predictions: TP / (TP + FN).

    Parameters
    ----------
    truth : array-like of 0/1
        Ground-truth labels; must support element-wise ``*`` and ``1 - x``.
    result : array-like of 0/1
        Predicted labels, same length as ``truth``.
    zero_division : float, optional
        Value returned when ``truth`` contains no positive sample, in which
        case recall is undefined. The default NaN matches what the bare 0/0
        division produced for NumPy inputs, but without a RuntimeWarning.

    Returns
    -------
    float
    """
    true_pos = sum(truth*result)
    false_neg = sum(truth*(1-result))
    actual_pos = true_pos + false_neg
    if actual_pos == 0:
        # No positive labels at all: avoid dividing zero by zero.
        return zero_division
    return true_pos/actual_pos

def F_score_fn(truth,result, zero_division=float('nan')):
    """Return the F1 score (harmonic mean of precision and recall).

    Parameters
    ----------
    truth : array-like of 0/1
        Ground-truth labels; must support element-wise ``*`` and ``1 - x``.
    result : array-like of 0/1
        Predicted labels, same length as ``truth``.
    zero_division : float, optional
        Value returned when precision or recall is undefined (no predicted
        positives or no actual positives). The default NaN matches what the
        original chain of divisions produced in that case.

    Returns
    -------
    float
    """
    true_pos = sum(truth*result)
    false_neg = sum(truth*(1-result))
    false_pos = sum((1-truth)*result)

    if true_pos + false_pos == 0 or true_pos + false_neg == 0:
        return zero_division

    # Closed form of 2 / (1/precision + 1/recall). It needs no reciprocal of
    # precision or recall, so zero true positives yields 0.0 without a
    # divide-by-zero warning.
    return 2*true_pos/(2*true_pos + false_neg + false_pos)


# Summarise missing values column by column
def missing_values_table(df):
    """Return a per-column summary of the missing values in ``df``.

    The result is indexed by column name and has two columns,
    'Missing Values' (count of nulls) and '% of Total Values' (share of rows
    that are null, rounded to one decimal). Columns without missing values
    are left out and the rest are sorted by percentage, highest first.
    A short summary is also printed.
    """
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)

    summary = pd.DataFrame({
        'Missing Values': null_counts,
        '% of Total Values': null_percent,
    })

    # Keep only the columns that actually have gaps, worst first.
    summary = summary[summary['% of Total Values'] != 0]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
           "There are " + str(summary.shape[0]) +
           " columns that have missing values.")

    return summary

### 1. first look into the data and preparation

In [None]:
# Load the training table from the notebook's working directory.
df = pd.read_csv("./application_train.csv", sep = ',')
# Last expression of the cell: displays the first rows.
df.head()

In [None]:
# Show count and percentage of missing values for every column that has any.
missing_values_table(df)

In [None]:
# Number of columns per dtype.
df.dtypes.value_counts()

In [None]:
# Number of distinct values in each object-dtype column.
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

In [None]:
# LabelEncoder is not imported at the top of the notebook, so bring it into
# scope here.
from sklearn.preprocessing import LabelEncoder

# Label-encode every object column that has at most two distinct values.
le = LabelEncoder()
le_count = 0

for col in df:
    if df[col].dtype == 'object':
        # unique() counts NaN as a value as well.
        if len(df[col].unique()) <= 2:
            le.fit(df[col])
            # Transform exactly once: a second transform would be fed the
            # integer codes, which the encoder (fitted on the original
            # labels) has never seen.
            df[col] = le.transform(df[col])

            # Keep track of how many columns were label encoded
            le_count += 1

print('%d columns were label encoded.' % le_count)

In [None]:
# Replace the remaining categorical columns of df with indicator (dummy) columns.
df = pd.get_dummies(df)

### Optional data preprocessing

In [None]:
# Normalization (optional step).
# NOTE(review): the commented-out lines are leftovers from an iris example and
# are not used here.
#iris = load_iris()
#print(iris.data.shape)
# separate the data from the target attributes
#X = iris.data
#y = iris.target
# NOTE(review): X is first assigned in the logistic-regression section further
# down, so this cell only works once that cell has been run — confirm the
# intended execution order.
# preprocessing.normalize rescales each sample (row) to unit norm by default.
normalized_X = preprocessing.normalize(X)

In [None]:
# Standardization (optional step).
# NOTE(review): the commented-out lines are leftovers from an iris example and
# are not used here.
#iris = load_iris()
#print(iris.data.shape)
# separate the data and target attributes
#X = iris.data
#y = iris.target
# NOTE(review): X is first assigned in the logistic-regression section further
# down, so this cell only works once that cell has been run — confirm the
# intended execution order.
# preprocessing.scale centres each feature (column) and scales it to unit variance.
standardized_X = preprocessing.scale(X)

In [None]:
# OneHotEncoder is only imported in a commented-out line at the top of the
# notebook, so import it here.
from sklearn.preprocessing import OneHotEncoder

# Toy example: fit a one-hot encoder on four samples with three integer features.
enc = OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
# The pasted repr of the encoder that followed the fit was dropped: it built a
# second, unused encoder and had no effect on `enc`.
# NOTE(review): `n_values_` is only available on older scikit-learn releases —
# confirm against the installed version.
enc.n_values_

In [None]:
# Inspect the fitted encoder's feature index boundaries.
# NOTE(review): `feature_indices_` is only available on older scikit-learn
# releases — confirm against the installed version.
enc.feature_indices_

In [None]:
# Encode one sample and densify the result for display.
enc.transform([[0, 1, 1]]).toarray()

In [None]:
# Count NaN entries per column. np.isnan only accepts numeric data, so this
# relies on every column of df being numeric at this point.
np.isnan(df).sum()

In [None]:
# Impute missing annuity amounts with the column median. The result is
# assigned back to the column rather than calling fillna(inplace=True) on
# df['AMT_ANNUITY']: that selection can be a temporary copy, in which case the
# in-place fill never reaches df.
df['AMT_ANNUITY'] = df['AMT_ANNUITY'].fillna(df['AMT_ANNUITY'].median())

In [None]:
# Fill the remaining gaps. Numeric columns get their own median: writing the
# string 'missing' into them would turn them into object columns, which the
# estimators fitted below cannot consume.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Any text column that is still present keeps the original 'missing' marker.
object_cols = df.select_dtypes(include='object').columns
if len(object_cols) > 0:
    df[object_cols] = df[object_cols].fillna('missing')

### 2. logistic regression

In [None]:
# Random row split: each row goes to train with probability 0.6, otherwise to test.
train, test = train_test_fn(df, 0.6)

In [None]:
# Separate label and features for both partitions. pop() removes TARGET from
# the frame in place, so X / X_test are the same objects as train / test with
# the label column gone.
# Alternative kept for reference: restrict the features to
# ['AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY'] only.
y = train.pop('TARGET')
X = train

y_test = test.pop('TARGET')
X_test = test

In [None]:
# Fit an L2-penalised logistic regression (liblinear solver) on the training
# partition, then print the share of test rows that are predicted correctly.
logisticRegr = LogisticRegression(solver='liblinear', penalty='l2')
logistic_model = logisticRegr.fit(X, y)

print('Accuracy:', (logistic_model.predict(X_test) == y_test).mean())

In [None]:
# Mean of TARGET over the full table.
df.TARGET.mean()

In [None]:
# Mean of the training labels.
y.mean()

In [None]:
# Sum of the test labels.
y_test.sum()

In [None]:
# Confusion matrix of the logistic model on the test partition.
# confusion_matrix_fn takes (truth, result), so the labels go first and the
# predictions second; the reverse order returns the transposed matrix.
confusion_matrix_fn(y_test, logistic_model.predict(X_test))

In [None]:
# Precision of the logistic model on the test partition.
# precision_fn takes (truth, result); with the arguments reversed it would
# report recall instead.
precision_fn(y_test, logistic_model.predict(X_test))

In [None]:
# Recall of the logistic model on the test partition.
# recall_fn takes (truth, result); with the arguments reversed it would
# report precision instead.
recall_fn(y_test, logistic_model.predict(X_test))

In [None]:
# F1 score of the logistic model on the test partition. Arguments follow the
# helper's (truth, result) signature, like the other metric calls.
F_score_fn(y_test, logistic_model.predict(X_test))

In [None]:
# Scatter the training labels against AMT_INCOME_TOTAL.
plt.figure()
plt.scatter(x=X['AMT_INCOME_TOTAL'] , y=y)
plt.show()

In [None]:
# Scatter the training labels against AMT_CREDIT.
plt.figure()
plt.scatter(x=X['AMT_CREDIT'] , y=y)
plt.show()

In [None]:
# Scatter the training labels against AMT_ANNUITY.
plt.figure()
plt.scatter(x=X['AMT_ANNUITY'] , y=y)
plt.show()

In [None]:
# Scatter the training labels against the annuity-to-income ratio.
plt.figure()
plt.scatter(x=X['AMT_ANNUITY']/X['AMT_INCOME_TOTAL'] , y=y)
plt.show()

In [None]:
# Scatter the training labels against the credit-to-annuity ratio.
plt.figure()
plt.scatter(x=X['AMT_CREDIT']/X['AMT_ANNUITY'] , y=y)
plt.show()

### 3. random forest classifier

In [None]:
# Fit a 1000-tree random forest (depth capped at 10) on the training partition.
# oob_score=True makes the fitted forest expose `oob_score_`, which a later
# cell reads; without it that attribute does not exist.
clf = RandomForestClassifier(n_estimators=1000 , max_depth=10, random_state=0,
                             oob_score=True)
clf.fit(X, y)

In [None]:
# NOTE(review): this looks like the echoed repr of a RandomForestClassifier
# pasted back as code. It builds a new, unfitted classifier whose value is not
# assigned; `clf` is not affected by this cell. Several keyword arguments may
# not be accepted by newer scikit-learn releases — confirm before running.
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            #min_impurity_decrease=0.0, #min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [None]:
# Print the fitted forest's importance score for each feature, in the column
# order of the training data.
print(clf.feature_importances_)

In [None]:
# Score one hand-made applicant. The forest was fitted on every column of X,
# so a bare three-value row has the wrong number of features; start from a
# real training row and override only the three amounts.
# NOTE(review): the three values are assumed to be income, credit and annuity,
# in that order — confirm.
sample = X.iloc[[0]].assign(
    AMT_INCOME_TOTAL=20000, AMT_CREDIT=500000, AMT_ANNUITY=100)
print(clf.predict(sample))

In [None]:
# Out-of-bag score of the fitted forest.
# NOTE(review): scikit-learn only sets this attribute when the forest is
# constructed with oob_score=True — confirm the constructor call.
clf.oob_score_

In [None]:
# Share of rows the forest predicts as positive.
# NOTE(review): clf is fitted on X, but this scores normalized_X from the
# optional preprocessing cell — confirm that the mismatch is intended.
clf.predict(normalized_X).mean()

In [None]:
# Confusion matrix of the forest against the training labels.
# confusion_matrix_fn takes (truth, result), so the labels go first; the
# reverse order returns the transposed matrix.
# NOTE(review): clf is fitted on X, but this scores normalized_X from the
# optional preprocessing cell — confirm that the mismatch is intended.
confusion_matrix_fn(y, clf.predict(normalized_X))

In [None]:
# Precision of the forest on the test partition.
# precision_fn takes (truth, result); with the arguments reversed it would
# report recall instead.
precision_fn(y_test, clf.predict(X_test))

### 4. logistic regression on principal components

In [None]:
# Fit a 30-component PCA on the training features.
pca = PCA(n_components=30)
pca.fit(X)

In [None]:
# Fraction of the total variance explained by each principal component.
print(pca.explained_variance_ratio_)

In [None]:
# Principal axes in feature space, one row per component.
pca.components_

In [None]:
# Project the training features onto the fitted principal components.
X_map = pca.transform(X)

In [None]:
# Fit a fresh L2-penalised logistic regression on the projected training data.
# This rebinds logisticRegr and logistic_model, replacing the model fitted on
# the raw features.
logisticRegr = LogisticRegression(penalty = 'l2' ,solver = 'liblinear')
logistic_model = logisticRegr.fit(X_map, y) 

In [None]:
# Project the test features with the PCA that was fitted on the training
# data. Refitting on X_test would give a different basis than the one the
# logistic model was trained in, and transforming X instead of X_test would
# produce rows that do not line up with y_test.
X_test_map = pca.transform(X_test)

In [None]:
# Share of test rows predicted correctly by the PCA-based model. X_test_map
# must hold one projected row per entry of y_test for the comparison to work.
print('Accuracy:', np.mean(np.equal(logistic_model.predict(X_test_map),y_test)) )