In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

In [2]:
account = pd.read_csv('data/account.csv', delimiter=";")
card_test = pd.read_csv('data/card_test.csv', delimiter=";")
card_train = pd.read_csv('data/card_train.csv', delimiter=";")
client = pd.read_csv('data/client.csv', delimiter=";")
disp = pd.read_csv('data/disp.csv', delimiter=";")
district = pd.read_csv('data/district.csv', delimiter=";")
loan_test = pd.read_csv('data/loan_test.csv', delimiter=";")
loan_train = pd.read_csv('data/loan_train.csv', delimiter=";")
trans_test = pd.read_csv('data/trans_test.csv', delimiter=";")
trans_train = pd.read_csv('data/trans_train.csv', delimiter=";")


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Account
- account_id
- district_id
- date
- frequency

# Client
- client_id
- birth_number
- district_id

# Disposition
- disp_id
- client_id
- account_id
- type

# Loan
- loan_id
- account_id
- date
- amount
- duration
- payments
- status

# Transition
- trans_id
- account_id
- date
- type
- operation
- amount_balance
- k_symbol
- bank
- account

In [3]:
def renameColumns():
    account.rename({'date': 'account_date', 'district_id': 'account_district_id'}, axis=1, inplace = True)
    client.rename({'district_id': 'client_district_id'}, axis=1, inplace = True)
    loan_train.rename({'date': 'loan_date'}, axis=1, inplace = True)
    loan_test.rename({'date': 'loan_date'}, axis=1, inplace = True)

In [4]:
def getGender(df):
    list = []
    for row in df.itertuples(index = True):
        month = getMonth(row.birth_number)
        birth_number = row.birth_number
        gender = 'M'
        
        if(month > 50):
            birth_number -= 5000
            gender = 'F'
            
        list.append([row.client_id, birth_number, gender, row.client_district_id])
    
    return pd.DataFrame(list, columns=['client_id', 'birth_number', 'gender', 'client_district_id'])

In [5]:
def getMonth(date):
    remove_day = date //100
    month = remove_day % 100
    return month

In [6]:
def getLoansInfo(loan_df):
    client_processed = getGender(client)
    merge1 = pd.merge(client_processed, disp, on ='client_id')
    merge2 = pd.merge(merge1, account, on = 'account_id')
    loan_info = pd.merge(loan_df, merge2, on = 'account_id')
    return loan_info

In [7]:
renameColumns()
loans = getLoansInfo(loan_train)
loans

Unnamed: 0,loan_id,account_id,loan_date,amount,duration,payments,status,client_id,birth_number,gender,client_district_id,disp_id,type,account_district_id,frequency,account_date
0,5314,1787,930705,96396,12,8033,-1,2166,470722,F,30,2166,OWNER,30,weekly issuance,930322
1,5316,1801,930711,165960,36,4610,1,2181,680722,M,46,2181,OWNER,46,monthly issuance,930213
2,6863,9188,930728,127080,60,2118,1,11314,360602,M,45,11006,OWNER,45,monthly issuance,930208
3,5325,1843,930803,105804,36,2939,1,2235,400420,F,14,2235,OWNER,12,monthly issuance,930130
4,7240,11013,930906,274740,60,4579,1,13539,780907,M,63,13231,OWNER,1,weekly issuance,930214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,6818,9030,961212,155616,48,3242,1,11122,790610,M,72,10814,DISPONENT,72,monthly issuance,950121
399,5625,3189,961215,222180,60,3703,-1,3855,390320,M,29,3855,OWNER,29,monthly issuance,951129
400,6805,8972,961221,45024,48,938,1,11050,570504,F,70,10742,OWNER,70,monthly issuance,960521
401,7233,10963,961225,115812,36,3217,1,13480,530601,M,16,13172,OWNER,16,monthly issuance,950520


In [8]:
from sklearn import tree
X_train = loan_train.drop(columns='status')
y_train = loan_train['status'].copy()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

X_test = loan_test.drop(columns='status')
y_test = clf.predict(X_test)

In [9]:
predict_loan = {'Id': X_test["loan_id"], 'Predicted': y_test}
df = pd.DataFrame(data=predict_loan)
df.to_csv('loan_predict.csv', index=False)

In [10]:
def report(results, n_top=3):
    res = []
    res_i = 0
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            res.append([np.multiply(results['mean_test_score'][candidate], -1), 
                        results['std_test_score'][candidate]])
            for key in results['params'][candidate].keys():
                res[res_i].append(results['params'][candidate][key])
            res_i+=1
    
    columns = ['Mean absolute error', 'std'] + list(results['params'][candidate].keys())
    display(pd.DataFrame(res, columns=columns))

In [11]:
from sklearn.model_selection import RandomizedSearchCV, KFold

def find_best_params_kfold(model, X, y, param_grid, n_iter=10, n_splits=3):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    search = RandomizedSearchCV(model, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, n_iter=n_iter, cv=kfold, verbose=0, random_state=0)
    result = search.fit(X, y)
    
    report(result.cv_results_)

In [14]:
max_depth = range(2, 30, 1)
min_samples_split = range(1, 10, 1)

param_grid = dict(max_depth=max_depth, min_samples_split=min_samples_split)

find_best_params_kfold(tree.DecisionTreeClassifier(random_state=0), X_train, y_train, param_grid, 
                       n_iter=50, n_splits=10)

Unnamed: 0,Mean absolute error,std,min_samples_split,max_depth
0,0.292235,0.113927,5,4
1,0.310227,0.115396,9,6
2,0.316477,0.13403,9,2
3,0.316477,0.113644,2,6
4,0.316477,0.13403,6,2
5,0.316477,0.109941,4,3


In [15]:
classifier_cart = tree.DecisionTreeClassifier(max_depth=4, min_samples_split=5, random_state=0)
classifier_cart = classifier_cart.fit(X_train, y_train)

y_test_cart = classifier_cart.predict(X_test)
df = pd.DataFrame(data={'Id': X_test["loan_id"], 'Predicted': y_test_cart})
df.to_csv('loan_predict_cart.csv', index=False)

In [None]:
from sklearn.ensemble import BaggingClassifier
n_estimators = range(50, 1000, 50)
param_grid = dict(n_estimators=n_estimators)

find_best_params_kfold(BaggingClassifier(base_estimator=regr_cart,
                                        n_jobs=-1, random_state=0), 
                       X, y, param_grid, n_iter=30, n_splits=3)

0     -1
1      1
2      1
3      1
4      1
      ..
323    1
324   -1
325    1
326    1
327    1
Name: status, Length: 328, dtype: int64

Unnamed: 0,client_id,birth_number,client_district_id
0,1,706213,18
1,2,450204,1
2,3,406009,1
3,4,561201,5
4,5,605703,5
...,...,...,...
5364,13955,456030,1
5365,13956,430406,1
5366,13968,680413,61
5367,13971,626019,67


In [None]:
def cross_validationScores(modelName, model, X, y, n_folds=10):
    metrics = {'MAE': 'neg_mean_absolute_error', 
               'RMSE': 'neg_root_mean_squared_error'}
    
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    scores = cross_validate(model, X, y, cv=kfold, scoring=metrics, n_jobs=-1)

    # multiply by -1 because sklearn scoring metrics are negative
    mean_mae_score = np.multiply(scores['test_MAE'], -1).mean()
    std_mae_score = np.multiply(scores['test_MAE'], -1).std()
    mean_rmse_score = np.multiply(scores['test_RMSE'], -1).mean()
    std_rmse_score = np.multiply(scores['test_RMSE'], -1).std()

    model_score = pd.DataFrame([[mean_mae_score, mean_rmse_score]], 
                             columns=['MAE', 'RMSE'], index=[modelName])
    global models_mean_scores
    models_mean_scores = models_mean_scores.append(model_score)

    print(str(n_folds) + " fold Cross validation scores for " + modelName)
    score_df = pd.DataFrame([[mean_mae_score, std_mae_score], 
                                  [mean_rmse_score, std_rmse_score]], 
                                 columns=['Mean score', 'std'], 
                                 index=['MAE', 'RMSE'])
    display(score_df)