In [64]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import warnings 
warnings.filterwarnings('ignore')

%matplotlib inline

In [65]:
# Cargamos los datos
train = pd.read_csv('train_ctrUa4K.csv')
test = pd.read_csv('test_lAUu6dG.csv')

# Copias de los datos originales
train_original = train.copy()
test_original = test.copy()

In [66]:
# Reemplazando los valores de las variables categoricas
train['Loan_Status'].replace('N', 0, inplace=True)
train['Loan_Status'].replace('Y', 1, inplace=True)
train['Dependents'].replace('3+', 3, inplace=True)
train['Gender'].replace('Male', 0, inplace=True)
train['Gender'].replace('Female', 1, inplace=True)
train['Married'].replace('No', 0, inplace=True)
train['Married'].replace('Yes', 1, inplace=True)
train['Education'].replace('Not Graduate', 0, inplace=True)
train['Education'].replace('Graduate', 1, inplace=True)
train['Self_Employed'].replace('No', 0, inplace=True)
train['Self_Employed'].replace('Yes', 1, inplace=True)
train['Property_Area'].replace('Rural', 0, inplace=True)
train['Property_Area'].replace('Semiurban', 1, inplace=True)
train['Property_Area'].replace('Urban', 2, inplace=True)

test['Dependents'].replace('3+', 3, inplace=True)
test['Gender'].replace('Male', 0, inplace=True)
test['Gender'].replace('Female', 1, inplace=True)
test['Married'].replace('No', 0, inplace=True)
test['Married'].replace('Yes', 1, inplace=True)
test['Education'].replace('Not Graduate', 0, inplace=True)
test['Education'].replace('Graduate', 1, inplace=True)
test['Self_Employed'].replace('No', 0, inplace=True)
test['Self_Employed'].replace('Yes', 1, inplace=True)
test['Property_Area'].replace('Rural', 0, inplace=True)
test['Property_Area'].replace('Semiurban', 1, inplace=True)
test['Property_Area'].replace('Urban', 2, inplace=True)

In [67]:
# Ver datos agrupados de una columna
train.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,0.0,0.0,0,1,0.0,5849,0.0,,360.0,1.0,2,1
1,LP001003,0.0,1.0,1,1,0.0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,0.0,1.0,0,1,1.0,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,0.0,1.0,0,0,0.0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,0.0,0.0,0,1,0.0,6000,0.0,141.0,360.0,1.0,2,1


In [68]:
test

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,0.0,1,0,1,0.0,5720,0,110.0,360.0,1.0,2
1,LP001022,0.0,1,1,1,0.0,3076,1500,126.0,360.0,1.0,2
2,LP001031,0.0,1,2,1,0.0,5000,1800,208.0,360.0,1.0,2
3,LP001035,0.0,1,2,1,0.0,2340,2546,100.0,360.0,,2
4,LP001051,0.0,0,0,0,0.0,3276,0,78.0,360.0,1.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
362,LP002971,0.0,1,3,0,1.0,4009,1777,113.0,360.0,1.0,2
363,LP002975,0.0,1,0,1,0.0,4158,709,115.0,360.0,1.0,2
364,LP002980,0.0,0,0,1,0.0,3250,1993,126.0,360.0,,1
365,LP002986,0.0,1,0,1,0.0,5000,2393,158.0,360.0,1.0,0


In [69]:
# Llenando los valores faltantes
train['Gender'].fillna(train['Gender'].mode()[0], inplace=True)
train['Married'].fillna(train['Married'].mode()[0], inplace=True)
train['Dependents'].fillna(train['Dependents'].mode()[0], inplace=True)
train['Self_Employed'].fillna(train['Self_Employed'].mode()[0], inplace=True)
train['Credit_History'].fillna(train['Credit_History'].mode()[0], inplace=True)

train['Loan_Amount_Term'].fillna(train['Loan_Amount_Term'].mode()[0], inplace=True)
train['LoanAmount'].fillna(train['LoanAmount'].median(), inplace=True)

In [70]:
# eliminando la columna Loan_ID
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

In [71]:
# Se designa la variable objetivo y las variables independientes
X = train.drop('Loan_Status', axis=1)
y = train.Loan_Status

In [72]:
X = pd.get_dummies(X)  # se convierten los datos categoricos en numericos
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [73]:
# Se divide el conjunto de datos en entrenamiento y validacion
from sklearn.model_selection import train_test_split
x_train, x_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3)

In [74]:
# para hacer la validacion cruzada
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score  # para ver la precision del modelo
from xgboost import XGBClassifier

# se crea el modelo
i = 1
kf = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
for train_index, test_index in kf.split(X, y):
    print('\n{} of kfold {}'.format(i, kf.n_splits))
    xtr, xvl = X.loc[train_index], X.loc[test_index]
    ytr, yvl = y[train_index], y[test_index]
    model = XGBClassifier(n_estimators=50, max_depth=4)
    model.fit(xtr, ytr)
    pred_test = model.predict(xvl)
    score = accuracy_score(yvl, pred_test)
    print('accuracy_score', score)
    i += 1
    pred_test = model.predict(test)
    pred3 = model.predict_proba(test)[:, 1]
# Aqui se muestran los 5 mejores modelos de acuerdo a su precision


1 of kfold 5
accuracy_score 0.8130081300813008

2 of kfold 5
accuracy_score 0.7967479674796748

3 of kfold 5
accuracy_score 0.8130081300813008

4 of kfold 5
accuracy_score 0.7723577235772358

5 of kfold 5
accuracy_score 0.7540983606557377


In [75]:
X

Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Dependents_3,Dependents_0,Dependents_1,Dependents_2
0,0.0,0.0,1,0.0,5849,0.0,128.0,360.0,1.0,2,False,True,False,False
1,0.0,1.0,1,0.0,4583,1508.0,128.0,360.0,1.0,0,False,False,True,False
2,0.0,1.0,1,1.0,3000,0.0,66.0,360.0,1.0,2,False,True,False,False
3,0.0,1.0,0,0.0,2583,2358.0,120.0,360.0,1.0,2,False,True,False,False
4,0.0,0.0,1,0.0,6000,0.0,141.0,360.0,1.0,2,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,1.0,0.0,1,0.0,2900,0.0,71.0,360.0,1.0,0,False,True,False,False
610,0.0,1.0,1,0.0,4106,0.0,40.0,180.0,1.0,0,True,False,False,False
611,0.0,1.0,1,0.0,8072,240.0,253.0,360.0,1.0,2,False,False,True,False
612,0.0,1.0,1,0.0,7583,0.0,187.0,360.0,1.0,2,False,False,False,True


In [76]:
# se hacen las predicciones con los datos de prueba
pred_cv = model.predict(test)
if pred_cv[0] == 0:
    print('No Aprobado')
else:
    print('Aprobado')

Aprobado


Unnamed: 0,Gender,Married,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Dependents_3,Dependents_0,Dependents_1,Dependents_2
0,0.0,1,1,0.0,5720,0,110.0,360.0,1.0,2,False,True,False,False
1,0.0,1,1,0.0,3076,1500,126.0,360.0,1.0,2,False,False,True,False
2,0.0,1,1,0.0,5000,1800,208.0,360.0,1.0,2,False,False,False,True
3,0.0,1,1,0.0,2340,2546,100.0,360.0,,2,False,False,False,True
4,0.0,0,0,0.0,3276,0,78.0,360.0,1.0,2,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,0.0,1,0,1.0,4009,1777,113.0,360.0,1.0,2,True,False,False,False
363,0.0,1,1,0.0,4158,709,115.0,360.0,1.0,2,False,True,False,False
364,0.0,0,1,0.0,3250,1993,126.0,360.0,,1,False,True,False,False
365,0.0,1,1,0.0,5000,2393,158.0,360.0,1.0,0,False,True,False,False
