In [1]:
import numpy as np
import pandas as pd
pd.pandas.set_option("display.max_columns",None)

In [2]:
data= pd.read_csv("train.csv",na_values=["NA","?"])

In [3]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
data.shape

(614, 13)

In [5]:
data.drop("Loan_ID",axis=1,inplace=True)

In [6]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [7]:
data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
cols_with_nan= [feature for feature in data.columns if data[feature].isnull().sum()>1]

for feature in cols_with_nan:
    print(f"{feature} \t {np.round(data[feature].isnull().mean()*100,2)}% missing values")

Gender 	 2.12% missing values
Married 	 0.49% missing values
Dependents 	 2.44% missing values
Self_Employed 	 5.21% missing values
LoanAmount 	 3.58% missing values
Loan_Amount_Term 	 2.28% missing values
Credit_History 	 8.14% missing values


In [9]:
# Handling Categorical missing values

cat_feat_nan= [feature for feature in data.columns if data[feature].isnull().sum()>1 and data[feature].dtype=="O"]

print("[INFO] loading categorical features having nan values....")

for feature in cat_feat_nan:
    print(feature)

[INFO] loading categorical features having nan values....
Gender
Married
Dependents
Self_Employed


In [10]:
def replace_cat_feature(dataset,feat_nan):
    data=dataset.copy()
    data[feat_nan]= data[feat_nan].fillna("Missing")
    return data

print("[INFO] Filling Nan values....")
data= replace_cat_feature(data,cat_feat_nan)

[INFO] Filling Nan values....


In [11]:
data[cat_feat_nan].isnull().sum()

Gender           0
Married          0
Dependents       0
Self_Employed    0
dtype: int64

In [12]:
data[cat_feat_nan].head(3)

Unnamed: 0,Gender,Married,Dependents,Self_Employed
0,Male,No,0,No
1,Male,Yes,1,No
2,Male,Yes,0,Yes


In [13]:
# Handling numerical features having nan values

num_feat_nan= [feature for feature in data.columns if data[feature].isnull().sum()>1 and data[feature].dtype!="O"]

print("[INFO] loading numerical features with nan....\n")

for feature in num_feat_nan:
    print(f"{feature}\t{np.round(data[feature].isnull().mean(),2)}% missing values")

[INFO] loading numerical features with nan....

LoanAmount	0.04% missing values
Loan_Amount_Term	0.02% missing values
Credit_History	0.08% missing values


In [14]:
data["LoanAmount"].fillna(data["LoanAmount"].median(),inplace=True)
data["Loan_Amount_Term"].fillna(data["Loan_Amount_Term"],inplace=True)
data["Credit_History"].fillna(data["Credit_History"].median(),inplace=True)

In [15]:
for feature in num_feat_nan:
    print(f"{feature}\t{np.round(data[feature].isnull().mean(),2)}% missing values")

LoanAmount	0.0% missing values
Loan_Amount_Term	0.02% missing values
Credit_History	0.0% missing values


In [16]:
data.isnull().sum()

Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

In [17]:
data.head(5)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [18]:
data["Loan_Amount_Term"].fillna(data["Loan_Amount_Term"].median(),inplace=True)

In [19]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [20]:
X= data.drop(["Loan_Status"],axis=1)

In [21]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


In [22]:
X["Dependents"].value_counts()

0          345
1          102
2          101
3+          51
Missing     15
Name: Dependents, dtype: int64

In [23]:
y= data.iloc[:,[-1]]

In [24]:
y.head()

Unnamed: 0,Loan_Status
0,Y
1,N
2,Y
3,Y
4,Y


In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
lb= LabelEncoder()

In [27]:
X["Gender"]= lb.fit_transform(X["Gender"])
X["Married"]= lb.fit_transform(X["Married"])
X["Education"]= lb.fit_transform(X["Education"])
X["Property_Area"]= lb.fit_transform(X["Property_Area"])
X["Self_Employed"]= lb.fit_transform(X["Self_Employed"])
X["Dependents"]= lb.fit_transform(X["Dependents"])

In [28]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,1,5849,0.0,128.0,360.0,1.0,2
1,1,2,1,0,1,4583,1508.0,128.0,360.0,1.0,0
2,1,2,0,0,2,3000,0.0,66.0,360.0,1.0,2
3,1,2,0,1,1,2583,2358.0,120.0,360.0,1.0,2
4,1,1,0,0,1,6000,0.0,141.0,360.0,1.0,2


In [29]:
# Doing One Hot Encoding for all the categorical features

# gender= pd.get_dummies(X["Gender"],drop_first=True)
# married= pd.get_dummies(X["Married"],drop_first=True)
# education= pd.get_dummies(X["Education"],drop_first=True)
# property_area= pd.get_dummies(X["Property_Area"],drop_first=True)
# employed= pd.get_dummies(X["Self_Employed"],drop_first=True)
# dependents= pd.get_dummies(X["Dependents"],drop_first=True)

In [30]:
# X= pd.concat([X,gender],axis=1)

In [31]:
# X= pd.concat([X,married,education],axis=1)

In [32]:
# X= pd.concat([X,property_area],axis=1)

In [33]:
# X= pd.concat([X,employed],axis=1)

In [34]:
# X= pd.concat([X,dependents],axis=1)

In [35]:
X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,1,5849,0.0,128.0,360.0,1.0,2
1,1,2,1,0,1,4583,1508.0,128.0,360.0,1.0,0
2,1,2,0,0,2,3000,0.0,66.0,360.0,1.0,2
3,1,2,0,1,1,2583,2358.0,120.0,360.0,1.0,2
4,1,1,0,0,1,6000,0.0,141.0,360.0,1.0,2


In [36]:
X.shape

(614, 11)

In [37]:
# loan= pd.get_dummies(y["Loan_Status"],prefix="Loan",drop_first=True)
y= lb.fit_transform(y)

  y = column_or_1d(y, warn=True)


In [38]:
# y= pd.concat([y,loan],axis=1)

In [39]:
# y.drop(["Loan_Status"],axis=1,inplace=True)

In [40]:
y

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,

In [41]:
from sklearn.preprocessing import MinMaxScaler

In [42]:
scaler= MinMaxScaler()

In [43]:
X= scaler.fit_transform(X)

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [46]:
X_train.shape,X_test.shape

((491, 11), (123, 11))

# XGBoost

In [47]:
import xgboost

In [48]:
xg= xgboost.XGBClassifier()

In [49]:
xg.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [50]:
xg.score(X_test,y_test)*100

75.60975609756098

In [51]:
xg.score(X_train,y_train)*100

87.37270875763747

# Random Forest

In [52]:
from sklearn.ensemble import RandomForestRegressor

In [53]:
rg= RandomForestClassifier(n_estimators=400,n_jobs=-1)

<IPython.core.display.Javascript object>

In [54]:
rg.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [55]:
rg.score(X_test,y_test)*100

76.42276422764228

In [56]:
rg.score(X_train,y_train)

1.0

# Logisitic Regression

In [57]:
from sklearn.linear_model import LogisticRegression

In [58]:
lg= LogisticRegression()

In [59]:
lg.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [60]:
lg.score(X_test,y_test)*100

78.86178861788618

# SVM

In [61]:
from sklearn.svm import SVC

In [62]:
sv= SVC()

In [63]:
sv.fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [64]:
sv.score(X_test,y_test)*100

78.86178861788618

In [65]:
sv.score(X_train,y_train)

0.8167006109979633

# Neural Network

In [66]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [139]:
model= Sequential()
model.add(Dense(16,activation="relu",input_dim=11,kernel_initializer="he_uniform"))
model.add(Dense(8,activation="relu",kernel_initializer="he_uniform"))
model.add(Dense(1,activation="sigmoid",kernel_initializer="he_uniform"))

In [140]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                192       
_________________________________________________________________
dense_4 (Dense)              (None, 8)                 136       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 9         
Total params: 337
Trainable params: 337
Non-trainable params: 0
_________________________________________________________________


In [141]:
model.compile(loss="binary_crossentropy",optimizer="adam",metrics=["accuracy"])

In [142]:
model.fit(X_train,y_train,epochs=200,validation_data=(X_test,y_test))

Train on 491 samples, validate on 123 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
E

Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 

<tensorflow.python.keras.callbacks.History at 0x2261f0d12e8>

# Hyperparamter Tuning

In [52]:
# Hyperparameter tuning

classifier= xgboost.XGBClassifier()

In [53]:
classifier.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [54]:
booster= ["gbtree","gblinear"]
base_score= [0.25,0.5,0.75,1]
n_estimators= [20,50,80,100,200,500,900]
max_depth= [2,3,5,10,15]
learning_rate=[0.05,0.001,0.1,1.0,0.15,0.20]
min_child_weight= [1,2,3,4]

In [55]:
hyperparameter_grid= {
    "n_estimators":n_estimators,
    "max_depth":max_depth,
    "booster":booster,
    "base_score":base_score,
    "learning_rate":learning_rate,
    "min_child_weight":min_child_weight,
}
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

In [56]:
random_cv= RandomizedSearchCV(estimator=classifier,
                             param_distributions=hyperparameter_grid,
                             cv=5,
                             n_iter=50,
                             
                             n_jobs=4,
                             verbose=5,
                             return_train_score=True,
                             random_state=42)

random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 108 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed:    8.0s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=None, n_estimators=100,
                                           n_jobs=1, nthread=None,
                                           objective='binary:logistic',
                                           random_state=0, reg_alpha=0,
                                           reg_lambda=1, sc...
                   iid='deprecated', n_iter=50, n_jobs=4,
                   param_distributions={'base_score': [0.25, 0.5, 0.75, 1],
                                        'bo

In [57]:
random_cv.best_estimator_

XGBClassifier(base_score=0.75, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=20, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [58]:
classifier= xgboost.XGBClassifier(base_score=0.75, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=20, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [59]:
classifier.fit(X_train,y_train)

XGBClassifier(base_score=0.75, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=20, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [60]:
classifier.score(X_test,y_test)*100

78.86178861788618

# Testing Data

In [94]:
test_data= pd.read_csv("test.csv",na_values=["?","NA"])

In [95]:
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [96]:
id_df= pd.DataFrame(test_data["Loan_ID"])

In [97]:
id_df

Unnamed: 0,Loan_ID
0,LP001015
1,LP001022
2,LP001031
3,LP001035
4,LP001051
...,...
362,LP002971
363,LP002975
364,LP002980
365,LP002986


In [98]:
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [99]:
test_data["LoanAmount"].fillna(test_data["LoanAmount"].median(),inplace=True)
test_data["Loan_Amount_Term"].fillna(test_data["Loan_Amount_Term"],inplace=True)
test_data["Credit_History"].fillna(test_data["Credit_History"].median(),inplace=True)

In [100]:
test_data.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      6
Credit_History        0
Property_Area         0
dtype: int64

In [101]:
# Handling Categorical missing values

test_cat_feat_nan= [feature for feature in test_data.columns if test_data[feature].isnull().sum()>1 and test_data[feature].dtype=="O"]

print("[INFO] loading categorical features having nan values....")

for feature in test_cat_feat_nan:
    print(feature)

[INFO] loading categorical features having nan values....
Gender
Dependents
Self_Employed


In [102]:
def replace_cat_feature(dataset,feat_nan):
    test_data=dataset.copy()
    test_data[feat_nan]= test_data[feat_nan].fillna("Missing")
    return data

print("[INFO] Filling Nan values....")
test_data= replace_cat_feature(test_data,test_cat_feat_nan)

[INFO] Filling Nan values....


In [103]:
test_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [104]:
test_data["Loan_Amount_Term"].fillna(data["Loan_Amount_Term"].median(),inplace=True)

In [105]:
test_data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [106]:
test_data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,1,5849,0.0,128.0,360.0,1.0,2
1,1,2,1,0,1,4583,1508.0,128.0,360.0,1.0,0
2,1,2,0,0,2,3000,0.0,66.0,360.0,1.0,2
3,1,2,0,1,1,2583,2358.0,120.0,360.0,1.0,2
4,1,1,0,0,1,6000,0.0,141.0,360.0,1.0,2


In [107]:
from sklearn.preprocessing import LabelEncoder
lb= LabelEncoder()

In [108]:
cols=["Gender","Married","Education","Self_Employed","Property_Area"]
for i in range(0,5):
    test_data[cols[i]]= lb.fit_transform(test_data[cols[i]])
    i+=1  # this is correct code

In [109]:
test_data["Dependents"]= lb.fit_transform(test_data["Dependents"])

In [110]:
test_data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,1,1,0,0,1,5849,0.0,128.0,360.0,1.0,2
1,1,2,1,0,1,4583,1508.0,128.0,360.0,1.0,0
2,1,2,0,0,2,3000,0.0,66.0,360.0,1.0,2
3,1,2,0,1,1,2583,2358.0,120.0,360.0,1.0,2
4,1,1,0,0,1,6000,0.0,141.0,360.0,1.0,2


In [112]:
mn= MinMaxScaler()
test_data= mn.fit_transform(test_data)
test_data

array([[0.5       , 0.5       , 0.        , ..., 0.74358974, 1.        ,
        1.        ],
       [0.5       , 1.        , 0.25      , ..., 0.74358974, 1.        ,
        0.        ],
       [0.5       , 1.        , 0.        , ..., 0.74358974, 1.        ,
        1.        ],
       ...,
       [0.5       , 1.        , 0.25      , ..., 0.74358974, 1.        ,
        1.        ],
       [0.5       , 1.        , 0.5       , ..., 0.74358974, 1.        ,
        1.        ],
       [0.        , 0.5       , 0.        , ..., 0.74358974, 0.        ,
        0.5       ]])

In [113]:
test_data

array([[0.5       , 0.5       , 0.        , ..., 0.74358974, 1.        ,
        1.        ],
       [0.5       , 1.        , 0.25      , ..., 0.74358974, 1.        ,
        0.        ],
       [0.5       , 1.        , 0.        , ..., 0.74358974, 1.        ,
        1.        ],
       ...,
       [0.5       , 1.        , 0.25      , ..., 0.74358974, 1.        ,
        1.        ],
       [0.5       , 1.        , 0.5       , ..., 0.74358974, 1.        ,
        1.        ],
       [0.        , 0.5       , 0.        , ..., 0.74358974, 0.        ,
        0.5       ]])

In [114]:
y_pred= classifier.predict(test_data)

In [115]:
y_pred

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,

In [116]:
pred1_df= pd.DataFrame(y_pred,columns=["Loan_Status"])

In [None]:
predq_df

In [90]:
upload2= pd.concat([id_df,pred_df],axis=1)

In [91]:
upload2

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,1
...,...,...
609,,1
610,,1
611,,1
612,,1


In [92]:
upload2.to_csv("xg1.csv",index=False)

In [93]:
pd.read_csv("xg1.csv")

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,1
1,LP001022,1
2,LP001031,1
3,LP001035,1
4,LP001051,1
...,...,...
609,,1
610,,1
611,,1
612,,1
