In [18]:
#importing dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.metrics import confusion_matrix


In [19]:
dataset=pd.read_csv('cleve.csv')
dataset.head()
X=dataset.iloc[:,:-1].values
y=dataset.iloc[:,13].values


In [20]:
#imputation for data cleaning i.e; replace missing values with mean values
from sklearn.impute import SimpleImputer
Imputation=SimpleImputer(strategy="mean",missing_values=np.nan, fill_value=None, verbose=0, copy=True, add_indicator=False)
X[:,11:13]=Imputation.fit_transform(X[:,11:13])



In [21]:
#divide dataset into train and test set
from sklearn.model_selection import train_test_split,GridSearchCV
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=9)

In [22]:
#Scaling is a technique of feature Scaling process
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler(with_mean=True,with_std=True)
X_train=scaler.fit_transform(X_train)
X_test=scaler.fit_transform(X_test)

In [23]:
#logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
lr=LogisticRegression(penalty="l2",solver="lbfgs",random_state=42)
lr.fit(X_train,y_train)
ypred=lr.predict(X_test)
print(accuracy_score(y_test,ypred),roc_auc_score(y_test,ypred))

0.8524590163934426 0.8516483516483517


In [25]:
##naive Bayes 
from sklearn.naive_bayes import BernoulliNB,GaussianNB
#Cb=GaussianNB()
Cb=BernoulliNB()
Cb.fit(X_train,y_train)
y_pred=Cb.predict(X_test)
print(accuracy_score(y_test,y_pred),roc_auc_score(y_test,y_pred))

0.8524590163934426 0.8516483516483517


In [26]:
#DecisionTree
from sklearn.tree import DecisionTreeClassifier
Dt=DecisionTreeClassifier(criterion='gini',max_depth=20,splitter="best",min_samples_split=3,random_state=42)
Dt.fit(X_train,y_train)
y_pred=Dt.predict(X_test)
print(accuracy_score(y_test,y_pred),roc_auc_score(y_test,y_pred))

0.7704918032786885 0.7604395604395604


In [27]:
#K Nearest negibhor
from sklearn.impute import KNNImputer
Kimputer=KNNImputer()
Kimputer.fit_transform(X_train,y_train)
from sklearn.neighbors import KNeighborsClassifier
KNN=KNeighborsClassifier(metric='minkowski',algorithm='kd_tree')
KNN.fit(X_train,y_train)
y_pred=KNN.predict(X_test)
print(accuracy_score(y_test,y_pred),roc_auc_score(y_test,y_pred))

0.8360655737704918 0.8373626373626373


In [28]:
#Support vector Classifier
from sklearn.svm import LinearSVC
svc=LinearSVC(penalty='l2',C=2,verbose=0,max_iter=10000)
svc.fit(X_train,y_train)
y_pred1=svc.predict(X_test)
print(accuracy_score(y_test,y_pred1),roc_auc_score(y_test,y_pred1))


0.8524590163934426 0.8516483516483517


In [29]:
#Support Vector Regression
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error,r2_score,f1_score
svr=LinearSVR(dual=True,max_iter=10000)
svr.fit(X_train,y_train)
y_pred=svr.predict(X_test)
print(mean_squared_error(y_test,y_pred),r2_score(y_test,y_pred))
print(roc_auc_score(y_test,y_pred))
print(accuracy_score(y_test,[round(i) for i in y_pred]))

0.2601383857786869 -0.06370871811263079
0.8362637362637362
0.7377049180327869


In [31]:
#HRFLM
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
rf=RandomForestClassifier(n_estimators=2,min_samples_split=2,random_state=42)
lr=LogisticRegression(penalty="l2",solver="lbfgs")
enc=OneHotEncoder()
rf.fit(X_train,y_train)
enc.fit(rf.apply(X_train))
lr.fit(enc.transform(rf.apply(X_test)),y_test)
y_pred=lr.predict_proba(enc.transform(rf.apply(X_test)))[:,1]
print(accuracy_score(y_test,[round(i) for i in y_pred]))

0.9180327868852459


In [35]:
params = {
    'num_leaves': [10, 30, 40, 50, 60, 70],
    'learning_rate': [0.05, 0.01],
    'n_estimators': [100, 300, 500],
    'subsample': [0.95],
    'colsample_bytree': [0.95],
    'n_jobs': [7],
    'random_state': [42]
}
from sklearn.metrics import roc_auc_score,roc_curve,r2_score,mean_squared_error

gcv = GridSearchCV(LGBMClassifier(), params, cv=5, verbose=1).fit(X_train, y_train)
gbm = gcv.best_estimator_
gbm_pred = gbm.predict(X_test)
print("GBDT ROCAUC score: {:.2f}".format(roc_auc_score(y_test, gbm_pred)))
print(confusion_matrix(y_test,gbm_pred))
print(accuracy_score(y_test,gbm_pred))
print(mean_squared_error(y_test,gbm_pred),r2_score(y_test,gbm_pred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GBDT ROCAUC score: 0.84
[[29  6]
 [ 4 22]]
0.8360655737704918
0.16393442622950818 0.3296703296703296


[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:    9.7s finished


In [36]:
##GBDT+LR
from sklearn.ensemble import GradientBoostingClassifier
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.2)
from sklearn.preprocessing import OneHotEncoder
grd = GradientBoostingClassifier(n_estimators=10)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression(solver='lbfgs', max_iter=1000)

grd.fit(X_train, y_train)
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)
 
y_pred_grd_lm = grd_lm.predict_proba(
grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)
print("The AUC of GBT+LR is:", roc_auc_score(y_test, y_pred_grd_lm))

l=[round(i) for i in y_pred_grd_lm]
print(accuracy_score(y_test,l))


The AUC of GBT+LR is: 0.8516483516483517
0.7704918032786885


In [47]:
import xgboost as xgb
xgb = xgb.XGBClassifier(nthread=4,     #Meaning: When nthread=-1, use all CPUs for parallel operation (default), when nthread=1, use 1 CPU for operation.
                          learning_rate=0.001,    #Meaning: learning rate, which controls the step size when the weight is updated in each iteration, the default is 0.3. Tuning: The smaller the value, the slower the training. The typical value is 0.01-0.2.
                          n_estimators=50,       #Meaning: The total number of iterations, that is, the number of decision trees
                          max_depth=5,         #Meaning: the depth of the tree, the default value is 6, the typical value is 3-10. Parameter tuning: the larger the value, the easier it is to overfit; the smaller the value, the easier it is to underfit
                          gamma=0,               #Meaning: The penalty term coefficient specifies the minimum loss function drop value required for node splitting.
                          subsample=0.9,       #Meaning: When training each tree, the proportion of data used in the total training set. The default value is 1, and the typical value is 0.5-1. Tuning: Prevent overfitting.
                          colsample_bytree=0.5,use_label_encoder=False) #When training each tree, the proportion of features used in total features. The default value is 1, and the typical value is 0.5-1. Tuning: Prevent overfitting.
 
xgb_enc = OneHotEncoder()
xgb_lm = LogisticRegression(solver='lbfgs', max_iter=10000)
xgb.fit(X_train, y_train)
xgb_enc.fit(xgb.apply(X_train))
xgb_lm.fit(xgb_enc.transform(xgb.apply(X_train_lr)), y_train_lr) 
y_pred_xgb_lm = xgb_lm.predict_proba(xgb_enc.transform(xgb.apply(X_test)))[:, 1]

fpr_xgb_lm, tpr_xgb_lm, _ = roc_curve(y_test, y_pred_xgb_lm)
#print("The AUC of xgboost+LR is:", roc_auc_score(y_test, y_pred_xgb_lm))
l=[round(i) for i in y_pred_xgb_lm]
print(accuracy_score(y_test,l))
#print(mean_squared_error(y_test,y_pred_xgb_lm),r2_score(y_test,y_pred_xgb_lm))

0.9016393442622951
