In [5]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score,roc_curve
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from warnings import filterwarnings
from skompiler import skompile
filterwarnings('ignore')

In [6]:
diabetes = pd.read_csv('diabetes.csv')
df = diabetes.copy()
df = df.dropna()
y = df['Outcome']
X = df.drop(['Outcome'],axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 44)

In [7]:
lgbm = LGBMClassifier()
lgbm_model = lgbm.fit(X_train, y_train)

In [9]:
y_pred = lgbm_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7229437229437229

In [10]:
#TUNING

In [11]:
lgbm_params = {
    'min_child_samples':[2, 5, 10, 20],
    'n_estimators':[100, 500, 1000, 2000],
    'max_depth':[3, 4, 5, 6],
    'learning_rate':[0.1, 0.01, 0.02, 0.05],
    'subsample':[0.6, 0.8, 1.0]
}

lgbm = LGBMClassifier()

In [12]:
lgbm_cv_model = GridSearchCV(lgbm, lgbm_params, cv = 10, n_jobs = -1, verbose = 2)

In [13]:
lgbm_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 768 candidates, totalling 7680 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   17.2s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 1061 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 1506 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 2033 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 2688 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 3377 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 4172 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 5045 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 5976 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 7101 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 7680 out of 7680 | elapsed:  8.6min finished


GridSearchCV(cv=10, estimator=LGBMClassifier(), n_jobs=-1,
             param_grid={'learning_rate': [0.1, 0.01, 0.02, 0.05],
                         'max_depth': [3, 4, 5, 6],
                         'min_child_samples': [2, 5, 10, 20],
                         'n_estimators': [100, 500, 1000, 2000],
                         'subsample': [0.6, 0.8, 1.0]},
             verbose=2)

In [14]:
lgbm_cv_model.best_params_

{'learning_rate': 0.02,
 'max_depth': 5,
 'min_child_samples': 10,
 'n_estimators': 100,
 'subsample': 0.6}

In [15]:
lgbm_tuned = LGBMClassifier(
     learning_rate = 0.02,
     max_depth = 5,
     min_child_samples = 10,
     n_estimators = 100,
     subsample = 0.6).fit(X_train, y_train)

In [16]:
y_pred = lgbm_tuned.predict(X_test)
accuracy_score(y_test, y_pred)

0.7705627705627706