In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, cross_val_score



In [3]:
data = pd.read_csv('/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv')
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
# number of column and rows
data.shape

(100000, 9)

In [5]:
data.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [7]:
# frist split into x & y
np.random.seed(42)
x = data.drop('diabetes', axis=1)
y = data['diabetes']

In [8]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['gender', 'smoking_history']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(x)


In [9]:

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

In [10]:
# find best model
models = {
    "Logistic Regression": LogisticRegression(),
    'K-Nearest': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB()
}

def fit_and_score(models, x_train, x_test, y_train, y_test):
    np.random.seed(42)
    model_score = {}
    for name, model in models.items():
        model.fit(x_train, y_train)
        model_score[name] = model.score(x_test, y_test)
    return model_score


In [11]:
model_scores = fit_and_score(models, x_train, x_test, y_train, y_test);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
model_scores

{'Logistic Regression': 0.9557,
 'K-Nearest': 0.9544,
 'Random Forest': 0.97005,
 'Decision Tree': 0.95245,
 'SVM': 0.94645,
 'Naive Bayes': 0.858}

In [13]:
print(f"The Accuracy of Random Forest classifier Model: %{model_scores['Random Forest']*100 }")

The Accuracy of Random Forest classifier Model: %97.005


In [14]:
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.9702

In [15]:

model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)

# Cross-validated accuracy
cv_rf = cross_val_score(model,
                       transformed_x,
                       y,
                       cv=10,
                       scoring='accuracy')
cv_acc = np.mean(cv_rf)

In [16]:
# Cross-validated precision
cv_precision = cross_val_score(model,
                       transformed_x,
                       y,
                       cv=10,
                       scoring='precision')
cv_precision = np.mean(cv_precision)


In [17]:
# Cross-validated recall
cv_recall = cross_val_score(model,
                       transformed_x,
                       y,
                       cv=10,
                       scoring='recall')
cv_recall = np.mean(cv_recall)


In [18]:
print(f"Cross-validated accuracy: %{cv_acc * 100}")
print(f"Cross-validated precision: {cv_precision}")
print(f"Cross-validated recall: {cv_recall}")

Cross-validated accuracy: %96.971
Cross-validated precision: 0.941815427959298
Cross-validated recall: 0.6869411764705882


In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline 

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# loading data
data = pd.read_csv('/kaggle/input/diabetes-prediction-dataset/diabetes_prediction_dataset.csv')


# frist split into x & y
np.random.seed(42)
x = data.drop('diabetes', axis=1)
y = data['diabetes']


# Turn the categories into numbers
categorical_features = ['gender', 'smoking_history']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_x = transformer.fit_transform(x)


x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)


model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)
cv_rf = cross_val_score(model,
                       transformed_x,
                       y,
                       cv=10,
                       scoring='accuracy')
np.mean(cv_rf)



0.9696299999999999