In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root_dir, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id='begin'></a>
# <h1 style="background-color:skyblue; font-family:newtimeroman; font-size:350%; text-align:center; border-radius: 15px 50px;">Pima Indian</h1>

## This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

<center><img
src="https://www.legendsofamerica.com/wp-content/uploads/2018/12/PimaIndiansCarloGentile1870.jpg" style="width:50%;height:50%;">
</center>

### Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
<br>

* **Pregnancies: Number of times pregnant**
* **Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test**
* **BloodPressure: Diastolic blood pressure (mm Hg)**
* **SkinThickness: Triceps skin fold thickness (mm)**
* **Insulin: 2-Hour serum insulin (mu U/ml)**
* **BMI: Body mass index (weight in kg/(height in m)^2)**
* **DiabetesPedigreeFunction: Diabetes pedigree function**
* **Age: Age (years)**
* **Outcome: Class variable (0 or 1); 268 of the 768 records are 1, the others are 0**

<a id='load-data'></a>
# <h1 style="background-color:skyblue; font-family:newtimeroman; font-size:350%; text-align:center; border-radius: 15px 50px;">Load Data 📚</h1>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from sklearn import preprocessing
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

# Display configuration: floats rendered with five decimals, and no truncation
# of rows or columns when a frame is shown.
pd.set_option("display.float_format", "{:.5f}".format)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Read the diabetes CSV from the Kaggle input directory into `df`
# and display its first rows as the cell output.
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")
df.head()

<a id='check-data'></a>
# <h1 style="background-color:skyblue; font-family:newtimeroman; font-size:350%; text-align:center; border-radius: 15px 50px;">Check Data 🔎</h1>

In [None]:
def check_df(dataframe, head=3, quantiles=(0, 0.05, 0.50, 0.95, 0.99, 1)):
    """Print a quick structural summary of a DataFrame.

    Prints, in order: shape, column dtypes, the first and last ``head`` rows,
    the number of missing values per column, and the requested quantiles of
    the numeric columns (transposed, one row per column).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to summarise.
    head : int, default 3
        Number of rows shown in the head and tail previews.
    quantiles : sequence of float, default (0, 0.05, 0.50, 0.95, 0.99, 1)
        Quantile levels (between 0 and 1) to report.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # numeric_only=True: without it, recent pandas versions raise a TypeError
    # as soon as the frame contains a non-numeric (e.g. object) column.
    print(dataframe.quantile(list(quantiles), numeric_only=True).T)

# Print the structural summary (shape, dtypes, head/tail, NA counts, quantiles) of the loaded data.
check_df(df)

### BMI and several other measurements (Glucose, BloodPressure, SkinThickness, Insulin) cannot be zero. Zeros in these columns are therefore missing values and should be replaced with NaN.

In [None]:
# We convert zeros in these variables to NaN, since a zero reading is treated as missing.
# The result is assigned back to `df` instead of calling
# `df[col].replace(..., inplace=True)`: under pandas Copy-on-Write, `df[col]`
# is a temporary object, so the in-place change would never reach `df`.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[cols] = df[cols].replace(0, np.nan)

In [None]:
# missingno bar chart of `df`, used to see how complete each column is.
# The trailing semicolon suppresses the text repr of the returned object.
msno.bar(df);

In [None]:
# missingno heatmap of `df`, used to see whether missingness in one column
# tends to co-occur with missingness in another.
msno.heatmap(df);

<a id='data-preprocessing'></a>
# <h1 style="background-color:skyblue; font-family:newtimeroman; font-size:350%; text-align:center; border-radius: 15px 50px;">Data Preprocessing 🛠️</h1>

In [None]:
# Fill each NaN with the median of its column among rows of the same Outcome class.
# groupby/transform computes the per-class median once per column and works for
# any number of classes; the target column itself is skipped.
# NOTE(review): the imputation uses the target, so label information leaks into
# the features; for an unbiased evaluation, derive the medians from the training
# split only.
for col in df.columns:
    if col == "Outcome":
        continue
    class_median = df.groupby("Outcome")[col].transform("median")
    df[col] = df[col].fillna(class_median)

In [None]:
# Outliers visualization: one categorical plot per feature, grouped by Outcome.
# x and y are passed by keyword: since seaborn 0.12 they are keyword-only, so
# the positional form `sns.catplot("Outcome", col, data=df)` raises a TypeError.
for col in df.columns:
    if col != "Outcome":
        sns.catplot(x="Outcome", y=col, data=df)

In [None]:
# Histograms of the columns of `df` on a 15x7 figure; the semicolon hides the returned axes repr.
df.hist(figsize = (15,7));

In [None]:
# Outliers
def outlier_thresholds(dataframe, col_name, th1=0.05, th3=0.95):
    """Return the (low_limit, up_limit) outlier bounds for one column.

    The bounds follow the IQR rule, but are computed on the ``th1`` and ``th3``
    quantiles instead of the classic quartiles:
    ``low = q(th1) - 1.5 * range`` and ``up = q(th3) + 1.5 * range`` where
    ``range = q(th3) - q(th1)``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column.
    col_name : str
        Name of the column to analyse.
    th1, th3 : float, default 0.05 and 0.95
        Lower and upper quantile levels.

    Returns
    -------
    tuple
        ``(low_limit, up_limit)``.
    """
    lower_quantile = dataframe[col_name].quantile(th1)
    upper_quantile = dataframe[col_name].quantile(th3)
    quantile_range = upper_quantile - lower_quantile
    low_limit = lower_quantile - 1.5 * quantile_range
    up_limit = upper_quantile + 1.5 * quantile_range
    return low_limit, up_limit


def check_outlier(dataframe, col_name, th1=0.05, th3=0.95):
    """Return True if ``col_name`` has at least one value outside its outlier bounds.

    ``th1`` and ``th3`` are forwarded to ``outlier_thresholds`` so the check can
    use the same bounds as ``replace_with_thresholds``.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, th1, th3)
    values = dataframe[col_name]
    # Test the boolean mask itself. Calling .any() on the filtered rows would
    # instead test the truthiness of their *values*, so an outlier row made of
    # zeros/False would go unreported.
    return bool(((values > up_limit) | (values < low_limit)).any())


def replace_with_thresholds(dataframe, col_name, th1=0.05, th3=0.95):
    """Cap the outliers of ``col_name`` in place at the outlier bounds.

    Values above the upper bound are always set to the upper bound. Values
    below the lower bound are set to the lower bound only when that bound is
    positive. The frame is modified in place and nothing is returned.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name, th1, th3)
    # Lower capping is applied only for a positive lower bound.
    if low_limit > 0:
        dataframe.loc[dataframe[col_name] < low_limit, col_name] = low_limit
    dataframe.loc[dataframe[col_name] > up_limit, col_name] = up_limit

In [None]:
# Numerical columns: numeric dtype and more than 10 distinct values.
# `is_numeric_dtype` is used instead of `dtype in [int, float]`, because that
# comparison only matches the dtypes the Python `int`/`float` types map to and
# silently skips other numeric widths (e.g. int32, float32, unsigned ints).
num_cols = [col for col in df.columns
            if pd.api.types.is_numeric_dtype(df[col]) and df[col].nunique() > 10]

In [None]:
# Check Outliers: report, for every column, whether any value lies outside its bounds.
# Each flag is printed next to its column name so the output can be read on its own.
for col in df.columns:
    print(f"{col}: {check_outlier(df, col)}")

In [None]:
# Replace Outliers: cap every feature at its outlier bounds.
# The target column is excluded so class labels can never be altered by capping.
for col in df.columns:
    if col != "Outcome":
        replace_with_thresholds(df, col)

In [None]:
# Check Outliers again after capping: report, for every column, whether any value is still outside its bounds.
# Each flag is printed next to its column name so the output can be read on its own.
for col in df.columns:
    print(f"{col}: {check_outlier(df, col)}")

<a id='feature-engineering'></a>
# <h1 style="background-color:skyblue; font-family:newtimeroman; font-size:350%; text-align:center; border-radius: 15px 50px;"> Feature Engineering ⚙️</h1>

In [None]:
def label_encoder(dataframe, binary_col):
    """Label-encode one column of ``dataframe`` in place.

    A fresh ``LabelEncoder`` is fitted on ``dataframe[binary_col]`` and the
    column is overwritten with the encoded values. The same (mutated) frame is
    returned for convenience.
    """
    encoder = preprocessing.LabelEncoder()
    encoded_values = encoder.fit_transform(dataframe[binary_col])
    dataframe[binary_col] = encoded_values
    return dataframe


def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    """Return a new frame in which ``categorical_cols`` are replaced by dummy columns.

    Thin wrapper around ``pd.get_dummies``; ``drop_first`` is forwarded unchanged.
    """
    return pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)


def rare_analyser(dataframe, target, rare_perc):
    """Print a frequency report for every object column that contains a rare label.

    A column is reported when it has object dtype and at least one of its
    labels covers less than ``rare_perc`` of the rows. For each such column the
    number of labels is printed, followed by a table with the count, the ratio
    and the mean of ``target`` per label.
    """
    n_rows = len(dataframe)
    for col in dataframe.columns:
        if dataframe[col].dtypes != 'O':
            continue
        counts = dataframe[col].value_counts()
        ratios = counts / n_rows
        if not (ratios < rare_perc).any():
            continue
        print(col, ":", len(counts))
        report = pd.DataFrame({"COUNT": counts,
                               "RATIO": ratios,
                               "TARGET_MEAN": dataframe.groupby(col)[target].mean()})
        print(report, end="\n\n\n")


def rare_encoder(dataframe, rare_perc):
    """Return a copy of ``dataframe`` with rare labels collapsed into 'Rare'.

    In every object column, each label covering less than ``rare_perc`` of the
    rows is replaced by the string 'Rare'. Columns without such labels, and the
    input frame itself, are left untouched.
    """
    encoded = dataframe.copy()
    n_rows = len(encoded)

    object_cols = [name for name in encoded.columns if encoded[name].dtypes == 'O']
    for name in object_cols:
        frequencies = encoded[name].value_counts() / n_rows
        rare_labels = frequencies.index[frequencies < rare_perc]
        if len(rare_labels) == 0:
            # Nothing rare in this column: keep it exactly as it is.
            continue
        is_rare = encoded[name].isin(rare_labels)
        encoded[name] = np.where(is_rare, 'Rare', encoded[name])

    return encoded

In [None]:
# New categorical BMI
# pd.cut bins are right-closed by default: (0, 18.4], (18.4, 25.0], (25.0, 30.0], (30.0, 70.0].
# Values outside the bins become NaN; astype('O') converts the categorical result to object dtype.
df['NEW_BMI_CAT'] = pd.cut(x=df['BMI'], bins=[0, 18.4, 25.0, 30.0, 70.0],
                           labels=['weakness', 'normal', 'slightly_fat', 'obese']).astype('O')

# New categorical Glucose
# Two levels: (0, 139] -> 'Normal', (139, 200] -> 'Prediabetes'.
df['NEW_GLUCOSE_CAT'] = pd.cut(x=df['Glucose'], bins=[0, 139, 200],
                               labels=['Normal', 'Prediabetes']).astype('O')

# New categorical BloodPressure
# Three levels: (0, 79] -> 'Normal', (79, 90] -> 'Hypertension_S1', (90, 123] -> 'Hypertension_S2'.
df['NEW_BLOOD_CAT'] = pd.cut(x=df['BloodPressure'], bins=[0, 79, 90, 123],
                             labels=['Normal', 'Hypertension_S1', 'Hypertension_S2']).astype('O')

# New categorical SkinThickness
# Binary flag: 1 when the value is <= 18.0, otherwise 0 (a NaN fails the comparison and maps to 0).
df['NEW_SKINTHICKNESS_CAT'] = df['SkinThickness'].apply(lambda x: 1 if x <= 18.0 else 0)

# New categorical Insulin
# 'Normal' when the value lies in the closed range [16.0, 166], otherwise 'Abnormal'.
df['NEW_INSULIN_CAT'] = df['Insulin'].apply(lambda x: 'Normal' if 16.0 <= x <=166   else 'Abnormal')

In [None]:
# Preview the first rows, including the NEW_* columns created above.
df.head()

In [None]:
# Label Encoding
label_cols = [col for col in df.columns if df[col].dtypes == 'O' and df[col].nunique() <= 2]
for col in label_cols:
    label_encoder(df, col)

In [None]:
# One_hot Encoding
ohe_cols = [col for col in df.columns if 10 >= len(df[col].unique()) > 2]
df = one_hot_encoder(df, ohe_cols, drop_first=True)

In [None]:
# Normalise all column names to upper case.
df.columns = list(map(str.upper, df.columns))

In [None]:
# Preview the first rows after encoding and upper-casing the column names.
df.head()

<a id='modeling'></a>
# <h1 style="background-color:skyblue; font-family:newtimeroman; font-size:350%; text-align:center; border-radius: 15px 50px;"> Modeling 🧩</h1>

In [None]:
# Target as a 1-D Series: scikit-learn classifiers expect y of shape (n_samples,),
# and a single-column DataFrame triggers a DataConversionWarning on fit.
y = df['OUTCOME']
X = df.drop('OUTCOME', axis=1)
# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Baseline random forest with default hyperparameters; random_state is fixed so
# that re-running the notebook yields the same model and the same predictions.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [None]:
# Accuracy (in %) of the baseline forest on the held-out test set.
# The model must be scored against the true labels `y_test`; scoring against its
# own predictions `y_pred` always yields 100% and says nothing about quality.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

<center><img
src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTO00xMsiob_1AgrAfctXJ50--hHtXxBLg3uWJ1Guc4NGm9Y-61QnmuOagYXA2h0XaFkC0&usqp=CAU" style="width:50%;height:50%;">
</center>

# **Cross-validated hyperparameter search to reduce overfitting.**

In [None]:
# Hyperparameter grid sampled by the randomized search.
# NOTE(review): every `max_features` value must not exceed the number of columns
# in X_train, otherwise the corresponding candidates fail to fit; confirm against X_train.shape.
rf_params_ = {'max_depth': [3, 6, 10, None],
              'max_features': [3, 5, 15],
              'n_estimators': [100, 500, 700],
              'min_samples_split': [2, 5, 8],
              'min_samples_leaf': [1, 3, 5]}

rf_model = RandomForestClassifier(random_state=42)

# 5-fold cross-validated randomized search. random_state makes the sampled
# candidates reproducible; np.ravel guarantees a 1-D target whether y_train is a
# Series or a single-column DataFrame.
rf_search = RandomizedSearchCV(rf_model, rf_params_, cv=5, n_jobs=-1, verbose=1,
                               random_state=42).fit(X_train, np.ravel(y_train))

# With the default refit=True the search has already refitted the best candidate
# on the whole training set, keeping the seeded estimator, so no manual refit is needed.
rf_cv_model = rf_search.best_estimator_
y_pred = rf_cv_model.predict(X_test)

# Score the tuned model against the true test labels. The previous expression
# `rf.score(X_test, y_pred)` compared the *untuned* model with the tuned model's
# predictions, which measures agreement between the two models, not accuracy.
acc_random_forest = round(rf_cv_model.score(X_test, y_test) * 100, 2)
acc_random_forest