In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file under the read-only Kaggle input mount, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for path in (os.path.join(dirname, filename) for filename in filenames):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes clinical dataset CSV from the Kaggle input mount into a DataFrame.
df=pd.read_csv("/kaggle/input/100000-diabetes-clinical-dataset/diabetes_dataset.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Print column names, dtypes and non-null counts to see which columns need encoding.
df.info()

In [None]:
# Count how many rows fall into each smoking_history category.
df['smoking_history'].value_counts()

In [None]:
# Missing-value count per column (isna is the canonical name for isnull).
df.isna().sum()

# Encoding categorical data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Label-encode only the categorical text columns. A separate encoder is fitted
# per column and kept in `label_encoders`, so the integer -> label mapping
# (encoder.classes_) stays available for interpreting results later.
label_encoders = {}
for column in ('gender', 'location'):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# `age` is deliberately NOT label-encoded: LabelEncoder replaces each value with
# the rank of that value among the unique ages, so every later statistic,
# correlation and model input would be expressed in rank codes instead of years.
# to_numeric is a no-op for an already numeric column and fails loudly otherwise.
# NOTE(review): assumes `age` holds numeric ages in the CSV — confirm with df.info().
df['age'] = pd.to_numeric(df['age'])

# One-hot encode smoking_history: its categories have no natural order.
df = pd.get_dummies(df, columns=['smoking_history'], prefix='smoking_history')

In [None]:
# Preview the frame again to check the result of the encoding step.
df.head()

# Descriptive Analysis

In [None]:
# Descriptive statistics for the columns of the encoded frame; kept in
# `summary` and displayed as the cell output.
summary=df.describe()
summary

## key insights

**Year**: The dataset spans the years 2015–2022, with a mean year of 2018. The median year is 2019, the year with the most data points.

**Gender**: There are more females than males in the dataset.

**Age**: The age range is from 0 to 101 years, with a mean age of 62.67 years. The median age is 64 years, indicating that half of the individuals are younger than 64 years and half are older.

**Race distribution**: 
* African American: About 20.2%
* Asian: About 20.0%
* Caucasian: About 19.9%
* Hispanic: About 19.9%
* Other: About 20.0%

**Hypertension**: Approximately 7.5% of individuals have hypertension.

**Heart Disease**: Approximately 3.9% of individuals have heart disease.

**BMI**: The mean BMI is 27.32. The median BMI is also 27.32, indicating that half of the individuals have a BMI below this value.
BMI values range from 10.01 to 95.69.

**HbA1c**: HbA1c levels range from 3.50 to 9.00. The median HbA1c level is 5.80.

**Blood Glucose Level**: The mean blood glucose level is 138.06 mg/dL. The median blood glucose level is 140 mg/dL.
Blood glucose levels range from 80 to 300 mg/dL.

**Diabetes**: Approximately 8.5% of individuals in the dataset have diabetes.

# Correlation Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise correlation matrix of all columns, drawn as an annotated heatmap.
corr = df.corr()
fig, ax = plt.subplots(figsize=(14, 9))
sns.heatmap(
    corr,
    annot=True,
    fmt=".1f",
    cmap="YlOrBr",
    linewidths=0.6,
    linecolor="black",
    ax=ax,
)
plt.show()

In [None]:
# Rank feature pairs by correlation, listing each unordered pair exactly once.
# Keeping only the strict upper triangle of the (symmetric) matrix drops both
# the diagonal self-correlations and the mirrored (b, a) duplicates. The
# previous `!= 1` filter left every pair in twice (so the "top 10" held only 5
# distinct pairs) and would also have discarded genuinely perfect correlations.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
# dropna() is explicit so the masked cells are removed regardless of how the
# installed pandas version's stack() treats NaN.
corr_pairs = corr.where(upper_triangle).stack().dropna()
corr_pairs = corr_pairs.sort_values(ascending=False)
top_correlated_pairs = corr_pairs.head(10)   # most positive correlations
low_correlated_pairs = corr_pairs.tail(10)   # most negative correlations
top_correlated_pairs

In [None]:
# Display the ten pairs at the bottom of the descending correlation ranking.
low_correlated_pairs

### Key Insights
**Strongest Correlations:**

**Diabetes and Blood Glucose Level:** There is a moderate positive correlation (approximately 0.4), indicating that as blood glucose levels increase, the likelihood of diabetes also increases.

**Diabetes and HbA1c Level:** There is a moderate positive correlation (approximately 0.4), suggesting that higher HbA1c levels are associated with a higher likelihood of diabetes.

**Age and BMI:** There is a moderate positive correlation (approximately 0.3), suggesting that older individuals tend to have higher BMI.

**Moderate Correlations:**

**Hypertension and Age:** There is a positive correlation (approximately 0.2), indicating that older individuals are more likely to have hypertension.

**Heart Disease and Age:** There is a positive correlation (approximately 0.2), suggesting that older individuals are more likely to have heart disease.

**Smoking History (current) and Diabetes:** There is a weak negative correlation (approximately -0.1 to -0.2), indicating that current smokers are slightly less likely to have diabetes.

**Weak Correlations:**

**Smoking History (various categories) and Diabetes:** Most categories of smoking history have very weak or no correlation with diabetes. This could indicate that smoking history, in this dataset, is not a strong predictor of diabetes.

**Gender and Hypertension:** There is a weak positive correlation (approximately 0.2), suggesting a slight gender difference in hypertension prevalence.

**Gender and Heart Disease:** There is a weak positive correlation (approximately 0.1), indicating a slight gender difference in heart disease prevalence.

**Negative Correlations:**

**Smoking History (current) and HbA1c Level:** There is a weak negative correlation, suggesting that current smokers tend to have slightly lower HbA1c levels.

**Smoking History (No Info) and Age:** There is a moderate negative correlation (approximately -0.3), indicating that individuals without smoking history information tend to be younger.

# Classification Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,confusion_matrix

In [None]:
# Separate the target column from the feature matrix.
y = df['diabetes']
X = df.drop('diabetes', axis=1)

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the share of
# diabetic cases (a small minority class) the same in the train and test splits,
# so the reported metrics are not skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transform is then applied to both splits, so nothing
# from the test set leaks into preprocessing.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression

In [None]:
# Fit a default logistic regression on the scaled training split; fit() returns
# the estimator itself, and the trailing expression displays it as cell output.
model = LogisticRegression().fit(X_train, y_train)
model

In [None]:
# Predict hard class labels for the held-out test set with the fitted logistic regression.
y_pred = model.predict(X_test)

In [None]:
# Score the logistic-regression predictions on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
# ROC AUC is a ranking metric: give it the predicted probability of the positive
# class (column 1 of predict_proba) rather than hard 0/1 labels, which collapse
# the ROC curve to a single operating point.
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
confusion = confusion_matrix(y_test, y_pred)

In [None]:
# Report the logistic-regression metrics (label typo "confusuion" corrected).
print(f'accuracy: {accuracy}')
print(f'f1 score: {f1}')
print(f'roc_auc: {roc_auc}')
print(f'confusion matrix: \n{confusion}')


## AdaBoost Classifier

In [None]:
# Fit AdaBoost on the scaled training split. The seed is fixed, matching the
# RandomForest and SVC cells, so a Restart & Run All reproduces the same model.
adb = AdaBoostClassifier(random_state=42)
adb_model = adb.fit(X_train, y_train)

In [None]:
# Predict with the fitted AdaBoost model. The previous code called `model`
# (the logistic regression), so the AdaBoost metrics merely repeated the
# logistic-regression results.
y_adb_pred = adb_model.predict(X_test)

In [None]:
# Score the AdaBoost predictions on the held-out test set.
adb_accuracy = accuracy_score(y_test, y_adb_pred)
adb_f1 = f1_score(y_test, y_adb_pred)
# ROC AUC needs a continuous score: use AdaBoost's predicted probability of the
# positive class (column 1) instead of hard 0/1 labels.
adb_roc_auc = roc_auc_score(y_test, adb_model.predict_proba(X_test)[:, 1])
adb_confusion = confusion_matrix(y_test, y_adb_pred)

In [None]:
# Report the AdaBoost metrics (label typo "confusuion" corrected).
print(f'accuracy: {adb_accuracy}')
print(f'f1 score: {adb_f1}')
print(f'roc_auc: {adb_roc_auc}')
print(f'confusion matrix: \n{adb_confusion}')

## RandomForest Classifier

In [None]:
# Train a seeded random forest on the scaled training split, then predict hard
# class labels for the held-out test set.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_rf_pred = rf.predict(X_test)

In [None]:
# Score the random-forest predictions on the held-out test set.
rf_accuracy = accuracy_score(y_test, y_rf_pred)
rf_f1 = f1_score(y_test, y_rf_pred)
# ROC AUC needs a continuous score: use the forest's predicted probability of
# the positive class (column 1) instead of hard 0/1 labels.
rf_roc_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
rf_confusion = confusion_matrix(y_test, y_rf_pred)

In [None]:
# Report the random-forest metrics (label typo "confusuion" corrected).
print(f'accuracy: {rf_accuracy}')
print(f'f1 score: {rf_f1}')
print(f'roc_auc: {rf_roc_auc}')
print(f'confusion matrix: \n{rf_confusion}')


## SVM

In [None]:
from sklearn.svm import SVC

# Train a seeded support-vector classifier (probability estimates enabled) on
# the scaled training split, then predict hard labels for the test set.
svc = SVC(probability=True, random_state=42).fit(X_train, y_train)
y_svc_pred = svc.predict(X_test)

In [None]:
# Score the SVM predictions on the held-out test set.
svc_accuracy = accuracy_score(y_test, y_svc_pred)
svc_f1 = f1_score(y_test, y_svc_pred)
# ROC AUC needs a continuous score rather than hard 0/1 labels. The signed
# distance to the decision boundary ranks the samples and is available whether
# or not the SVC was built with probability=True.
svc_roc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
svc_confusion = confusion_matrix(y_test, y_svc_pred)

# Report the SVM metrics (label typo "confusuion" corrected).
print(f'accuracy: {svc_accuracy}')
print(f'f1 score: {svc_f1}')
print(f'roc_auc: {svc_roc_auc}')
print(f'confusion matrix: \n{svc_confusion}')


# Trend Analysis

In [None]:
# Keep only the rows flagged as diabetic, then count them per year in
# chronological order.
is_diabetic = df['diabetes'].eq(1)
pst_cases = df.loc[is_diabetic]
diabetes_year_counts = pst_cases['year'].value_counts().sort_index()
diabetes_year_counts

In [None]:
# Bar chart of the number of diabetic cases recorded in each year.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    x=diabetes_year_counts.index,
    y=diabetes_year_counts.values,
    palette="hls",
    ax=ax,
)
ax.set_title("Diabetic Cases Over the Years")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Diabetic Cases")
plt.show()

### Key insights
- The plot shows how the prevalence of diabetes has changed over the years worldwide.
- The numbers of diabetic patients in 2015 and 2016 are nearly the same, with 759 and 765 cases.
- In 2018 there is a sharp decline in diabetic patients.
- In 2019 there is a significant spike in the prevalence of diabetes cases compared to the previous years.
- The prevalence of diabetes appears to drop dramatically, almost to negligible numbers, in 2020-2022. This sparse data suggests that there might be gaps in data collection, possibly due to the COVID-19 pandemic.

In [None]:
# Yearly mean of the key clinical measurements among the diabetic cases.
trend_columns = ['blood_glucose_level', 'age', 'hbA1c_level']
trends = pst_cases.groupby('year')[trend_columns].mean()
trends

In [None]:
# One line chart per yearly average, laid out on a 2x2 grid (the fourth slot
# stays empty).
fig = plt.figure(figsize=(15, 8))

panels = [
    ('age', "Average Age of Diabetic Cases Over the Years"),
    ('blood_glucose_level', "Average Blood Glucose Level of Diabetic Cases Over the Years"),
    ('hbA1c_level', "Average HbA1c Level of Diabetic Cases Over the Years"),
]
for position, (column, title) in enumerate(panels, start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.lineplot(data=trends, x=trends.index, y=column, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

## **Average Age of Diabetic Cases**
**Trend:** The average age of diabetic cases shows some fluctuations over the years. The average age started around 83 years in 2015, dropped to about 81 years in 2016, then gradually increased, peaking around 88 years in 2020, before dropping again.

## **Average Blood Glucose Level of Diabetic Cases**
**Trend:** The average blood glucose level of diabetic cases shows a noticeable spike. From 2015 to 2018, the average blood glucose level remained relatively stable around 190-200 mg/dL. However, there was a sharp increase in 2019, peaking at around 240 mg/dL, followed by a decrease in 2020 and 2021, then a slight increase again in 2022.

## **Average HbA1c Level of Diabetic Cases**
**Trend:** The average HbA1c level also shows fluctuations over the years. The average HbA1c level was relatively stable from 2015 to 2017 at around 7.0%. It then showed some variations, with a peak in 2020 and a dip in 2021, followed by another peak in 2022.