<a href="https://colab.research.google.com/github/TarekkMU1911/AI-Agent-Diabetes-Diagnosis/blob/Phase%2FPreprocessing%2Fdiabetes_prediction_dataset/Diabetes_dataset_preprocessing_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"faridakhaledaly","key":"0ff9c567c66f6538c82d86465cf0b231"}'}

#**Install Kaggle API**

In [2]:
!pip install -q kaggle

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle datasets list

ref                                                        title                                                     size  lastUpdated                 downloadCount  voteCount  usabilityRating  
---------------------------------------------------------  --------------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
yashdevladdha/uber-ride-analytics-dashboard                Uber Data Analytics Dashboard                         17324552  2025-08-08 11:13:42.920000          38477        873  1.0              
mdsultanulislamovi/student-stress-monitoring-datasets      Student Stress Monitoring Datasets                       24336  2025-07-24 16:30:01.617000          21278        369  1.0              
vikasjigupta786/customer-analytics-practice-dataset        Customer Analytics Practice Dataset                       3079  2025-08-26 17:32:53.557000           1386         32  1.0              
ikramshah512/amazon-produ

# **Import Libraries**

In [5]:
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline


pd.options.display.float_format = "{:.2f}".format

# **Load and Unzip the Dataset**

In [6]:

!kaggle datasets download -d iammustafatz/diabetes-prediction-dataset

!unzip -q diabetes-prediction-dataset.zip

Dataset URL: https://www.kaggle.com/datasets/iammustafatz/diabetes-prediction-dataset
License(s): copyright-authors
Downloading diabetes-prediction-dataset.zip to /content
  0% 0.00/734k [00:00<?, ?B/s]
100% 734k/734k [00:00<00:00, 157MB/s]


In [7]:
df =  pd.read_csv('diabetes_prediction_dataset.csv')

In [8]:
df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


# **Handling Duplicates**

In [9]:
duplicate_rows_data = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_data.shape)

number of duplicate rows:  (3854, 9)


In [10]:
df = df.drop_duplicates()

# ***Check if there is any Missing Values***

In [11]:
print(df.isnull().sum())

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


# ***Check if there is rare (uneeded) Values***

In [12]:
'Other' in df['gender'].values

True

# **Remove this rare value**

In [13]:
df = df[df['gender'] != 'Other']

In [14]:
df.describe().style.background_gradient(cmap='Blues').format("{:.2f}")

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,96128.0,96128.0,96128.0,96128.0,96128.0,96128.0,96128.0
mean,41.8,0.08,0.04,27.32,5.53,138.22,0.09
std,22.46,0.27,0.2,6.77,1.07,40.91,0.28
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.4,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,59.0,0.0,0.0,29.86,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [15]:
df.shape

(96128, 9)

In [16]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


# ***Convert Rows into Readable Text Cases***

### **Instead of numeric rows, your LLaMA model expects natural language input**

In [17]:
def format_case(row):
    return (
        f"A {row['age']}-year-old {row['gender'].lower()} "
        f"with a BMI of {row['bmi']:.1f}, HbA1c level {row['HbA1c_level']:.1f}, and "
        f"blood glucose level {row['blood_glucose_level']:.0f}. "
        f"Smoking history: {row['smoking_history'].lower()}. "
        f"{'Has' if row['hypertension'] else 'No'} hypertension. "
        f"{'Has' if row['heart_disease'] else 'No'} heart disease."
    )

In [18]:
df["llm_prompt"] = df.apply(format_case, axis=1)

In [19]:
df.shape

(96128, 10)

In [20]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,llm_prompt
0,Female,80.0,0,1,never,25.19,6.6,140,0,"A 80.0-year-old female with a BMI of 25.2, HbA..."
1,Female,54.0,0,0,No Info,27.32,6.6,80,0,"A 54.0-year-old female with a BMI of 27.3, HbA..."
2,Male,28.0,0,0,never,27.32,5.7,158,0,"A 28.0-year-old male with a BMI of 27.3, HbA1c..."
3,Female,36.0,0,0,current,23.45,5.0,155,0,"A 36.0-year-old female with a BMI of 23.4, HbA..."
4,Male,76.0,1,1,current,20.14,4.8,155,0,"A 76.0-year-old male with a BMI of 20.1, HbA1c..."


# **Convert to JSON Format**

In [21]:
records = df[["llm_prompt", "diabetes"]].rename(columns={"llm_prompt": "input", "diabetes": "label"}).to_dict(orient="records")

In [22]:
import json
with open("diabetes_cases.json", "w") as f:
    json.dump(records, f, indent=2)

# ***Download the file***

In [23]:
files.download("diabetes_cases.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>