Downloading the dataset...

In [1]:
import os
import urllib.request
import zipfile
import shutil

# Download the dataset archive, extract it into `save_dir`, and always remove
# the temporary zip afterwards. Errors are reported with print() rather than
# re-raised, so the final status message below is the only signal of success.

# URL of the dataset zip archive
url = "https://archive.ics.uci.edu/static/public/529/early+stage+diabetes+risk+prediction+dataset.zip"

# Directory to save the dataset
save_dir = "dataset"
zip_path = os.path.join(save_dir, "diabetes_dataset.zip")

# Create the directory if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

try:
    # Download the dataset
    print("Downloading dataset...")
    urllib.request.urlretrieve(url, zip_path)
    print("Download complete.")

    # Unzip the dataset
    print("Unzipping dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(save_dir)
    print("Unzipping complete.")

except urllib.error.URLError as e:
    print(f"Failed to download the dataset. URL error: {e}")
except zipfile.BadZipFile as e:
    print(f"Failed to unzip the dataset. Bad zip file error: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
finally:
    # Clean up: remove the zip file (also covers a partially written download)
    if os.path.exists(zip_path):
        os.remove(zip_path)
        print("Clean-up complete: zip file removed.")

    # `save_dir` was created unconditionally above, so its mere existence says
    # nothing about success. Inspect its contents instead: the zip has already
    # been removed, so anything left was extracted (now or by an earlier run).
    extracted_files = sorted(os.listdir(save_dir))
    if extracted_files:
        print(f"Dataset is available in the directory: {save_dir}")
    else:
        print("Download or extraction failed, dataset not found.")


Downloading dataset...
Download complete.
Unzipping dataset...
Unzipping complete.
Clean-up complete: zip file removed.
Dataset is available in the directory: dataset


In [6]:
## Libraries

# EDA
import numpy as np
import pandas as pd

# vis
import matplotlib.pyplot as plt
import seaborn as sns
#import plotly.express as px

# ML 
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


## ML metric
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Ignore warnings: called with only the "ignore" action (no category, message
# or module restriction), so this filter matches every warning category.
# NOTE(review): this also hides deprecation and model-fitting warnings from the
# libraries imported above — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")


Descriptive analysis

In [None]:
df = pd.read_csv(