# EDA

In [1]:
import os
import pandas as pd

In [2]:
data_folder = "../Datasets/HUPA-UCM Diabetes Dataset/Preprocessed"
output_folder = "../Datasets/HUPA-UCM Diabetes Dataset/Cleaned"

In [3]:
def show_shape(df):
    print("\n Shape (rows, cols):", df.shape)

def show_columns(df):
    print("\n Columns:", df.columns.tolist())

def show_head_tail(df):
    print("\n Head:")
    print(df.head())
    print("\n Tail:")
    print(df.tail())

def show_info(df):
    print("\n Info:")
    print(df.info())

def check_missing(df):
    print("\n Missing values:")
    print(df.isnull().sum())

def show_describe(df):
    print("\n Describe (summary stats):")
    print(df.describe(include="all"))

def check_duplicates(df):
    print("\n Duplicates:", df.duplicated().sum())

def check_outliers(df):
    print("\n Possible outliers (values above 99th percentile):")
    for col in df.select_dtypes(include=["float64", "int64"]).columns:
        high = df[col].quantile(0.99)
        print(f"{col}: >{high}")

    

In [18]:
# ---- Generic runner for one file ----
def run_eda_on_file(path, file_name, funcs):
    print("*"*60)
    print(f"File: {file_name}")
    df = pd.read_csv(path, sep=";")

    # apply only the functions you pass
    for func in funcs:
        func(df)

In [21]:
# ---- Runner for all files ----
def run_eda_on_folder(data_folder, funcs):
    if os.path.exists(data_folder):
        csv_files = [file for file in os.listdir(data_folder) if file.endswith(".csv")]
        print("Found CSV files:", csv_files)
        print('*'*20)

        for file in csv_files:
            path = os.path.join(data_folder, file)
            run_eda_on_file(path, file, funcs)
            print('*'*20)
    else:
        print("Folder not found:", data_folder)

In [22]:
data_folder = "../Datasets/HUPA-UCM Diabetes Dataset/Preprocessed"


In [23]:
run_eda_on_folder(data_folder, funcs=[show_info])



Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3181 entries, 0 to 3180
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   time                    3181 non-null   object 
 1   glucose                 3181 non-null   float64
 2   calories                3181 non-null   float64
 3   heart_rate              3181 non-null   float64
 4   steps             

In [24]:
run_eda_on_folder(data_folder, funcs=[show_shape])

Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Shape (rows, cols): (3181, 8)
********************
************************************************************
File: HUPA0027P.csv

 Shape (rows, cols): (165306, 8)
********************
************************************************************
File: HUPA0020P.csv

 Shape (rows, cols): (2862, 8)
********************
************************************************************
File: HUPA0011P.csv

 Shape (rows, cols): (3839, 8)
*******************

In [26]:
# or run multiple
run_eda_on_folder(data_folder, funcs=[check_missing])

Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Missing values:
time                      0
glucose                   0
calories                  0
heart_rate                0
steps                     0
basal_rate                0
bolus_volume_delivered    0
carb_input                0
dtype: int64
********************
************************************************************
File: HUPA0027P.csv

 Missing values:
time                      0
glucose                   0
calories                

In [27]:
# or run multiple
run_eda_on_folder(data_folder, funcs=[check_duplicates])

Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Duplicates: 0
********************
************************************************************
File: HUPA0027P.csv

 Duplicates: 0
********************
************************************************************
File: HUPA0020P.csv

 Duplicates: 0
********************
************************************************************
File: HUPA0011P.csv

 Duplicates: 0
********************
************************************************************
Fil

In [28]:
run_eda_on_folder(data_folder, funcs=[check_outliers])

Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Possible outliers (values above 99th percentile):
glucose: >283.0
calories: >39.08627204895011
heart_rate: >102.80462519936198
steps: >486.7999999999993
basal_rate: >0.1416666666666666
bolus_volume_delivered: >3.4199999999999817
carb_input: >0.0
********************
************************************************************
File: HUPA0027P.csv

 Possible outliers (values above 99th percentile):
glucose: >266.6500000000039
calories: >42.34640026092