In [29]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
# All HUPA-UCM locations hang off one dataset root (relative to this notebook's working dir).
_dataset_root = "../../Datasets/HUPA-UCM Diabetes Dataset"

data_folder = f"{_dataset_root}/Preprocessed"        # input CSVs read by the EDA runner
output_folder = f"{_dataset_root}/Cleaned"           # destination folder created below
eda_output_folder = f"{_dataset_root}/EDA_Outputs"   # root folder for generated plots

In [31]:
# Ensure both output locations exist; exist_ok=True makes re-running this cell harmless.
for required_dir in (output_folder, eda_output_folder):
    os.makedirs(required_dir, exist_ok=True)

In [32]:
# Make the project's `src` directory importable for the `utils.*` / `eda.*` imports below.
# The membership guard keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
# NOTE(review): the path is relative, so it resolves against the kernel's working
# directory — confirm the notebook is always launched from its own folder.
src_dir = "../../src"
if src_dir not in sys.path:
    sys.path.append(src_dir)

from utils.hupa_ucm.hupa_ucm_checks import (
    show_info,
    show_shape,
    check_missing,
    check_duplicates,
    check_outliers
)

from eda.hupa_ucm.hupa_ucm_plots import (
    plot_outliers,
    plot_distributions
)

from eda.hupa_ucm.hupa_ucm_runner import run_eda_on_folder

In [33]:
# Run the basic checks one at a time, each under its own printed heading.
# Every call is a separate pass of run_eda_on_folder over `data_folder`
# with a single check function.
print(" BASIC INFO ")
run_eda_on_folder(data_folder, funcs=[show_info])

print("\nSHAPE ")
run_eda_on_folder(data_folder, funcs=[show_shape])

# Heading uses "\n" like its siblings; the previous "\M" was an invalid escape
# sequence that printed a literal backslash and triggers a SyntaxWarning on
# recent Python versions.
print("\nMISSING VALUES ")
run_eda_on_folder(data_folder, funcs=[check_missing])

print("\nDUPLICATES ")
run_eda_on_folder(data_folder, funcs=[check_duplicates])

print("\n OUTLIERS ")
run_eda_on_folder(data_folder, funcs=[check_outliers])


=== BASIC INFO ===
Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3181 entries, 0 to 3180
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   time                    3181 non-null   object 
 1   glucose                 3181 non-null   float64
 2   calories                3181 non-null   float64
 3   heart_rate              3181 non-null   float64
 4  

In [36]:
# Plot generation: each plot family gets its own sub-folder under the EDA output root.
outliers_folder, distributions_folder = (
    os.path.join(eda_output_folder, subdir)
    for subdir in ("outliers", "distributions")
)
for plot_dir in (outliers_folder, distributions_folder):
    os.makedirs(plot_dir, exist_ok=True)

# The runner calls each function as func(df, file_name); the lambdas adapt that
# two-argument call to the plotters, which additionally take an output folder.
print("\n GENERATING OUTLIER PLOTS ")
outlier_plot_funcs = [
    lambda df, file_name: plot_outliers(df, file_name, output_folder=outliers_folder)
]
run_eda_on_folder(data_folder, funcs=outlier_plot_funcs)

print("\nGENERATING DISTRIBUTION PLOTS ")
distribution_plot_funcs = [
    lambda df, file_name: plot_distributions(df, file_name, output_folder=distributions_folder)
]
run_eda_on_folder(data_folder, funcs=distribution_plot_funcs)



 GENERATING OUTLIER PLOTS 
Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv
Saved outlier plots for HUPA0002P.csv in ../../Datasets/HUPA-UCM Diabetes Dataset/EDA_Outputs/outliers/HUPA0002P
********************
************************************************************
File: HUPA0027P.csv
Saved outlier plots for HUPA0027P.csv in ../../Datasets/HUPA-UCM Diabetes Dataset/EDA_Outputs/outliers/HUPA0027P
********************
************************************************************
File: HUPA0020P.c

In [35]:
# Single pass over the folder that applies all summary checks to each file,
# in this order: shape, missing values, duplicates, outliers.
comprehensive_checks = [show_shape, check_missing, check_duplicates, check_outliers]

print("\n COMPREHENSIVE CHECK ")
run_eda_on_folder(data_folder, funcs=comprehensive_checks)



=== COMPREHENSIVE CHECK ===
Found CSV files: ['HUPA0002P.csv', 'HUPA0027P.csv', 'HUPA0020P.csv', 'HUPA0011P.csv', 'HUPA0007P.csv', 'HUPA0018P.csv', 'HUPA0028P.csv', 'HUPA0003P.csv', 'HUPA0022P.csv', 'HUPA0026P.csv', 'HUPA0025P.csv', 'HUPA0023P.csv', 'HUPA0005P.csv', 'HUPA0014P.csv', 'HUPA0001P.csv', 'HUPA0006P.csv', 'HUPA0010P.csv', 'HUPA0016P.csv', 'HUPA0021P.csv', 'HUPA0017P.csv', 'HUPA0024P.csv', 'HUPA0004P.csv', 'HUPA0009P.csv', 'HUPA0015P.csv', 'HUPA0019P.csv']
********************
************************************************************
File: HUPA0002P.csv

 Shape (rows, cols): (3181, 8)

 Missing values:
time                      0
glucose                   0
calories                  0
heart_rate                0
steps                     0
basal_rate                0
bolus_volume_delivered    0
carb_input                0
dtype: int64

 Duplicates: 0

 Possible outliers (values above 99th percentile):
glucose: >283.0
calories: >39.08627204895011
heart_rate: >102.804625199