In [None]:
import sys
import os
import pandas as pd
from pathlib import Path

# --- 1. Setup Paths (To find the 'etl' folder) ---
# The 'etl' package is imported further down, so the directory that contains
# it (the project root) has to be importable.
notebook_dir = Path(os.getcwd())

if notebook_dir.name == 'notebooks':
    # Conventional layout: <root>/notebooks -> the root is the direct parent.
    project_root = notebook_dir.parent
else:
    # Started from somewhere else (e.g. <root>/notebooks/drafts): walk up the
    # ancestors and take the first one that holds an 'etl' directory.
    # Only ancestors are inspected; the working directory itself is left to
    # Python's normal import path handling. project_root is None when no
    # ancestor qualifies.
    project_root = next(
        (parent for parent in notebook_dir.parents if (parent / 'etl').is_dir()),
        None,
    )

# Append once only, so re-running the cell does not grow sys.path.
if project_root is not None and str(project_root) not in sys.path:
    sys.path.append(str(project_root))
    print(f"Project root added to Path: {project_root}")

# --- 2. Import Our Functions ---
from etl.connection import load_raw_data
from etl.processor import process_data

# --- 3. Execute Pipeline (Step-by-Step) ---
# Stage 1 downloads the raw monthly dataframes, stage 2 merges them into a
# single dataframe, stage 3 runs sanity checks on the merged result.

print("\n>>> STEP 1: Downloading Raw Data (List of Months)...")
raw_list = load_raw_data()

if not raw_list:
    # Nothing usable came back, so there is nothing to process.
    print("ERROR: Data list came back empty. Check connection.")
else:
    print(f"✓ Success! You downloaded {len(raw_list)} dataframes (months).")
    print(f"  - Example Month 1 (Jan): {raw_list[0].shape} rows/columns")

    print("\n>>> STEP 2: Processing and Unifying...")
    df_final = process_data(raw_list)

    print("✓ Success! Final Dataframe created.")
    print(f"  - Final Shape: {df_final.shape} (Rows, Columns)")

    # --- 4. The Litmus Test (Check if duplicates are gone) ---
    print("\n>>> STEP 3: Quality Check (Ghost Rows)")
    print("Checking for duplicate or empty rows...")

    # How many rows have no value in the 'score' column
    score_is_missing = df_final['score'].isna()
    nulos = score_is_missing.sum()
    print(f"  - Total Null Scores (Rest/Off): {nulos}")

    # A (date, habit) pair should appear at most once. keep=False marks every
    # member of a repeated pair, so the count includes all offending rows.
    repeated_mask = df_final.duplicated(subset=['date', 'habit'], keep=False)
    duplicatas = repeated_mask.sum()
    if duplicatas != 0:
        print(f"  - WARNING: {duplicatas} duplicates found.")
    else:
        print("  - ZERO Duplicates found! (Issue resolved)")

In [None]:
# Render the unified dataframe. df_final is only bound when the pipeline
# cell ran and received data, so check for it instead of letting the cell
# fail with a bare NameError.
if 'df_final' in globals():
    display(df_final)
else:
    print("df_final is not defined: run the pipeline cell and check that data was loaded.")