In [13]:
df = pd.read_csv(
        "school_attendance.csv",
        parse_dates=["last_updated"],
        converters={"Date": lambda x: datetime.strptime(x, "%Y%m%d")}
    )
print(f" Full extraction: Loaded ALL {len(df)} records.")
    # You can assign to a variable if needed, e.g. df_full = df
    # except Exception as e:
    # print(f" # school_attendance_etl.py


def generate_sample_data():
    database_file = 'school_attendance.csv'
    schools = ['01M015', '02M394', '03K403', '04M409', '05M280']
    data = []
    start_date = datetime(2018, 9, 1)

    for i in range(1, 91):  # 90 days of data
            date = start_date + timedelta(days=i)
            if date.weekday() >= 5:  # Skip weekends
                continue
                
            for school in schools:
                enrolled = random.randint(150, 200)
                absent = random.randint(5, 30)
                present = enrolled - absent
                data.append({
                    'School DBN': school,
                    'Date': date.date().strftime('%Y%m%d'),
                    'Enrolled': enrolled,
                    'Absent': absent,
                    'Present': present,
                    'Released': 0,
                    'last_updated': (date + timedelta(hours=random.randint(0, 23))).isoformat()
                })

    df = pd.DataFrame(data)
    df.to_csv('school_attendance.csv', index=False)
print("Sample data generated in school_attendance.csv")

 Full extraction: Loaded ALL 325 records.
Sample data generated in school_attendance.csv


In [14]:
import pandas as pd
import os

# SECTION 1: Setup
os.makedirs('loaded_data', exist_ok=True)

# Define paths
full_csv = 'school_attendance.csv'
inc_csv = 'latest_extracted_records.csv'
full_parquet = 'loaded_data/full_data.parquet'
inc_parquet = 'loaded_data/incremental_data.parquet'

# SECTION 2: Load Full
full_data = pd.read_csv(full_csv, parse_dates=["last_updated"])
full_data.to_parquet(full_parquet, index=False)
print("Full data saved as Parquet.")

# SECTION 3: Load Incremental
if os.path.exists(inc_csv):
    inc_data = pd.read_csv(inc_csv, parse_dates=["last_updated"])
    inc_data.to_parquet(inc_parquet, index=False)
    print("Incremental data saved as Parquet.")
else:
    print(" Incremental file not found.")

# SECTION 4: Verify
print("\nPreview of Full:")
display(pd.read_parquet(full_parquet).head())

if os.path.exists(inc_parquet):
    print("\nPreview of Incremental:")
    display(pd.read_parquet(inc_parquet).head())


Full data saved as Parquet.
Incremental data saved as Parquet.

Preview of Full:


Unnamed: 0,School DBN,Date,Enrolled,Absent,Present,Released,last_updated
0,01M015,20180903,172,7,165,0,2018-09-03 18:00:00
1,02M394,20180903,200,9,191,0,2018-09-03 02:00:00
2,03K403,20180903,158,24,134,0,2018-09-03 14:00:00
3,04M409,20180903,183,24,159,0,2018-09-03 07:00:00
4,05M280,20180903,159,17,142,0,2018-09-03 20:00:00



Preview of Incremental:


Unnamed: 0,School DBN,Date,Enrolled,Absent,Present,Released,last_updated
0,01M015,2018-11-30,167,22,145,0,2018-11-30 20:00:00
1,02M394,2018-11-30,189,21,168,0,2018-11-30 22:00:00
2,05M280,2018-11-30,163,12,151,0,2018-11-30 20:00:00
