In [4]:
# etl_transform.ipynb

import pandas as pd
from datetime import datetime
from IPython.display import display

# -----------------------------
# Transform Function
# -----------------------------
def transform_data(df, output_path):
    """Clean, enrich and normalise attendance records, then write them to CSV.

    The input frame is never modified; all work happens on a de-duplicated
    copy, so re-running the calling cell gives the same result.

    Steps
    -----
    1. Drop fully duplicated rows.
    2. Parse ``Date`` as a datetime. Compact ``YYYYMMDD`` values are tried
       first, then ISO ``YYYY-MM-DD`` (the form ``to_csv`` writes for
       datetime columns); anything else becomes ``NaT``.
    3. Replace missing values with 0 in numeric columns only.
    4. Add ``Attendance_Rate`` = ``Present / Enrolled * 100``. Rows whose
       ``Enrolled`` is 0 get ``NaN`` instead of ``inf``.
    5. Upper-case ``School DBN``.
    6. Write the result to ``output_path`` without the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Present``, ``Enrolled``, ``Date`` and
        ``School DBN``.
    output_path : str or path-like
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The transformed frame. If ``df`` is empty it is returned unchanged
        and nothing is written.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    if df.empty:
        print("No data to transform.")
        return df

    # Cleaning: work on an independent copy so the caller's frame stays intact.
    result = df.drop_duplicates().copy()

    # Structural: parse Date before any filling so a missing date stays NaT
    # rather than being turned into a placeholder 0 first.
    raw_dates = result['Date']
    compact_dates = pd.to_datetime(raw_dates, format='%Y%m%d', errors='coerce')
    iso_dates = pd.to_datetime(raw_dates, format='%Y-%m-%d', errors='coerce')
    # The ISO parse is only used where the compact parse produced NaT.
    result['Date'] = compact_dates.fillna(iso_dates)

    # Cleaning: fill gaps with 0 in numeric columns only; a blanket fillna(0)
    # would also overwrite missing timestamps and strings with the number 0.
    numeric_cols = [
        col for col in result.columns
        if pd.api.types.is_numeric_dtype(result[col])
    ]
    if numeric_cols:
        result[numeric_cols] = result[numeric_cols].fillna(0)

    # Enrichment: mask zero enrolment so the division yields NaN, not inf.
    enrolled = result['Enrolled'].where(result['Enrolled'] != 0)
    result['Attendance_Rate'] = (result['Present'] / enrolled) * 100

    # Structural: normalise the school identifier to upper case.
    result['School DBN'] = result['School DBN'].str.upper()

    result.to_csv(output_path, index=False)
    print(f"Transformed data saved to {output_path}")
    return result

# -----------------------------
# Transform Full Data
# -----------------------------
try:
    # Read the complete attendance extract, parsing last_updated as datetime.
    df_full = pd.read_csv(
        "school_attendance.csv",
        parse_dates=["last_updated"],
    )
    # Transform it and write transformed_full.csv.
    transformed_full = transform_data(
        df_full,
        "transformed_full.csv",
    )
    # Show the first rows of the transformed frame.
    display(transformed_full.head())
except Exception as exc:
    # Report the failure and let the rest of the cell continue.
    print(f"Failed to transform full data: {str(exc)}")

# -----------------------------
# Transform Incremental Data
# -----------------------------
try:
    # Read the latest extracted records, parsing last_updated as datetime.
    df_incremental = pd.read_csv(
        "latest_extracted_records.csv",
        parse_dates=["last_updated"],
    )
    # Transform them and write transformed_incremental.csv.
    transformed_incremental = transform_data(
        df_incremental,
        "transformed_incremental.csv",
    )
    # Show the first rows of the transformed frame.
    display(transformed_incremental.head())
except Exception as exc:
    # Report the failure instead of raising.
    print(f"Failed to transform incremental data: {str(exc)}")


Transformed data saved to transformed_full.csv


Unnamed: 0,School DBN,Date,Enrolled,Absent,Present,Released,last_updated,Attendance_Rate
0,01M015,2018-09-03,153,15,138,0,2018-09-03 21:00:00,90.196078
1,02M394,2018-09-03,160,12,148,0,2018-09-03 23:00:00,92.5
2,03K403,2018-09-03,196,29,167,0,2018-09-03 18:00:00,85.204082
3,04M409,2018-09-03,174,18,156,0,2018-09-03 17:00:00,89.655172
4,05M280,2018-09-03,197,8,189,0,2018-09-03 04:00:00,95.939086


Transformed data saved to transformed_incremental.csv


Unnamed: 0,School DBN,Date,Enrolled,Absent,Present,Released,last_updated,Attendance_Rate
0,05M280,NaT,165,18,147,0,2018-11-30 23:00:00,89.090909
