In [1]:

import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
from io import StringIO, BytesIO

# Put the parent of the current working directory at the front of the module search
# path so that code under the project root (`dat-ingestion_bridge`) can be imported.
sys.path[0:0] = [str(Path.cwd().parent)]

# Load the ingestion entry point and its metadata type from the project's `src` package.
from src.ingestion_utils import (
    IngestionMetadata,
    process_gym_data,
)

# Create the scratch directory that holds the generated test files.
# `exist_ok=True` turns the call into a no-op when the directory is already present,
# so the cell can be re-run freely and there is no gap between an existence check
# and the actual creation.
os.makedirs("temp_data", exist_ok=True)

print("Setup complete. Ready to demonstrate data ingestion and normalization.")


Setup complete. Ready to demonstrate data ingestion and normalization.


### Possible Data Formats

In [2]:
# Scenario 1: CSV with inconsistent delimiters and encoding issues.
# The header and rows 1/3 are separated by ';' while rows 2/4 use ',';
# the start_date column also mixes several date layouts.
csv_path_bad_delimiter = "temp_data/bad_delimiter_data.csv"
csv_content_bad_delimiter = (
    "\n"  # leading newline: the file starts with an empty line
    "id;name;age;start_date\n"
    "1;Alice;30;2022-01-15\n"
    "2,Bob,24,16/03/2021\n"
    "3;Charlie;35;2020-11-01\n"
    "4,David,29,05-07-2023\n"
)

# The file is written with the Latin-1 codec rather than UTF-8.
with open(csv_path_bad_delimiter, mode="w", encoding="latin1") as f:
    f.write(csv_content_bad_delimiter)


# Scenario 2: Excel file with junk columns, empty rows, and mixed data types
excel_path_junk_empty = "temp_data/junk_empty_data.xlsx"
data_excel = {
    'MemberID': ['M001', 'M002', 'M003', 'M004', 'M005', None, 'M007'],
    'Name': ['Eve', 'Frank', 'Grace', None, 'Heidi', 'Ivy', 'Jack'],
    'MembershipType': ['Gold', 'Silver', 'Gold', 'Bronze', 'Silver', 'Gold', 'Bronze'],
    # Mixed date layouts plus one unparseable and one missing value
    'LastVisit': ['2023-01-10', '15/02/2023', '2023-03-20', 'invalid-date', '2023-05-01', None, '2023-06-11'],
    'Fee': [100.50, 75, 100.50, 50, 75, 100.50, 50],
    # Several different spellings of a boolean flag
    'IsActive': ['YES', 'No', '1', '0', 'true', 'FALSE', None],
    'Junk1': [None, None, None, None, None, None, None],  # Empty column
    'Junk2': ['garbage', 'garbage', 'garbage', 'garbage', 'garbage', 'garbage', 'garbage']  # Constant filler column
}

# Build the frame and append one completely empty row at the end.
# `reindex` adds the extra index label and fills it with missing values; this avoids
# growing the frame through `.loc[len(df)] = [None] * n`, which raised a warning in
# the recorded run of this notebook.
df_excel_raw = pd.DataFrame(data_excel)
df_excel_raw = df_excel_raw.reindex(range(len(df_excel_raw) + 1))

# Missing values are written as empty cells; the index is not exported.
with pd.ExcelWriter(excel_path_junk_empty, engine='openpyxl') as writer:
    df_excel_raw.to_excel(writer, index=False, sheet_name='Sheet1')


# Scenario 3: TSV with long format and missing values.
# One (member_id, attribute, value) triple per line; member 103 has the literal
# text "None" as gender and member 104 has an empty age value.
tsv_path_long_missing = "temp_data/long_missing_data.tsv"
tsv_content_long_missing = (
    "\n"  # leading newline: the file starts with an empty line
    "member_id\tattribute\tvalue\n"
    "101\tgender\tMale\n"
    "101\tage\t28\n"
    "102\tgender\tFemale\n"
    "102\tage\t34\n"
    "103\tgender\tNone\n"
    "103\tage\t30\n"
    "104\tgender\tMale\n"
    "104\tage\t\n"
    "105\tgender\tFemale\n"
    "105\tage\t22\n"
)

with open(tsv_path_long_missing, mode="w", encoding="utf-8") as f:
    f.write(tsv_content_long_missing)


# Scenario 4: CSV with mixed types, varying date formats, and partial corruption.
# Amount contains a non-numeric entry ("abc"); both date columns mix several
# layouts and each contains one "invalid-date" placeholder.
csv_path_mixed_corrupt = "temp_data/mixed_corrupt_data.csv"
csv_content_mixed_corrupt = (
    "\n"  # leading newline: the file starts with an empty line
    "OrderID,Customer,OrderDate,Amount,Status,DeliveryDate\n"
    "1001,John Doe,2023-01-01,150.75,Completed,2023-01-05\n"
    "1002,Jane Smith,02/01/2023,abc,Pending,06/01/2023\n"
    "1003,Peter Jones,2023-Mar-03,200.00,Completed,2023-03-07\n"
    "1004,Alice Brown,04.04.2023,75.20,Cancelled,invalid-date\n"
    "1005,Bob White,2023/05/05,120,Completed,2023-May-09\n"
    "1006,Charlie Green,06-Jun-2023,300.00,Pending,10-Jun-2023\n"
    "1007,Diana Prince,07/Jul/2023,100,Completed,11/07/2023\n"
    "1008,Eve Black,invalid-date,50.00,Pending,2023-08-15\n"
)

with open(csv_path_mixed_corrupt, mode="w", encoding="utf-8") as f:
    f.write(csv_content_mixed_corrupt)

  df_excel_raw.loc[len(df_excel_raw)] = [None] * len(df_excel_raw.columns)


### Scenario 1

In [3]:
# Scenario 1: CSV with inconsistent delimiters and encoding issues.
# Run the ingestion pipeline on the mixed-delimiter file, then show the first rows
# of the result followed by the returned metadata object.
# NOTE(review): in the recorded output the comma-separated rows (Bob, David) are left
# unsplit in the `id` column — confirm whether that is the intended behaviour.
df_processed, metadata = process_gym_data(csv_path_bad_delimiter)

print("\n--- Processed DataFrame (Bad Delimiter) ---", df_processed.head(), sep="\n")
print("\n--- Metadata (Bad Delimiter) ---", metadata, sep="\n")


--- Processed DataFrame (Bad Delimiter) ---
                      id     name   age  start_date
0                      1    Alice  30.0  2022-01-15
1    2,Bob,24,16/03/2021     <NA>  <NA>        <NA>
2                      3  Charlie  35.0  2020-11-01
3  4,David,29,05-07-2023     <NA>  <NA>        <NA>

--- Metadata (Bad Delimiter) ---


  datetime_series = pd.to_datetime(df_coerced[col], errors='coerce', dayfirst=True)


### Scenario 2

In [4]:
# Scenario 2: Excel file with junk columns, empty rows, and mixed data types.
# Run the ingestion pipeline on the workbook, then show up to eight rows of the
# result followed by the returned metadata object.
df_processed_excel, metadata_excel = process_gym_data(excel_path_junk_empty)

print(
    "\n--- Processed DataFrame (Junk Columns/Empty Rows) ---",
    df_processed_excel.head(8),
    sep="\n",
)
print("\n--- Metadata (Junk Columns/Empty Rows) ---", metadata_excel, sep="\n")


--- Processed DataFrame (Junk Columns/Empty Rows) ---
  MemberID   Name MembershipType     LastVisit    Fee  IsActive    Junk2
0     M001    Eve           Gold    2023-01-10  100.5      True  garbage
1     M002  Frank         Silver    15/02/2023   75.0     False  garbage
2     M003  Grace           Gold    2023-03-20  100.5      True  garbage
3     M004   <NA>         Bronze  invalid-date   50.0     False  garbage
4     M005  Heidi         Silver    2023-05-01   75.0      True  garbage
5     <NA>    Ivy           Gold          <NA>  100.5     False  garbage
6     M007   Jack         Bronze    2023-06-11   50.0      <NA>  garbage

--- Metadata (Junk Columns/Empty Rows) ---


### Scenario 3

In [5]:
# Scenario 3: TSV with long format and missing values.
# Run the ingestion pipeline on the tab-separated file, then show up to eight rows
# of the result followed by the returned metadata object.
df_processed_tsv, metadata_tsv = process_gym_data(tsv_path_long_missing)

print(
    "\n--- Processed DataFrame (Long Format, Missing Values) ---",
    df_processed_tsv.head(8),
    sep="\n",
)
print("\n--- Metadata (Long Format, Missing Values) ---", metadata_tsv, sep="\n")


--- Processed DataFrame (Long Format, Missing Values) ---
   member_id attribute   value
0        101    gender    Male
1        101       age      28
2        102    gender  Female
3        102       age      34
4        103    gender    <NA>
5        103       age      30
6        104    gender    Male
7        104       age    <NA>

--- Metadata (Long Format, Missing Values) ---


### Scenario 4

In [6]:
# Scenario 4: CSV with mixed types, varying date formats, and partial corruption.
# Run the ingestion pipeline on the order file, then show up to eight rows of the
# result followed by the returned metadata object.
df_processed_mixed, metadata_mixed = process_gym_data(csv_path_mixed_corrupt)

print(
    "\n--- Processed DataFrame (Mixed Types, Corrupt Data) ---",
    df_processed_mixed.head(8),
    sep="\n",
)
print("\n--- Metadata (Mixed Types, Corrupt Data) ---", metadata_mixed, sep="\n")


--- Processed DataFrame (Mixed Types, Corrupt Data) ---
   OrderID       Customer     OrderDate  Amount     Status  DeliveryDate
0     1001       John Doe    2023-01-01  150.75  Completed    2023-01-05
1     1002     Jane Smith    02/01/2023    <NA>    Pending    06/01/2023
2     1003    Peter Jones   2023-Mar-03   200.0  Completed    2023-03-07
3     1004    Alice Brown    04.04.2023    75.2  Cancelled  invalid-date
4     1005      Bob White    2023/05/05   120.0  Completed   2023-May-09
5     1006  Charlie Green   06-Jun-2023   300.0    Pending   10-Jun-2023
6     1007   Diana Prince   07/Jul/2023   100.0  Completed    11/07/2023
7     1008      Eve Black  invalid-date    50.0    Pending    2023-08-15

--- Metadata (Mixed Types, Corrupt Data) ---
