# Task 2 – Data Enrichment

This notebook performs data enrichment on the cleaned dataset `combined_outputs.csv` for CIT5900 Project 3.

In [None]:
import pandas as pd
import os

In [None]:
# Step 1: Read the combined outputs CSV, forcing every column to text so that
# columns with mixed value types do not trigger dtype guessing.
combined_df = pd.read_csv(
    'combined_outputs.csv',
    dtype=str,
)
record_count = len(combined_df)
print(f"Loaded {record_count} records.")

In [None]:
# Step 2: Trim leading/trailing whitespace from every column header
combined_df.columns = combined_df.columns.map(str.strip)

In [None]:
# Step 3: Guarantee the full output schema is present.
# Any expected column that the CSV lacks is created and filled with "".
required_columns = [
    # project-level fields
    'ProjID', 'ProjectStatus', 'ProjectTitle', 'ProjectRDC',
    'ProjectYearStarted', 'ProjectYearEnded', 'ProjectPI',
    # output-level fields
    'OutputTitle', 'OutputBiblio', 'OutputType', 'OutputStatus',
    'OutputVenue', 'OutputYear', 'OutputMonth', 'OutputVolume',
    'OutputNumber', 'OutputPages',
]

missing_columns = [name for name in required_columns if name not in combined_df.columns]
for name in missing_columns:
    combined_df[name] = ""


In [None]:
# Step 4: Sample enrichment — fill blank status/type fields with defaults.
#
# pandas.read_csv parses empty cells as NaN by default (even with dtype=str),
# so matching only the empty string "" would skip every blank that came from
# the file. Treat NaN, "" and whitespace-only values all as "blank".
default_values = {
    'ProjectStatus': "Completed",
    'OutputStatus': "Published",
    'OutputType': "Unknown",
}

for column, default in default_values.items():
    values = combined_df[column]
    # astype(str) keeps the .str accessor usable whatever the column dtype is;
    # real NaN values are caught by isna() rather than by the text comparison.
    is_blank = values.isna() | values.astype(str).str.strip().eq("")
    combined_df[column] = values.mask(is_blank, default)

In [None]:
# Step 5: Convert OutputYear and OutputMonth safely
#
# Both columns end up as nullable integers (Int64): unparseable or missing
# values become <NA>, and the saved CSV shows "2015" rather than "2015.0".
# Values are parsed as plain numbers first, because a date parser is the wrong
# tool for a bare month number such as "5" (it is not read as "month 5").
# Anything non-numeric then falls back to date parsing, so full dates such as
# "2015-03-01" keep working.

# --- Year ---
year_raw = combined_df['OutputYear']
year_numeric = pd.to_numeric(year_raw, errors='coerce')   # "2015", "2015.0"
# Only non-numeric entries are handed to the date parser.
year_from_date = pd.to_datetime(
    year_raw.where(year_numeric.isna()), errors='coerce'
).dt.year
# Keep plausible four-digit years only; anything else is treated as missing.
year_numeric = year_numeric.where(year_numeric.between(1000, 9999))
combined_df['OutputYear'] = (
    year_numeric.fillna(year_from_date).round().astype('Int64')
)

# --- Month ---
month_raw = combined_df['OutputMonth']
month_numeric = pd.to_numeric(month_raw, errors='coerce')  # "5", "05", "5.0"
# Text left over after numeric parsing: month names or full dates.
month_text = month_raw.where(month_numeric.isna()).astype('string').str.strip()
month_candidates = (
    pd.to_datetime(month_text, format='%B', errors='coerce').dt.month,  # "January"
    pd.to_datetime(month_text, format='%b', errors='coerce').dt.month,  # "Jan"
    pd.to_datetime(month_text, errors='coerce').dt.month,               # full dates
)
# Numbers outside 1-12 are not valid months and become missing.
month_values = month_numeric.where(month_numeric.between(1, 12))
for candidate in month_candidates:
    month_values = month_values.fillna(candidate)
combined_df['OutputMonth'] = month_values.round().astype('Int64')

In [None]:
# Step 6: Clean up string columns
#
# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map),
# so strip column by column with Series.map, which exists in every pandas
# version. Numeric columns are returned untouched so their dtype is preserved.
def _strip_text_column(column):
    """Return `column` with surrounding whitespace removed from str values.

    Non-string values (NaN, numbers) are passed through unchanged, and
    numeric-dtype columns are returned as-is.
    """
    if pd.api.types.is_numeric_dtype(column.dtype):
        return column
    return column.map(lambda value: value.strip() if isinstance(value, str) else value)


combined_df = combined_df.apply(_strip_text_column)

In [None]:
# Step 7: Write the enriched frame to disk (the output folder is created on demand)
output_dir = './data/processed'
output_path = './data/processed/enriched_outputs_sample.csv'

os.makedirs(output_dir, exist_ok=True)
combined_df.to_csv(output_path, index=False)
print("Task 2 completed: Enriched CSV saved.")