In [17]:
# WPV Data Cleaning Notebook - For Lily

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [18]:
# Read the three raw workbooks from the prepare folder. Each frame is tagged
# with a `source` column as it is loaded, so rows stay traceable to their
# original file after the frames are combined.
df_phase1 = pd.read_excel(
    "../../prepare/WVP Data Collection - Phase I - M.xlsm",
    sheet_name=0,
    skiprows=1,
).assign(source='phase1')

df_december = pd.read_excel(
    "../../prepare/MH December 2024 Data.xlsx",
    sheet_name="MHA WPV Data Export December 20",
    header=4,
).assign(source='december')

df_aug_oct = pd.read_excel(
    "../../prepare/MHA WPV events August through October 2024.xlsx",
    sheet_name="MHA WPV events August through O",
    header=1,
).assign(source='aug_oct')

In [19]:
# ========== Mapping Dictionary ==========
# Raw export header -> standardized column name. Passed to DataFrame.rename
# for the frames whose columns use the long descriptive headers.
map_common = dict([
    ('Event Date', 'event_time'),
    ('Occupational Category of Person Affected', 'victim_profession'),
    ('Department/Office Incident Took Place', 'department'),
    ('Aggressor', 'perpetrator_type'),
    ('Type of Violence', 'violence_type'),
    ('Severity of Assault', 'severity'),
    ('Emotional and/ or Psychological Impact', 'emotional_impact'),
    ('Level of Care Needed', 'physical_injury_level'),
    ('Primary Assault Description', 'assault_desc'),
    ('Assault Description1', 'assault_detail'),
    ('Response Action Taken', 'response_action'),
    ('Primary Contributing Factors', 'contributing_factors'),
    ('Incident #', 'incident_id'),
])

# Every entry here maps a name to itself, so renaming with this dict leaves
# column names unchanged.
# NOTE(review): presumably the August-October sheet already uses these
# standardized headers -- confirm against the raw file; if it uses the long
# headers instead, this mapping needs real source names as keys.
map_aug_oct = {
    column: column
    for column in (
        'victim_profession',
        'victim_primary_job',
        'event_time',
        'violence_type',
        'contributing_factors',
        'emotional_impact',
        'severity',
        'severity_level',
        'department',
        'response_action',
        'perpetrator_type',
        'assault_desc',
        'osha_recordable',
        'days_missed',
    )
}

In [20]:
# ========== Preprocessing ==========
def _primary_job(categories):
    """Return the first comma-separated entry of each value, whitespace-stripped.

    Parameters
    ----------
    categories : pandas.Series
        Raw occupational-category values; an entry may list several
        comma-separated categories.

    Returns
    -------
    pandas.Series
        The text before the first comma for each row. Rows whose input was
        missing stay missing rather than becoming the literal string 'nan',
        which is what ``astype(str)`` on its own would produce.
    """
    first_entry = categories.astype(str).str.split(',').str[0].str.strip()
    # Keep the parsed value only where the source value was present.
    return first_entry.where(categories.notna())


df_phase1['victim_primary_job'] = _primary_job(df_phase1['Occupational Category of Person Affected'])
df_december['victim_primary_job'] = _primary_job(df_december['Occupational Category of Person Affected'])

# Optional severity normalization function
def standardize_severity(value):
    """Collapse a free-text severity value into one fixed label.

    The value is lower-cased and searched for the keywords 'none', 'mild',
    'moderate' and 'severe', in that order; the first keyword found decides
    the label ('None', 'Mild', 'Moderate' or 'Severe'). Missing values and
    text containing none of the keywords yield 'Unknown'.
    """
    if pd.isna(value):
        return 'Unknown'

    text = str(value).lower()
    # Order matters: earlier keywords win when several appear in the text.
    keyword_labels = (
        ('none', 'None'),
        ('mild', 'Mild'),
        ('moderate', 'Moderate'),
        ('severe', 'Severe'),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            return label
    return 'Unknown'

In [21]:
# Derive a normalized `severity_level` column from the raw severity text,
# one value at a time, for the two frames that carry 'Severity of Assault'.
df_phase1['severity_level'] = df_phase1['Severity of Assault'].map(standardize_severity)
df_december['severity_level'] = df_december['Severity of Assault'].map(standardize_severity)

In [22]:
# ========== Rename columns ==========
# Apply each frame's header mapping; columns absent from a mapping keep
# their current names.
df_phase1 = df_phase1.rename(map_common, axis='columns')
df_december = df_december.rename(map_common, axis='columns')
df_aug_oct = df_aug_oct.rename(map_aug_oct, axis='columns')

In [23]:
# ========== Combine All ==========
# Stack the three frames row-wise and renumber the rows from 0.
df_all = pd.concat(
    objs=(df_phase1, df_december, df_aug_oct),
    ignore_index=True,
)

In [16]:
# ========== Save Final Merged Dataset ==========
# DataFrame.to_csv does not create missing folders, so make sure the
# destination directory exists before writing; otherwise a fresh checkout
# without the output folder fails with OSError.
output_path = Path("../../cleaned_data/lily/merged_wpv_cleaned.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the file.
df_all.to_csv(output_path, index=False)
print("✅ Cleaned & merged WPV dataset saved for Lily from original prepare files.")

✅ Cleaned & merged WPV dataset saved for Lily from original prepare files.
