In [12]:
# -------------------------------
# Stage 1: Business Analyst Task
# -------------------------------
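# Business Question: What are the top 5 products by revenue in the last quarter, and how does customer sentiment vary for these products?
# Required data points: product_id, sale_price, quantity, sale_date, customer_id, sentiment_score
# NOTE(review): the question and data points above describe product sales, but the Stage 2 code
# below loads patient, doctor, and patient-feedback data -- confirm the intended business question.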

# -------------------------------
# Stage 2: Data Engineer Task
# -------------------------------

import pandas as pd
import matplotlib.pyplot as plt
import os

# Make sure the warehouse folder exists before the pipeline runs.
os.makedirs("data_warehouse", exist_ok=True)

# --- Extract: load the three raw sources ---
patients_df = pd.read_csv("./raw_data/patients_data_with_doctor.csv")
doctors_df = pd.read_csv("./raw_data/doctors_info.csv")
feedback_df = pd.read_json("./raw_data/patient_feedback.json")

# --- Transform ---
# Column headers: trim surrounding whitespace and lower-case, so the join
# keys used below ('doctor_id', 'patient_id') match regardless of source casing.
for _frame in (patients_df, doctors_df, feedback_df):
    _frame.columns = _frame.columns.str.strip().str.lower()

# Parse review_date; errors='coerce' turns unparseable values into NaT
# instead of raising.
feedback_df['review_date'] = pd.to_datetime(feedback_df['review_date'], errors='coerce')

# Remove fully identical rows from each source.
patients_df = patients_df.drop_duplicates()
doctors_df = doctors_df.drop_duplicates()
feedback_df = feedback_df.drop_duplicates()

# Discard rows that lack the values needed downstream: patients without a
# doctor_id, and feedback without a patient_id or a feedback score.
patients_df = patients_df.dropna(subset=['doctor_id'])
feedback_df = feedback_df.dropna(subset=['patient_id', 'patient_feedback_score'])

# Attach doctor attributes to every patient row; the left join keeps
# patients whose doctor_id has no match in doctors_df.
patient_doctor_df = patients_df.merge(doctors_df, how='left', on='doctor_id')

# Attach feedback to the patient/doctor rows; the left join keeps patients
# that have no feedback record.
final_df = patient_doctor_df.merge(feedback_df, how='left', on='patient_id')

# --- Load: persist the combined dataset (without the index column) ---
final_df.to_csv("cleaned_healthcare_data.csv", index=False)

print("ETL complete. Cleaned data saved to 'cleaned_healthcare_data.csv'")


ETL complete. Cleaned data saved to 'cleaned_healthcare_data.csv'
