In [9]:
import pandas as pd
from google.cloud import bigquery
import os

# Set path to your service account key JSON
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:\Phython\ca-hospital-project-23db597e2610.json"

# Initialize BigQuery client
client = bigquery.Client()

# SQL query to retrieve the patients table
query = """
    SELECT * FROM `ca-hospital-project.CA_Hospital_Project.patients`
"""
df = client.query(query).to_dataframe()

# Create a copy for anonymization
df_anonymized = df.copy()

# Mask first and last name with initials
df_anonymized['first_name'] = df['first_name'].apply(lambda x: x[0] + '****' if pd.notnull(x) else x)
df_anonymized['last_name'] = df['last_name'].apply(lambda x: x[0] + '****' if pd.notnull(x) else x)

# Convert DOB to age groups and drop DOB & age
df_anonymized['age_group'] = pd.cut(df['age'], bins=[0, 20, 30, 40, 50, 60, 70, 120],
                                    labels=['0-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71+'])
df_anonymized.drop(columns=['dob', 'age'], inplace=True)

# Mask email addresses
if 'email' in df.columns:
    df_anonymized['email'] = df['email'].apply(lambda x: x[:2] + '****@***.com' if pd.notnull(x) and len(x) > 5 else x)

# Mask phone numbers
if 'phone' in df.columns:
    df_anonymized['phone'] = df['phone'].apply(lambda x: '****' + x[-3:] if pd.notnull(x) and len(x) >= 3 else x)

# Truncate ZIP code
if 'zip' in df.columns:
    df_anonymized['zip'] = df['zip'].apply(lambda x: int(str(x)[:3]) if pd.notnull(x) else x)

# Drop detailed address
if 'address' in df.columns:
    df_anonymized.drop(columns=['address'], inplace=True)

# Save the anonymized dataset locally
output_path = "anonymized_patients.csv"
df_anonymized.to_csv(output_path, index=False)

print(f"✅ Anonymized file saved successfully as: {output_path}")

  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:\Phython\ca-hospital-project-23db597e2610.json"


✅ Anonymized file saved successfully as: anonymized_patients.csv
