# Week 8 – Case Studies & Data Anonymization

In [None]:
# Install required packages
!pip install faker fairlearn --quiet
# matplotlib and pandas are preinstalled in Colab, but we import them below
print("Packages installed.")


In [None]:
# Generate a synthetic resumes dataset and save as CSV
import pandas as pd
from faker import Faker
import random

# One seed value drives both Faker and the stdlib `random` module, so the
# generated rows are repeatable from run to run.
Faker_seed = 42
fake = Faker()
Faker.seed(Faker_seed)
random.seed(Faker_seed)

def generate_synthetic_resume(i):
    """Build one synthetic resume record.

    Uses the module-level ``fake`` (Faker instance) and the stdlib ``random``
    module, so output is reproducible once both have been seeded.

    Args:
        i: Row identifier; stored as ``id`` and appended to the e-mail local
            part so that addresses stay unique even when names repeat.

    Returns:
        dict with keys ``id``, ``name``, ``email``, ``phone``, ``gender``,
        ``education``, ``years_experience``, ``extra_text`` and ``skills``
        (three distinct skills joined with commas).
    """
    import re  # local import: the cell defining this function does not import `re`

    gender = random.choice(['male','female'])
    # gendered names on purpose: they mimic a realistic gender signal
    name = fake.name_male() if gender=='male' else fake.name_female()
    # Names can include titles or suffixes such as "Mr." or "Jr."; a plain
    # space -> dot substitution would then give "mr..john.smith", which is not
    # a valid local part. Collapse every run of characters other than word
    # characters and hyphens into a single dot and trim dots at both ends.
    local_part = re.sub(r"[^\w-]+", ".", name.lower()).strip(".")
    email = f"{local_part}{i}@example.com"
    phone = fake.phone_number()
    education = random.choice(['BSc Computer Science', 'MSc Data Science', 'BSc Engineering', 'BA Economics'])
    years_exp = random.choice([1,2,3,4,5,6,7,8,10,12])
    # short free text; two of the four options depend on gender (to mimic proxies)
    extra_text = random.choice([
        "Leader of college football club",
        "Member of women's chess club" if gender=='female' else "Member of chess club",
        "Volunteer at local coding bootcamp",
        "Captain of rugby team" if gender=='male' else "Member of dance society"
    ])
    skills = random.sample(['python','java','c++','sql','machine learning','aws','docker','nlp'], k=3)
    return {
        'id': i,
        'name': name,
        'email': email,
        'phone': phone,
        'gender': gender,
        'education': education,
        'years_experience': years_exp,
        'extra_text': extra_text,
        'skills': ','.join(skills)
    }

# Generate 400 records (ids 1..400), collect them into a frame and write the CSV
rows = list(map(generate_synthetic_resume, range(1, 401)))
df = pd.DataFrame(rows)
df.to_csv('synthetic_resumes.csv', index=False)
print('Saved synthetic_resumes.csv with', df.shape[0], 'rows')
df.head()


In [None]:
# PII anonymization using Faker (consistent mapping)
import re
import pandas as pd
from faker import Faker
import hashlib
from collections import defaultdict

# Faker instance used to produce pseudonyms, seeded for repeatable output
fake = Faker()
Faker.seed(42)

# Where the resumes are read from and where the anonymized copy is written
INPUT_CSV = 'synthetic_resumes.csv'
OUTPUT_CSV = 'anonymized_resumes.csv'

# Column names grouped by the kind of PII they hold
NAME_COLUMNS = ['name']
EMAIL_COLUMNS = ['email']
PHONE_COLUMNS = ['phone']
TEXT_COLUMNS = ['extra_text']
SENSITIVE_ID_COLUMNS = ['id']  # hashed instead of being written out as-is

# Patterns for spotting e-mail addresses and phone numbers inside free text
EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
PHONE_RE = re.compile(r'(\+?\d[\d\-\s]{6,}\d)')

# original value -> pseudonym; one independent dict per PII kind so that a
# repeated original always receives the same replacement
name_map, email_map, phone_map = {}, {}, {}

# Pseudonyms already handed out, tracked per PII kind so that two different
# originals are never given the same replacement.
_issued_names = set()
_issued_emails = set()
_issued_phones = set()


def _consistent_pseudonym(orig, mapping, issued, generate, max_attempts=100):
    """Return the pseudonym recorded for ``orig``, creating one if necessary.

    Args:
        orig: Original value. NaN/None and blank strings are returned unchanged.
        mapping: dict of original -> pseudonym; updated in place.
        issued: set of pseudonyms already present in ``mapping``; updated in place.
        generate: Zero-argument callable producing a candidate pseudonym.
        max_attempts: Upper bound on candidates drawn before giving up.

    Returns:
        The pseudonym for ``orig`` (the same one on every call for the same
        original), or ``orig`` itself when it is missing or blank.

    Raises:
        RuntimeError: if no candidate that differs from ``orig`` and from every
            issued pseudonym was produced within ``max_attempts`` draws.
    """
    if pd.isna(orig) or str(orig).strip() == '':
        return orig
    if orig in mapping:
        return mapping[orig]
    # The mapping may have been reset or filled outside this helper; rebuild
    # the lookup set whenever the two have drifted apart.
    if len(issued) != len(mapping):
        issued.clear()
        issued.update(mapping.values())
    for _ in range(max_attempts):
        pseud = generate()
        # Reject a candidate that would reveal the original or merge two
        # different originals into one identity.
        if pseud != orig and pseud not in issued:
            mapping[orig] = pseud
            issued.add(pseud)
            return pseud
    raise RuntimeError(
        f"could not generate a unique pseudonym after {max_attempts} attempts"
    )


def pseudonym_for_name(orig):
    """Map a person name to a consistent fake full name (``fake.name()``)."""
    return _consistent_pseudonym(orig, name_map, _issued_names, fake.name)


def pseudonym_for_email(orig):
    """Map an e-mail address to a consistent fake ``user@free-mail-domain`` address."""
    def _new_email():
        # user name first, then domain
        local = fake.user_name()
        domain = fake.free_email_domain()
        return f"{local}@{domain}"
    return _consistent_pseudonym(orig, email_map, _issued_emails, _new_email)


def pseudonym_for_phone(orig):
    """Map a phone number to a consistent fake phone number (``fake.phone_number()``)."""
    return _consistent_pseudonym(orig, phone_map, _issued_phones, fake.phone_number)

def hash_value(val, salt=''):
    """Return a short, stable hexadecimal token for ``val``.

    The token is the first 12 hex characters of SHA-256 over ``salt`` followed
    by ``str(val)``.

    Args:
        val: Value to hash. NaN/None and blank strings are returned unchanged.
        salt: Optional secret prepended before hashing. With the default empty
            salt the result is the plain truncated SHA-256 of ``str(val)``.
            Without a salt, low-entropy inputs such as small sequential ids
            can be recovered by hashing every candidate, so pass a secret salt
            whenever the tokens must not be reversible.

    Returns:
        A 12-character hex string, or ``val`` itself when it is missing/blank.
    """
    if pd.isna(val) or str(val).strip()=='':
        return val
    payload = (str(salt) + str(val)).encode()
    return hashlib.sha256(payload).hexdigest()[:12]

# Read the resumes that are to be anonymized
df = pd.read_csv(INPUT_CSV)

# Replace each explicit PII column with pseudonyms. Kinds are processed in the
# order name -> email -> phone, and every column is finished before the next starts.
for pii_columns, to_pseudonym in (
    (NAME_COLUMNS, pseudonym_for_name),
    (EMAIL_COLUMNS, pseudonym_for_email),
    (PHONE_COLUMNS, pseudonym_for_phone),
):
    for col in pii_columns:
        if col in df.columns:
            df[col] = df[col].apply(to_pseudonym)

def replace_in_text(text):
    """Replace e-mail addresses and phone numbers found inside free text.

    Every match of ``EMAIL_RE`` is swapped for ``pseudonym_for_email(match)``
    and every match of ``PHONE_RE`` for ``pseudonym_for_phone(match)``.

    Args:
        text: Cell value; anything that is not NaN is converted with ``str``.

    Returns:
        The text with matches replaced, or ``text`` unchanged when it is NaN.
    """
    if pd.isna(text):
        return text
    # `re.sub` rewrites each occurrence exactly once, at the position where it
    # matched. Chained `str.replace` calls would also rewrite the same
    # characters wherever else they appear, e.g. inside a longer match or
    # inside a pseudonym inserted a moment earlier.
    s = EMAIL_RE.sub(lambda m: pseudonym_for_email(m.group(0)), str(text))
    s = PHONE_RE.sub(lambda m: pseudonym_for_phone(m.group(0)), s)
    return s

# scan text columns (the helper is defined once rather than once per column)
for col in TEXT_COLUMNS:
    if col in df.columns:
        df[col] = df[col].apply(replace_in_text)

# Swap raw identifiers for their hashed form in every id column that is present
id_columns_present = [c for c in SENSITIVE_ID_COLUMNS if c in df.columns]
for col in id_columns_present:
    df[col] = df[col].apply(hash_value)

# Write the result and report how many distinct originals each mapping holds
df.to_csv(OUTPUT_CSV, index=False)
print('Saved anonymized file to', OUTPUT_CSV)
n_names, n_emails, n_phones = map(len, (name_map, email_map, phone_map))
print('Mappings sizes -> names:', n_names, 'emails:', n_emails, 'phones:', n_phones)
df.head()


In [None]:
# Simple fairness check: selection rate by gender
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load anonymized data: the CSV written by the anonymization cell
# (name/email/phone replaced by pseudonyms, id hashed). This rebinds `df`.
df = pd.read_csv('anonymized_resumes.csv')

# For demonstration we create a synthetic 'selected' label:
# Simple heuristic: higher years_experience and certain skills increase selection chance
import numpy as np
# Dedicated, seeded generator so the synthetic labels are the same on every run
_SELECTION_RNG = np.random.default_rng(42)

def synthetic_selection(row, rng=None):
    """Return a synthetic 0/1 'selected' label for one resume row.

    The score starts at ``years_experience``, gains 2 if the skills text
    contains 'machine learning' and 1 if it contains 'python', then receives
    standard-normal noise. The row is selected when the score exceeds 5.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with ``years_experience``
            and ``skills`` (comma-separated text; a non-string value such as
            NaN is treated as "no skills").
        rng: Optional object exposing ``normal(loc, scale)``; defaults to the
            seeded module-level generator.

    Returns:
        1 if the noisy score is greater than 5, otherwise 0.
    """
    if rng is None:
        rng = _SELECTION_RNG
    score = row['years_experience']
    skills = row['skills']
    if not isinstance(skills, str):
        # a missing value would make the `in` checks below raise TypeError
        skills = ''
    if 'machine learning' in skills: score += 2
    if 'python' in skills: score += 1
    # random noise
    score += rng.normal(0, 1)
    return 1 if score > 5 else 0

# Label every resume row with the synthetic 0/1 outcome
df['selected'] = df.apply(synthetic_selection, axis=1)

# Mean of the 0/1 labels within each gender group = selection rate per gender
selection_rates = (
    df.groupby('gender')['selected']
    .mean()
    .rename('selection_rate')
    .reset_index()
)
print(selection_rates)

# Bar chart of the per-gender selection rates; saved to disk, then displayed
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(selection_rates['gender'], selection_rates['selection_rate'])
ax.set_title('Selection rate by gender (synthetic label)')
ax.set_xlabel('Gender')
ax.set_ylabel('Selection rate')
ax.set_ylim(0, 1)  # rates are proportions, so pin the axis to [0, 1]
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
fig.savefig('fairness_plot.png', bbox_inches='tight')
plt.show()
print('Saved fairness_plot.png')


## What you get after running this notebook

- `synthetic_resumes.csv` — synthetic example data (no real people)  
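- `anonymized_resumes.csv` — pseudonymized output from the PII script (names, emails and phones replaced, ids hashed; `gender` and `extra_text` are kept for the fairness check)  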
- `fairness_plot.png` — selection rate plot by gender  
- Notebook cells with code you can adapt for real pipelines (remember: never push real PII to public repos)

### Next steps (suggestions)
- Add more sophisticated PII detection (NER with spaCy) for free-text CVs.  
- Integrate fairness metrics from `fairlearn` or `aif360` for deeper analysis.  
- Document everything in `/weeks/week8/ethics_security_privacy/Week8_Case_Anon.md` and publish to your GitHub repo.

---

Optional follow-ups:
- write up the anonymization report as a `.md` file, or
- commit these files to your GitHub repo with git, after checking that no real PII is included.
