<a href="https://colab.research.google.com/github/TCU-DCDA/WRIT20833-2025/blob/main/notebooks/exercises/Review_05_Data_Ethics_Collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WRIT 20833 Review 05: Data Ethics & Collection Methods

**Student Name:** ___________________  
**Date:** ___________________  

Explore ethical data collection and responsible research practices.

**Make a copy:** File > Save a copy in Drive

## Exercise 1: Understanding Data Sources
Analyze different types of cultural data and their origins.

In [None]:
# Different types of cultural data sources
from collections import Counter

# Each entry describes one family of cultural data: what it is, typical
# examples, the ethical concerns it raises, and how researchers obtain it.
data_sources = {
    "social_media": {
        "type": "User-generated content",
        "examples": ["Twitter posts", "Instagram captions", "TikTok comments"],
        "ethical_concerns": ["Privacy", "Consent", "Context collapse"],
        "access_method": "APIs or scraping"
    },
    "historical_archives": {
        "type": "Digitized materials",
        "examples": ["Letters", "Newspapers", "Government records"],
        "ethical_concerns": ["Copyright", "Representation", "Missing voices"],
        "access_method": "Digital libraries"
    },
    "interviews": {
        "type": "Collected testimony",
        "examples": ["Oral histories", "Surveys", "Focus groups"],
        "ethical_concerns": ["Informed consent", "Anonymity", "Power dynamics"],
        "access_method": "Direct collection"
    },
    "public_records": {
        "type": "Official documents",
        "examples": ["Census data", "Court records", "Legislative proceedings"],
        "ethical_concerns": ["Individual privacy", "Contextual accuracy"],
        "access_method": "Government databases"
    }
}

# Analyze each source type
print("CULTURAL DATA SOURCE ANALYSIS")
print("=" * 40)

for source_name, details in data_sources.items():
    print(f"\n{source_name.replace('_', ' ').title()}:")
    print(f"  Type: {details['type']}")
    print(f"  Examples: {', '.join(details['examples'])}")
    print(f"  Access: {details['access_method']}")
    print(f"  Key ethical concerns: {', '.join(details['ethical_concerns'])}")

# Count ethical concerns
all_concerns = []
for details in data_sources.values():
    all_concerns.extend(details['ethical_concerns'])

# Tally how often each concern appears. Printing a raw set would show every
# concern in an arbitrary order; Counter ranks them by frequency and breaks
# ties by first appearance, so the output is meaningful and reproducible.
concern_counts = Counter(all_concerns)
top_concerns = ", ".join(
    f"{concern} ({count})" for concern, count in concern_counts.most_common(3)
)

print(f"\nTotal unique ethical concerns identified: {len(concern_counts)}")
print(f"Most common concerns: {top_concerns}")

## Exercise 2: Evaluating Data Collection Methods
Practice assessing the ethics of different collection approaches.

In [None]:
# Function to evaluate data collection ethics
def evaluate_collection_method(method_name, description, consent_level, privacy_impact, potential_harm):
    """Score a data collection method on consent, privacy, and harm.

    Each factor contributes 0-3 points (higher = more ethical), for a total
    between 0 and 9. The evaluation is printed and also returned.

    Args:
        method_name: Short label for the method.
        description: One-sentence description of what the method does.
        consent_level: One of "explicit", "implied", or "none".
        privacy_impact: One of "low", "medium", or "high".
        potential_harm: One of "minimal", "moderate", or "significant".

    Returns:
        A dict with keys "method", "score", and "rating".

    Raises:
        KeyError: If any of the three level arguments is not a listed value.
    """
    # Calculate ethics score (higher = more ethical)
    consent_score = {"explicit": 3, "implied": 2, "none": 0}[consent_level]
    privacy_score = {"low": 3, "medium": 2, "high": 0}[privacy_impact]  # Lower impact = higher score
    harm_score = {"minimal": 3, "moderate": 2, "significant": 0}[potential_harm]

    total_score = consent_score + privacy_score + harm_score
    max_score = 9  # three factors, three points each

    # Determine ethics rating
    if total_score >= 8:
        rating = "Highly Ethical"
    elif total_score >= 6:
        rating = "Moderately Ethical"
    elif total_score >= 4:
        rating = "Ethically Questionable"
    else:
        rating = "Ethically Problematic"

    print(f"Method: {method_name}")
    print(f"Description: {description}")
    print(f"Consent Level: {consent_level}")
    print(f"Privacy Impact: {privacy_impact}")
    print(f"Potential Harm: {potential_harm}")
    print(f"Ethics Score: {total_score}/{max_score}")
    print(f"Rating: {rating}")
    print("-" * 30)

    return {"method": method_name, "score": total_score, "rating": rating}

# Evaluate different collection methods
# Each row lists the values in the same order as _METHOD_FIELDS, which is
# also the positional order evaluate_collection_method expects.
_METHOD_FIELDS = ("name", "description", "consent", "privacy", "harm")
methods = [
    dict(zip(_METHOD_FIELDS, row))
    for row in (
        (
            "Voluntary Survey",
            "Participants voluntarily complete a survey about their cultural practices",
            "explicit",
            "low",
            "minimal",
        ),
        (
            "Public Social Media Scraping",
            "Collecting public posts from social media without notification",
            "none",
            "medium",
            "moderate",
        ),
        (
            "Historical Archive Digitization",
            "Digitizing letters and documents from historical archives",
            "implied",
            "low",
            "minimal",
        ),
        (
            "Covert Observation",
            "Secretly recording conversations in public spaces",
            "none",
            "high",
            "significant",
        ),
    )
]

# Run every method through the evaluator, keeping the returned summaries.
results = []
for method in methods:
    result = evaluate_collection_method(*(method[field] for field in _METHOD_FIELDS))
    results.append(result)

# Summary of results, best score first
print("\nSUMMARY OF ETHICAL EVALUATIONS:")
for result in sorted(results, key=lambda entry: entry["score"], reverse=True):
    print("{method}: {rating} (Score: {score}/9)".format(**result))

## Exercise 3: Consent and Privacy Analysis
Examine consent models and privacy considerations.

In [None]:
# Different models of consent
# Keyed by model name; every model carries a description plus three lists
# (advantages, disadvantages, best_for).
consent_models = dict(
    opt_in=dict(
        description="Users must actively choose to participate",
        advantages=["Clear consent", "Informed participation", "Higher ethical standard"],
        disadvantages=["Lower participation rates", "Selection bias", "More complex logistics"],
        best_for=["Sensitive topics", "Vulnerable populations", "Long-term studies"],
    ),
    opt_out=dict(
        description="Users are included by default but can choose to leave",
        advantages=["Higher participation", "More representative samples", "Easier logistics"],
        disadvantages=["Questionable consent", "May include unwilling participants", "Ethical concerns"],
        best_for=["Low-risk research", "Public data analysis", "Administrative studies"],
    ),
    public_domain=dict(
        description="Data is already publicly available",
        advantages=["No consent needed", "Large datasets", "Historical continuity"],
        disadvantages=["Context collapse", "Unintended use", "Privacy erosion"],
        best_for=["Historical analysis", "Public discourse studies", "Large-scale patterns"],
    ),
)

# Function to analyze consent appropriateness
def analyze_consent_model(research_context, data_sensitivity, population_vulnerability):
    """Recommend a consent model based on research parameters.

    Rules are checked in order; the first match wins:
      1. High sensitivity or a highly vulnerable population -> "opt_in".
      2. Medium sensitivity and medium vulnerability -> "opt_in".
      3. Historical context with low sensitivity -> "public_domain".
      4. Anything else -> "opt_out".

    Args:
        research_context: Kind of study; only "historical" is treated specially.
        data_sensitivity: Compared against "low", "medium", and "high".
        population_vulnerability: Compared against "medium" and "high".

    Returns:
        A (recommendation, reason) tuple, where recommendation is one of
        "opt_in", "opt_out", or "public_domain".
    """
    if data_sensitivity == "high" or population_vulnerability == "high":
        return "opt_in", "High sensitivity or vulnerable population requires explicit consent"

    if data_sensitivity == "medium" and population_vulnerability == "medium":
        # Err on the side of caution when both factors are moderate.
        return "opt_in", "Moderate risk factors suggest need for explicit consent"

    if research_context == "historical" and data_sensitivity == "low":
        return "public_domain", "Historical public data with low sensitivity"

    return "opt_out", "Low risk factors allow for opt-out model with safeguards"

# Test different research scenarios
scenarios = [
    dict(
        context="social_media",
        sensitivity="medium",
        vulnerability="low",
        description="Analyzing public tweets about movies",
    ),
    dict(
        context="interviews",
        sensitivity="high",
        vulnerability="high",
        description="Interviewing trauma survivors",
    ),
    dict(
        context="historical",
        sensitivity="low",
        vulnerability="low",
        description="Analyzing 19th century newspapers",
    ),
    dict(
        context="survey",
        sensitivity="medium",
        vulnerability="medium",
        description="Student academic experiences",
    ),
]

print("CONSENT MODEL RECOMMENDATIONS")
print(40 * "=")

for scenario in scenarios:
    # Ask the decision function which consent model fits this scenario.
    recommendation, reason = analyze_consent_model(
        scenario["context"], scenario["sensitivity"], scenario["vulnerability"]
    )

    print("\nScenario: {description}".format(**scenario))
    print("Context: {context} | Sensitivity: {sensitivity} | Vulnerability: {vulnerability}".format(**scenario))
    print("Recommended Model: " + "-".join(recommendation.split("_")).title())
    print("Reason: " + reason)

    # Show model details: the description and the first two advantages
    model_info = consent_models[recommendation]
    leading_advantages = model_info["advantages"][:2]
    print("Model Description: " + model_info["description"])
    print("Key Advantages: " + ", ".join(leading_advantages))

## Exercise 4: Bias and Representation
Identify potential biases in cultural datasets.

In [None]:
# Function to analyze dataset representation
def analyze_dataset_bias(dataset_name, collection_method, source_demographics, missing_groups):
    """Flag likely biases in a cultural dataset using keyword checks.

    The checks are case-insensitive substring matches:
      - "online" in the collection method -> digital divide bias
      - "english" in the demographics -> language bias
      - "urban" in the demographics -> geographic bias
      - "college" or "university" in the demographics -> educational bias
      - any non-empty missing_groups -> systematic exclusion bias

    Risk is "Low" for 0-1 flagged biases, "Medium" for 2-3, "High" for 4+.
    The analysis is printed and a summary is returned.

    Args:
        dataset_name: Label for the dataset.
        collection_method: Free-text description of how data was gathered.
        source_demographics: Free-text description of who is represented.
        missing_groups: Free-text list of absent groups; empty means none.

    Returns:
        A dict with keys "dataset", "bias_count", and "risk".
    """
    # Lowercase once so every keyword check is case-insensitive.
    method_text = collection_method.lower()
    demographics_text = source_demographics.lower()

    # Identify bias types
    bias_types = []

    if "online" in method_text:
        bias_types.append("Digital divide bias")

    if "english" in demographics_text:
        bias_types.append("Language bias")

    if "urban" in demographics_text:
        bias_types.append("Geographic bias")

    if "college" in demographics_text or "university" in demographics_text:
        bias_types.append("Educational bias")

    if missing_groups:
        bias_types.append("Systematic exclusion bias")

    # Calculate bias risk from the number of flagged bias types
    bias_count = len(bias_types)
    if bias_count <= 1:
        risk_level = "Low"
    elif bias_count <= 3:
        risk_level = "Medium"
    else:
        risk_level = "High"

    print(f"Dataset: {dataset_name}")
    print(f"Collection Method: {collection_method}")
    print(f"Source Demographics: {source_demographics}")
    print(f"Missing Groups: {missing_groups or 'None identified'}")
    print(f"Identified Bias Types: {bias_types or ['None identified']}")
    print(f"Bias Risk Level: {risk_level}")
    print("-" * 40)

    return {"dataset": dataset_name, "bias_count": bias_count, "risk": risk_level}

# Sample datasets to analyze
datasets = [
    dict(
        name="Twitter Literature Discussions",
        method="Online social media scraping",
        demographics="Primarily English-speaking, urban, college-educated users",
        missing="Rural communities, non-English speakers, older adults",
    ),
    dict(
        name="Historical Newspaper Archive",
        method="Digital archive access",
        demographics="Major city newspapers, English language, 1900-2000",
        missing="Community papers, minority-owned publications, non-English press",
    ),
    dict(
        name="Community Survey on Cultural Practices",
        method="Door-to-door interviews in multiple languages",
        demographics="Representative sample across age, income, ethnicity, geography",
        missing="",
    ),
    dict(
        name="University Student Blogs",
        method="Web scraping of student publications",
        demographics="University students, primarily traditional college age, English",
        missing="Non-students, working adults, community college perspectives",
    ),
]

print("DATASET BIAS ANALYSIS")
print(40 * "=")

# Analyze every dataset and keep the returned summaries for the tally below.
results = []
for dataset in datasets:
    result = analyze_dataset_bias(
        dataset["name"], dataset["method"], dataset["demographics"], dataset["missing"]
    )
    results.append(result)

# Summary statistics: how many datasets landed in each risk tier
risk_levels = [entry["risk"] for entry in results]
high_risk = risk_levels.count("High")
medium_risk = risk_levels.count("Medium")
low_risk = risk_levels.count("Low")
elevated_risk = high_risk + medium_risk

print("\nBIAS RISK SUMMARY:")
print(f"High Risk: {high_risk} datasets")
print(f"Medium Risk: {medium_risk} datasets")
print(f"Low Risk: {low_risk} datasets")
print(f"\nRecommendation: Focus mitigation efforts on {elevated_risk} datasets with elevated bias risk.")

## Exercise 5: Ethical Decision Framework
Practice making ethical decisions about data use.

In [None]:
# Ethical decision-making framework
def _print_numbered_steps(items):
    """Print each item on its own two-space-indented line, numbered from 1."""
    for number, item in enumerate(items, 1):
        print(f"  {number}. {item}")


def ethical_decision_framework(research_question, data_source, potential_benefits, potential_harms, alternatives):
    """Print a four-step ethics review and return an initial recommendation.

    The steps list the benefits, the potential harms, the alternative
    approaches, and a fixed set of guidance questions. The recommendation is
    a simple heuristic based only on how many harms and alternatives were
    supplied; it is a starting point for discussion, not a verdict.

    Args:
        research_question: The question the study is trying to answer.
        data_source: The data the researcher proposes to use.
        potential_benefits: Iterable of benefit descriptions.
        potential_harms: Sized collection of harm descriptions.
        alternatives: Sized collection of alternative approaches.

    Returns:
        One of "Consider alternatives first",
        "High risk - requires strong justification", or
        "Proceed with careful safeguards".
    """
    print("ETHICAL DECISION FRAMEWORK")
    print(f"Research Question: {research_question}")
    print(f"Proposed Data Source: {data_source}")
    print()

    # Step 1: Benefits analysis
    print("STEP 1: Benefits Analysis")
    _print_numbered_steps(potential_benefits)

    # Step 2: Harm assessment
    print("\nSTEP 2: Potential Harms")
    _print_numbered_steps(potential_harms)

    # Step 3: Alternative approaches
    print("\nSTEP 3: Alternative Approaches")
    _print_numbered_steps(alternatives)

    # Step 4: Decision guidance
    print("\nSTEP 4: Decision Guidance Questions")
    questions = [
        "Do the benefits clearly outweigh the harms?",
        "Have you minimized potential harms through design choices?",
        "Are there less harmful alternatives that could answer your question?",
        "Would the people whose data you're using consent if they knew?",
        "Does your research serve the interests of the communities studied?"
    ]
    _print_numbered_steps(questions)

    # Simple recommendation logic based on the number of harms/alternatives
    harm_count = len(potential_harms)
    alt_count = len(alternatives)

    if harm_count <= 1 and alt_count >= 2:
        recommendation = "Consider alternatives first"
    elif harm_count >= 3:
        recommendation = "High risk - requires strong justification"
    else:
        recommendation = "Proceed with careful safeguards"

    print(f"\nINITIAL RECOMMENDATION: {recommendation}")
    print("\nNext Steps: Consult with IRB, advisors, and community stakeholders.")

    return recommendation

# Test case: Social media research
# The keys match ethical_decision_framework's parameter names, so the whole
# case can be passed with ** unpacking.
mental_health_case = {
    "research_question": "How do young people discuss mental health on social media?",
    "data_source": "Public Twitter posts containing mental health keywords",
    "potential_benefits": [
        "Better understanding of youth mental health discourse",
        "Inform mental health support programs",
        "Identify patterns that could help early intervention",
    ],
    "potential_harms": [
        "Privacy violation for vulnerable individuals",
        "Risk of re-identification despite public posts",
        "Potential stigmatization of communities",
        "Taking posts out of original context",
    ],
    "alternatives": [
        "Partner with mental health organizations for voluntary participation",
        "Use synthetic data based on patterns rather than actual posts",
        "Focus on aggregate trends rather than individual posts",
        "Conduct interviews with explicit informed consent",
    ],
}

recommendation = ethical_decision_framework(**mental_health_case)

## Exercise 6: Your Research Ethics Plan
Develop an ethics plan for your own research interests.

In [None]:
# The four variables in this cell are the inputs to display_ethics_plan,
# which is called at the end of the cell. Replace the placeholder strings and
# fill in the empty lists; the code runs as-is with the placeholders in place.

# TODO: Define your research area and data needs
# Replace each placeholder string (the value after the colon) with your own
# answer; leave the keys unchanged.
your_research = {
    "field": "Your field of study (e.g., literature, history, art, etc.)",
    "question": "Your specific research question",
    "data_needed": "What kind of data would help answer your question?",
    "population": "Who/what would you be studying?",
    "timeframe": "Historical period or contemporary?"
}

# TODO: Identify potential data sources for your research
# Add one quoted string per source, separated by commas.
potential_sources = [
    # Add your potential data sources here
    # Examples: "Digital archives", "Social media posts", "Interviews", etc.
]

# TODO: Consider ethical implications
# Each value is a list of quoted strings. Categories left empty are skipped
# when the plan is displayed.
ethical_considerations = {
    "consent_challenges": [],  # What makes consent difficult in your field?
    "privacy_risks": [],       # What privacy risks exist?
    "representation_gaps": [], # Who might be excluded from your data?
    "potential_harms": [],     # How could your research cause harm?
    "community_benefits": []   # How does your research serve the communities studied?
}

# TODO: Develop mitigation strategies
# Add one quoted string per strategy, separated by commas.
mitigation_strategies = [
    # Add your strategies for addressing ethical concerns
    # Examples: "Partner with community organizations", "Use anonymization", etc.
]

# Function to display your ethics plan
def display_ethics_plan(research_info, sources, considerations, mitigations):
    """Print a formatted research ethics plan.

    Sections with nothing to show print a "(none listed yet)" placeholder
    instead of a bare header, so an unfinished plan is visibly unfinished.

    Args:
        research_info: Dict of overview fields; keys are shown in title case
            with underscores replaced by spaces.
        sources: List of potential data sources.
        considerations: Dict mapping a category name to a list of items;
            categories with an empty list are skipped.
        mitigations: List of mitigation strategies.

    Returns:
        None. All output goes to stdout.
    """
    empty_note = "  (none listed yet)"

    def print_numbered(items):
        # Numbered list, or the placeholder when the list is empty.
        if not items:
            print(empty_note)
            return
        for i, item in enumerate(items, 1):
            print(f"  {i}. {item}")

    print("YOUR RESEARCH ETHICS PLAN")
    print("=" * 40)

    print("RESEARCH OVERVIEW:")
    for key, value in research_info.items():
        print(f"  {key.replace('_', ' ').title()}: {value}")

    print("\nPOTENTIAL DATA SOURCES:")
    print_numbered(sources)

    print("\nETHICAL CONSIDERATIONS:")
    # Only show categories that have items
    filled = {category: items for category, items in considerations.items() if items}
    if not filled:
        print(empty_note)
    for category, items in filled.items():
        print(f"  {category.replace('_', ' ').title()}:")
        for item in items:
            print(f"    - {item}")

    print("\nMITIGATION STRATEGIES:")
    print_numbered(mitigations)

    print("\nNEXT STEPS:")
    print("  1. Consult with faculty advisor about ethical considerations")
    print("  2. Research IRB requirements for your institution")
    print("  3. Identify relevant community stakeholders to consult")
    print("  4. Develop detailed data management and privacy protocols")

# Display your plan (will show placeholder text until you customize it)
display_ethics_plan(
    research_info=your_research,
    sources=potential_sources,
    considerations=ethical_considerations,
    mitigations=mitigation_strategies,
)

# Closing divider and reminder
print(f"\n{'=' * 40}")
print("REFLECTION: Customize the variables above with your specific research interests and ethical considerations.")

## Summary

You explored:
- Different types of cultural data sources and their ethical implications
- Methods for evaluating data collection approaches
- Consent models and privacy considerations
- Identifying and addressing bias in datasets
- Frameworks for ethical decision-making
- Developing ethics plans for your own research

**Key Principles:**
- Prioritize consent and transparency
- Consider potential harms and benefits
- Address representation and bias
- Serve the communities you study
- Consult with stakeholders and ethics boards

**Next:** Review 06 will cover Pandas for data analysis.

 