<a href="https://colab.research.google.com/github/TCU-DCDA/WRIT20833-2025/blob/main/notebooks/exercises/Review_06_Pandas_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# WRIT 20833 Review 06: Pandas for Data Analysis


Learn to analyze cultural datasets using the Pandas library.

**Make a copy:** File > Save a copy in Drive

## Exercise 1: Getting Started with Pandas
Import Pandas and create basic DataFrames with cultural data.

In [None]:
# Import pandas library
import pandas as pd

# Build a small catalogue of novels: each dictionary key becomes a column,
# and each list holds that column's values (all lists have five entries).
books_data = {
    "title": [
        "1984",
        "Pride and Prejudice",
        "The Handmaid's Tale",
        "Beloved",
        "The Great Gatsby",
    ],
    "author": [
        "George Orwell",
        "Jane Austen",
        "Margaret Atwood",
        "Toni Morrison",
        "F. Scott Fitzgerald",
    ],
    "year": [1949, 1813, 1985, 1987, 1925],
    "pages": [328, 432, 311, 275, 180],
    "genre": [
        "Dystopian",
        "Romance",
        "Dystopian",
        "Historical Fiction",
        "Modernist",
    ],
}

# Turn the dictionary into a DataFrame
books_df = pd.DataFrame(books_data)

# Overview of the table: contents, dimensions, column labels and dtypes
print("Books DataFrame:")
print(books_df)
print(f"\nShape: {books_df.shape}")  # (rows, columns)
print(f"\nColumn names: {list(books_df.columns)}")
print("\nData types:")
print(books_df.dtypes)

# Peek at the top of the table
print("\nFirst 3 rows:")
print(books_df.head(3))

## Exercise 2: Exploring and Selecting Data
Practice selecting columns, rows, and filtering data.

In [None]:
# One column: label-based selection of a single name gives back a Series
print("Book titles:")
print(books_df.loc[:, 'title'])

# Several columns: passing a list of names gives back a DataFrame
print("\nTitle and Author:")
print(books_df.loc[:, ['title', 'author']])

# Row filters: build a True/False mask per row, then keep the True rows
print("\nBooks published after 1950:")
published_after_1950 = books_df['year'] > 1950
modern_books = books_df.loc[published_after_1950]
print(modern_books[['title', 'year']])

print("\nBooks longer than 300 pages:")
over_300_pages = books_df['pages'] > 300
long_books = books_df.loc[over_300_pages]
print(long_books[['title', 'pages']])

# Exact-match filter on a text column (genre)
print("\nDystopian books:")
is_dystopian = books_df['genre'] == 'Dystopian'
dystopian_books = books_df.loc[is_dystopian]
print(dystopian_books[['title', 'genre']])

# Exact-match filter on a text column (author)
print("\nBooks by George Orwell:")
is_orwell = books_df['author'] == 'George Orwell'
orwell_books = books_df.loc[is_orwell]
print(orwell_books[['title', 'author']])

## Exercise 3: Basic Statistics and Aggregation
Calculate summary statistics for cultural datasets.

In [None]:
# describe() gives a one-table summary of the numeric columns
print("Numerical columns summary:")
print(books_df.describe())

# Individual measures for the page counts
pages = books_df['pages']
mean_pages = pages.mean()
print("\nPage statistics:")
print("Average pages:", round(mean_pages, 1))
print("Median pages:", pages.median())
print("Total pages:", pages.sum())
print("Minimum pages:", pages.min())
print("Maximum pages:", pages.max())

# Span between the earliest and latest publication year
years = books_df['year']
earliest_year = years.min()
latest_year = years.max()
year_span = latest_year - earliest_year
print("\nPublication year range:")
print("Earliest:", earliest_year)
print("Latest:", latest_year)
print("Year span:", year_span, "years")

# value_counts() tallies how often each distinct value appears
print("\nBooks by genre:")
print(books_df['genre'].value_counts())

print("\nBooks by author:")
print(books_df['author'].value_counts())

## Exercise 4: Working with Larger Cultural Datasets
Create and analyze a more comprehensive cultural dataset.

In [None]:
# Create dataset with cultural works.
# Each key is a column; every list has eight entries, one per work.
cultural_works = {
    'title': [
        'Hamlet', 'The Starry Night', 'Symphony No. 9', 'Citizen Kane',
        'The Great Wave', 'Abbey Road', 'Guernica', 'Casablanca'
    ],
    'creator': [
        'Shakespeare', 'Van Gogh', 'Beethoven', 'Orson Welles',
        'Hokusai', 'The Beatles', 'Pablo Picasso', 'Michael Curtiz'
    ],
    'year': [1601, 1889, 1824, 1941, 1831, 1969, 1937, 1942],
    'medium': [
        'Theater', 'Painting', 'Music', 'Film',
        'Printmaking', 'Music', 'Painting', 'Film'
    ],
    'country': [
        'England', 'France', 'Germany', 'USA',
        'Japan', 'England', 'Spain', 'USA'
    ],
    'influence_score': [95, 88, 92, 85, 78, 89, 91, 82]
}

# Create DataFrame
cultural_df = pd.DataFrame(cultural_works)

print("Cultural Works Dataset:")
print(cultural_df)
print()
print(f"Dataset contains {len(cultural_df)} works")

# Analysis by medium: how many works fall into each medium
print()
print("Works by medium:")
print(cultural_df['medium'].value_counts())

# Analysis by country: how many works come from each country
print()
print("Works by country:")
print(cultural_df['country'].value_counts())

# Basic statistics on influence score across the whole dataset
influence = cultural_df['influence_score']
print()
print("Influence score statistics:")
print(f"Average influence: {round(influence.mean(), 1)}")
print(f"Highest influence: {influence.max()}")
print(f"Lowest influence: {influence.min()}")

# Grouped aggregation: split the rows by medium, then summarize the
# influence scores inside each group. value_counts() above only counts rows;
# groupby() + agg() computes several statistics per category in one step.
influence_by_medium = (
    cultural_df
    .groupby('medium')['influence_score']
    .agg(['count', 'mean', 'max'])
)
print()
print("Influence score by medium (count, mean, max):")
print(influence_by_medium)

## Exercise 5: Data Cleaning and Transformation
Practice common data cleaning tasks.

In [None]:
# A deliberately untidy table: mixed capitalization and years stored as text
messy_data = {
    "artist_name": ["vincent van gogh", "PABLO PICASSO", "Claude Monet", "Georgia O'Keeffe"],
    "birth_year": ["1853", "1881", "1840", "1887"],
    "nationality": ["Dutch", "spanish", "French", "American"],
    "movement": ["Post-Impressionism", "Cubism", "Impressionism", "Modernism"],
}

messy_df = pd.DataFrame(messy_data)
print("Original messy data:")
print(messy_df)

# Data cleaning operations.
# assign() returns a new DataFrame, so messy_df itself is left untouched.
cleaned_df = messy_df.assign(
    # 1. Standardize name capitalization (Title Case)
    artist_name=messy_df['artist_name'].str.title(),
    # 2. Convert birth_year from text to integer
    birth_year=messy_df['birth_year'].astype(int),
    # 3. Standardize nationality capitalization (first letter upper, rest lower)
    nationality=messy_df['nationality'].str.capitalize(),
)

print("\nCleaned data:")
print(cleaned_df)

# Basic analysis on the cleaned table
birth_years = cleaned_df['birth_year']
print("\nAnalysis of cleaned data:")
print(f"Birth year range: {birth_years.min()} - {birth_years.max()}")
print("\nArtists by nationality:")
print(cleaned_df['nationality'].value_counts())
print("\nArtists by movement:")
print(cleaned_df['movement'].value_counts())

## Exercise 6: Sorting and Ranking
Learn to sort and rank cultural data.

In [None]:
# Sorting and ranking data.
# Every step below builds a new DataFrame/Series; cultural_df is not modified.
print("Original order:")
print(cultural_df[['title', 'creator', 'year', 'influence_score']])
print()

# Sort by year (oldest first) - ascending order is the default
print("Sorted by year (oldest first):")
year_sorted = cultural_df.sort_values('year')
print(year_sorted[['title', 'creator', 'year']])
print()

# Sort by influence score (highest first)
print("Sorted by influence score (highest first):")
influence_sorted = cultural_df.sort_values('influence_score', ascending=False)
print(influence_sorted[['title', 'creator', 'influence_score']])
print()

# Sort by multiple columns: medium first, then year within each medium
print("Sorted by medium, then by year:")
multi_sorted = cultural_df.sort_values(['medium', 'year'])
print(multi_sorted[['title', 'medium', 'year']])
print()

# Find highest and lowest influence scores.
# Comparing against max()/min() keeps every row that ties for the extreme.
print("Highest influence score:")
highest = cultural_df[cultural_df['influence_score'] == cultural_df['influence_score'].max()]
print(highest[['title', 'creator', 'influence_score']])
print()

print("Lowest influence score:")
lowest = cultural_df[cultural_df['influence_score'] == cultural_df['influence_score'].min()]
print(lowest[['title', 'creator', 'influence_score']])
print()

# Ranking: rank() gives each row its position instead of reordering rows.
# ascending=False makes the highest score rank 1; method='min' gives tied
# scores the same (lowest) rank. rank() returns floats, so convert to int.
influence_rank = (
    cultural_df['influence_score']
    .rank(ascending=False, method='min')
    .astype(int)
)
ranked = (
    cultural_df[['title', 'creator', 'influence_score']]
    .assign(influence_rank=influence_rank)
    .sort_values('influence_rank')
)
print("Ranked by influence score (1 = highest):")
print(ranked)

## Exercise 7: Your Turn - Analyze Cultural Data
Practice pandas skills with your own cultural dataset.

In [None]:
# TODO: Create your own cultural dataset
# Consider: What cultural domain interests you? (literature, music, art, film, etc.)
# Include at least 8-10 items with 4-5 attributes each

# Starts out empty; fill it in following the commented example structure.
your_cultural_data = {
    # TODO: Replace with your data
    # Example structure:
    # 'name': ['Item 1', 'Item 2', 'Item 3', ...],
    # 'creator': ['Creator 1', 'Creator 2', 'Creator 3', ...],
    # 'year': [year1, year2, year3, ...],
    # 'category': ['Category 1', 'Category 2', 'Category 1', ...],
    # 'rating': [rating1, rating2, rating3, ...]
}

# TODO: Create DataFrame from your data
# your_df = pd.DataFrame(your_cultural_data)

# Stand-in dataset so the analysis below runs before you add your own data
# (replace with your actual data)
placeholder_data = {
    "item": ["Item A", "Item B", "Item C", "Item D"],
    "creator": ["Creator 1", "Creator 2", "Creator 1", "Creator 3"],
    "year": [2000, 1995, 2005, 2010],
    "category": ["Type X", "Type Y", "Type X", "Type Z"],
    "score": [85, 92, 78, 88],
}

your_df = pd.DataFrame(placeholder_data)

print("Your Cultural Dataset:")
print(your_df)

# TODO: Perform analysis on your data
print("\nBasic statistics:")
print(your_df.describe())

print("\nCategory distribution:")
print(your_df['category'].value_counts())

# Simple analysis: keep every row whose score equals the top score
print("\nHighest scoring item:")
has_top_score = your_df['score'] == your_df['score'].max()
highest_score = your_df.loc[has_top_score]
print(highest_score)

# Keep only the rows credited to one creator
print("\nItems by Creator 1:")
by_creator_1 = your_df['creator'] == 'Creator 1'
creator1_items = your_df.loc[by_creator_1]
print(creator1_items)

# Order the rows from highest to lowest score
print("\nItems sorted by score:")
sorted_items = your_df.sort_values('score', ascending=False)
print(sorted_items[['item', 'creator', 'score']])

print()
print("=" * 50)
print("TODO: Customize this section with your own cultural data and research questions!")

## Summary

You learned:
- Creating and exploring DataFrames with cultural data
- Selecting, filtering, and querying datasets
- Calculating statistics and performing aggregations
- Cleaning and transforming messy data
- Sorting, ranking, and comparing cultural works
- Grouping data by categories for analysis

**Key Pandas Skills:**
- `pd.DataFrame()` - Create DataFrames
- `.head()`, `.describe()`, `.shape` - Explore data
- `df[column]`, `df[condition]` - Select and filter
- `.groupby()`, `.agg()` - Aggregate data
- `.sort_values()`, `.rank()` - Order data
- `.str.title()`, `.astype()` - Clean data

**Next:** Review 07 will cover text analysis and sentiment analysis.

---
