# In [None]:
'''
INFO_511_ Application Exercise 06: Wildcat Scrape
Author: Todd Adams
Date: 2023-10-02
Description: We are scraping data from the Arizona Daily Wildcat.
'''

# Load Packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the Dataset
# The path is relative, so run this from the directory that contains `data/`.
wildcat = pd.read_csv('data/wildcat.csv')
# A bare `wildcat.head()` is only rendered when it is the last expression of a
# notebook cell; print it so the preview is shown wherever this line sits.
print(wildcat.head())
wildcat.info()

'''
Who are the most prolific authors of the 100 most recent
articles in The Arizona Daily Wildcat?'''

# Convert the date column from month/day/year strings to datetime values so
# the sort below is chronological rather than alphabetical.
wildcat['date'] = pd.to_datetime(wildcat['date'], format='%m/%d/%Y')
# Print the preview explicitly: a bare `.head()` in the middle of a cell or
# script is evaluated and then silently discarded.
print(wildcat['date'].head())
wildcat['date'].info()

# Sort by date in descending order and select the 100 most recent articles.
# The explicit copy makes the subset an independent frame, so columns added to
# it later can never write through to (or warn about) `wildcat`.
most_recent_100 = (
    wildcat.sort_values(by='date', ascending=False)
    .head(100)
    .copy()
)
print(most_recent_100.head())
most_recent_100.info()

# Count how many articles each author has written. value_counts() orders the
# result from most to fewest articles and leaves out rows with a missing author.
author_counts = most_recent_100['author'].value_counts().reset_index()
author_counts.columns = ['author', 'article_count']

# Display the results: the top of the ranking plus the totals behind it
print(author_counts.head())
print(f'Total number of authors: {len(author_counts)}')
print(f'Total number of articles: {most_recent_100.shape[0]}')

'''
Draw a line plot of the number of opinion articles published
per day in The Arizona Daily Wildcat.
'''
# Re-apply the datetime conversion so the grouping below works on real dates
wildcat['date'] = pd.to_datetime(wildcat['date'])

# Tally the articles for each publication date, in chronological order
articles_per_day = wildcat.groupby('date').size().reset_index(name='count')

# Draw the daily tallies as a line with a marker on every date
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=articles_per_day, x='date', y='count', marker='o', ax=ax)
ax.set_title('Number of Opinion Articles Published Per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
# Tilt the date labels so neighbouring ticks do not overlap
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

'''
What percent of the most recent 100 opinion articles in The Arizona Daily Wildcat
mention "climate" in their title?
'''
# Make sure titles are lowercase to match "climate" regardless of capitalization
most_recent_100['title_lower'] = most_recent_100['title'].str.lower()

# Check for the word "climate" in each title. `na=False` counts a missing title
# as "not mentioned" instead of raising a TypeError on the NaN, and
# `regex=False` makes this a plain substring test.
has_climate = most_recent_100['title_lower'].str.contains(
    'climate', regex=False, na=False
)
most_recent_100['climate_mentioned'] = has_climate.map(
    {True: 'mentioned', False: 'not mentioned'}
)

# Share of each label. NOTE: the 'percentage' column holds proportions in the
# 0-1 range (normalize=True); it is scaled to percent only in the summary line.
climate_mentions = most_recent_100['climate_mentioned'].value_counts(normalize=True).reset_index()
climate_mentions.columns = ['climate_mentioned', 'percentage']

# Display the result
print(climate_mentions)
# Take the share straight from the boolean mask. Row 0 of `climate_mentions` is
# whichever label is most frequent (normally 'not mentioned'), so reading
# `.loc[0]` would report the wrong figure -- and 100% when no title matches.
climate_share = has_climate.mean() * 100
print(f"Percentage of articles mentioning 'climate': {climate_share:.2f}%")

