# In [None]:
'''
INFO_511_ Application Exercise 06: Wildcat Scrape
Author: Todd Adams
Date: 2023-10-02
Description: We are analyzing opinion-article data scraped from the Arizona Daily Wildcat.
'''

# Load Packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Load the scraped article data from the project-relative CSV file
wildcat = pd.read_csv('data/wildcat.csv')

# A bare `wildcat.head()` is silently discarded unless it is the last
# expression of a notebook cell, so print the preview explicitly.
print(wildcat.head())
wildcat.info()

'''
Who are the most prolific authors of the 100 most recent
articles in The Arizona Daily Wildcat?'''

# Parse the MM/DD/YYYY date strings into datetimes so they sort chronologically
wildcat['date'] = pd.to_datetime(wildcat['date'], format='%m/%d/%Y')

# Bare `.head()` expressions are discarded when they are not the last
# expression of a cell, so print the previews explicitly.
print(wildcat['date'].head())
wildcat['date'].info()

# Keep the 100 newest articles.
#   * kind='stable' gives articles that share a date a reproducible relative
#     order, so the cut-off at 100 rows does not depend on the sort algorithm.
#   * .copy() makes the subset an independent frame, because later steps add
#     helper columns to it.
most_recent_100 = (
    wildcat
    .sort_values(by='date', ascending=False, kind='stable')
    .head(100)
    .copy()
)
print(most_recent_100.head())
most_recent_100.info()

# Count how many articles each author has written (most prolific first)
author_counts = most_recent_100['author'].value_counts().reset_index()
author_counts.columns = ['author', 'article_count']

# Display the results
print(author_counts.head())
print(f'Total number of authors: {len(author_counts)}')
print(f'Total number of articles: {most_recent_100.shape[0]}')

'''
Draw a line plot of the number of opinion articles published
per day in The Arizona Daily Wildcat.
'''
# Normalize the 'date' column to datetime before counting on it
wildcat['date'] = pd.to_datetime(wildcat['date'])

# Tally the articles for each date, then order the tallies chronologically
daily_totals = wildcat['date'].value_counts().sort_index()
articles_per_day = daily_totals.rename_axis('date').reset_index(name='count')

# Draw the daily counts as a line with a marker at every date that has data
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=articles_per_day, x='date', y='count', marker='o', ax=ax)
ax.set_title('Number of Opinion Articles Published Per Day')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)  # slant the date labels so they do not overlap
fig.tight_layout()
plt.show()

'''
What percent of the most recent 100 opinion articles in The Arizona Daily Wildcat
mention "climate" in their title?
'''
# Lowercase the titles so the match ignores capitalization
most_recent_100['title_lower'] = most_recent_100['title'].str.lower()

# Vectorized substring test. na=False counts a missing title as
# "not mentioned" instead of raising TypeError on a NaN value.
has_climate = most_recent_100['title_lower'].str.contains(
    'climate', regex=False, na=False
)
most_recent_100['climate_mentioned'] = has_climate.map(
    {True: 'mentioned', False: 'not mentioned'}
)

# Share of each category (fractions between 0 and 1). Reindexing guarantees
# both categories are present, with 'mentioned' always in the first row,
# even when one of them never occurs.
climate_mentions = (
    most_recent_100['climate_mentioned']
    .value_counts(normalize=True)
    .reindex(['mentioned', 'not mentioned'], fill_value=0.0)
    .rename_axis('climate_mentioned')
    .reset_index(name='percentage')
)

# value_counts orders rows by frequency, so row 0 is usually the
# 'not mentioned' group. Take the share straight from the boolean mask so
# the reported figure is always the 'mentioned' share.
climate_share = has_climate.mean()

# Display the result
print(climate_mentions)
print(f"Percentage of articles mentioning 'climate': {climate_share * 100:.2f}%")

'''
What percent of the most recent 100 opinion articles in The Arizona Daily Wildcat
mention “election” in their title or abstract?
'''

# If there's no 'abstract' column in the dataset, create an empty one so the
# abstract search below simply finds nothing instead of raising KeyError
if 'abstract' not in most_recent_100.columns:
    most_recent_100['abstract'] = ""

# Lowercase both text fields so the match ignores capitalization
most_recent_100['title_lower'] = most_recent_100['title'].str.lower()
most_recent_100['abstract_lower'] = most_recent_100['abstract'].str.lower()

# Vectorized substring tests. na=False treats a missing title/abstract as
# "no match" instead of raising TypeError on a NaN value.
in_title = most_recent_100['title_lower'].str.contains(
    'election', regex=False, na=False
)
in_abstract = most_recent_100['abstract_lower'].str.contains(
    'election', regex=False, na=False
)
has_election = in_title | in_abstract

# Tag whether 'election' is mentioned in title or abstract
most_recent_100['election_mentioned'] = has_election.map(
    {True: 'mentioned', False: 'not mentioned'}
)

# Share of each category (fractions between 0 and 1). Reindexing guarantees
# both categories are present, with 'mentioned' always in the first row.
election_mentions = (
    most_recent_100['election_mentioned']
    .value_counts(normalize=True)
    .reindex(['mentioned', 'not mentioned'], fill_value=0.0)
    .rename_axis('election_mentioned')
    .reset_index(name='percentage')
)

# value_counts orders rows by frequency, so row 0 is usually the
# 'not mentioned' group. Take the share straight from the boolean mask so
# the reported figure is always the 'mentioned' share.
election_share = has_election.mean()

print(election_mentions)
print(f"Percentage of articles mentioning 'election': {election_share * 100:.2f}%")

'''
On which day of the week are opinion articles most frequently published in The Arizona Daily Wildcat?
'''
# Derive the weekday name (e.g. 'Monday') from each publication date
wildcat['day_of_week'] = wildcat['date'].dt.day_name()

# Tally articles per weekday; value_counts lists the busiest day first
weekday_totals = wildcat['day_of_week'].value_counts()
day_counts = weekday_totals.rename_axis('day').reset_index(name='count')

# Report the table and its first row, which holds the most frequent weekday
print(day_counts)
busiest_day = day_counts.at[0, 'day']
busiest_count = day_counts.at[0, 'count']
print(f"Day with most articles: {busiest_day} with {busiest_count} articles")

# Bars are drawn in calendar order rather than by frequency
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday',
    'Thursday', 'Friday', 'Saturday', 'Sunday',
]
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=day_counts, x='day', y='count', order=weekday_order, ax=ax)
ax.set_title('Opinion Articles Published by Day of the Week')
ax.set_xlabel('Day')
ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()
