In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from collections import Counter
import re
import os

# Print current working directory for debugging: the relative paths used
# below are resolved against it.
print("Current Working Directory:", os.getcwd())

# Location of the raw dataset, relative to the working directory. Kept in one
# place so the read and the error message cannot drift apart.
RAW_DATA_PATH = '../data/raw_analyst_ratings/raw_analyst_ratings.csv'

# Load the dataset. Only the read itself sits inside the try, so an unrelated
# failure is never reported as a missing file.
try:
    data = pd.read_csv(RAW_DATA_PATH)
except FileNotFoundError as err:
    # Re-raise with the expected location, keeping the original error as the cause.
    raise FileNotFoundError(f"CSV file not found. Check the path: '{RAW_DATA_PATH}'") from err
else:
    print("Data loaded successfully. Shape:", data.shape)

# --- 1. Data Cleaning ---
# Report missing values per column before anything is dropped.
print("\nMissing values before cleaning:")
print(data.isnull().sum())

# Drop rows missing any of the columns the rest of the analysis reads.
data = data.dropna(subset=['headline', 'publisher', 'date'])

# --- 2. Date Parsing Fix ---
def parse_dates(date_str):
    """Parse one raw date string into a UTC timestamp.

    Two layouts are attempted in order: ``YYYY-MM-DD HH:MM:SS`` followed by a
    UTC offset, then the same layout without an offset. Because ``utc=True``
    is passed, the result is expressed in UTC in both cases.

    Args:
        date_str: The raw date value to parse.

    Returns:
        The parsed timestamp, or ``pd.NaT`` when neither layout matches.
    """
    candidate_formats = (
        '%Y-%m-%d %H:%M:%S%z',  # timezone-aware layout, tried first
        '%Y-%m-%d %H:%M:%S',    # naive layout, used as the fallback
    )
    for fmt in candidate_formats:
        try:
            return pd.to_datetime(date_str, format=fmt, utc=True)
        except ValueError:
            # This layout did not match; move on to the next one.
            continue
    return pd.NaT

# Parse every raw date value; values matching neither supported layout become NaT.
data['date'] = data['date'].apply(parse_dates)

# Count the parsing failures BEFORE dropping them: counted after the dropna
# the figure is always 0 and reports nothing.
unparsed_dates = data['date'].isna().sum()
data = data.dropna(subset=['date'])  # Remove rows where date parsing failed
print("\nMissing dates after parsing:", unparsed_dates)

# --- 3. Descriptive Statistics ---
# Headline length in characters (vectorized string length).
data['headline_length'] = data['headline'].str.len()
print("\nHeadline Length Statistics:")
print(data['headline_length'].describe())

# Articles per publisher
publisher_counts = data['publisher'].value_counts()
print("\nArticles per Publisher:")
print(publisher_counts.head(10))  # Top 10 publishers

# --- 4. Time Series Analysis ---
# savefig does not create missing directories, so make sure the output
# directory exists before the first figure is written.
os.makedirs('reports', exist_ok=True)

# Publication frequency: number of articles per calendar date, in date order.
data['date_only'] = data['date'].dt.date
date_counts = data['date_only'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
date_counts.plot(kind='line', title='Article Publication Frequency Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.savefig('reports/publication_trend.png')
plt.close()  # Release the figure so it does not leak into the next plot

# Hourly distribution: number of articles per hour of the parsed timestamp.
data['hour'] = data['date'].dt.hour
hourly_counts = data['hour'].value_counts().sort_index()

plt.figure(figsize=(10, 6))
hourly_counts.plot(kind='bar', title='Article Publication by Hour')
plt.xlabel('Hour of Day (UTC)')
plt.ylabel('Number of Articles')
plt.savefig('reports/hourly_publication.png')
plt.close()

# --- 5. Text Analysis (Keyword Frequency and Sentiment) ---
def extract_keywords(text):
    """Return the lowercased word tokens of *text* that are longer than three characters.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in their order of appearance; duplicates are kept.
    """
    keywords = []
    for match in re.finditer(r'\w+', text.lower()):
        token = match.group()
        # Keep only tokens of four or more characters (filters short words).
        if len(token) >= 4:
            keywords.append(token)
    return keywords

# Flatten the per-headline keyword lists into a single Series. A headline with
# no keywords explodes to NaN, so drop those to keep NaN out of the word counts.
all_words = data['headline'].apply(extract_keywords).explode().dropna()
common_words = Counter(all_words).most_common(20)
print("\nTop 20 Common Words in Headlines:")
print(common_words)

# Basic sentiment analysis: TextBlob polarity score of each headline.
data['sentiment'] = data['headline'].apply(lambda x: TextBlob(x).sentiment.polarity)
print("\nSentiment Statistics:")
print(data['sentiment'].describe())

# --- 6. Publisher Analysis ---
# Extract domains if publishers are email addresses: keep the text after the
# last '@'; values without an '@' are kept unchanged.
data['domain'] = data['publisher'].apply(lambda x: x.split('@')[-1] if '@' in str(x) else x)
domain_counts = data['domain'].value_counts()
print("\nTop 10 Publisher Domains:")
print(domain_counts.head(10))

# --- 7. Save Processed Data ---
# Write the cleaned frame, including the derived columns added above, without the index.
data.to_csv('../data/processed_analyst_ratings.csv', index=False)

# Report completion
print("EDA completed. Visualizations saved in reports/. Processed data saved in data/.")

Current Working Directory: c:\Users\Simbo\Desktop\week1-challenge\notebooks
Data loaded successfully. Shape: (1407328, 6)

Missing values before cleaning:
Unnamed: 0    0
headline      0
url           0
publisher     0
date          0
stock         0
dtype: int64

Missing dates after parsing: 0

Headline Length Statistics:
count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64

Articles per Publisher:
publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64

Top 20 Common Words in Headlines:
[('stocks', 161776), ('from', 120805), ('market', 120558), ('shares', 114313), ('repo