In [None]:
%pip install textblob

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob

In [None]:
# Read the raw analyst-ratings CSV into a DataFrame
file_path = '../data/raw_analyst_ratings.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Basic Inspection: preview the first rows, then the column dtypes / non-null counts
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

In [None]:
# Count the null entries in every column
missing_per_column = data.isnull().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

In [None]:
# Handle Missing Values
# Keep only the rows in which every column is populated
data = data.dropna(axis=0, how='any')

# Re-run the per-column null count to confirm nothing is left
remaining_missing = data.isnull().sum()
print("\nMissing Values After Handling:")
print(remaining_missing)

In [None]:
# Descriptive Statistics
# Store the character count of each headline next to the raw text
data['headline_length'] = data['headline'].map(len)
length_summary = data['headline_length'].describe()
print("\nHeadline Length Statistics:", length_summary, sep="\n")

In [None]:
# Publisher Analysis: number of articles attributed to each publisher,
# ordered from most to fewest articles by value_counts()
publisher_counts = data['publisher'].value_counts()
print("\nTop Publishers:", publisher_counts.head(10), sep="\n")

In [None]:
# Publication Date Trends
# Parse the ISO 8601 timestamps into timezone-aware UTC datetimes
parsed_dates = pd.to_datetime(data['date'], format='ISO8601', utc=True)
data['date'] = parsed_dates
# Weekday name (e.g. "Monday") of each publication timestamp
data['day_of_week'] = parsed_dates.dt.day_name()
weekday_counts = data['day_of_week'].value_counts()
print("\nPublication Count by Day of Week:", weekday_counts, sep="\n")


In [None]:
# Visualization - Headline Length Distribution
# Histogram of headline character counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['headline_length'], bins=30, edgecolor='k', alpha=0.7)
ax.set_title('Headline Length Distribution')
ax.set_xlabel('Length of Headline')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Visualization - Publisher Contributions
# Slice once and derive the number quoted in the title from the slice that is
# actually plotted, so the title always matches the number of bars drawn
# (including the case where there are fewer than 15 publishers).
top_publishers = publisher_counts.head(15)

plt.figure(figsize=(10, 6))
top_publishers.plot(kind='bar', color='skyblue')
plt.title(f'Top {len(top_publishers)} Publishers by Number of Articles')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Text Analysis - Sentiment Analysis
def get_sentiment(text):
    """Return the TextBlob sentiment polarity computed for `text`."""
    return TextBlob(text).sentiment.polarity

In [None]:
# Score every headline and keep the polarity as a new column
data['sentiment'] = data['headline'].map(get_sentiment)
sentiment_summary = data['sentiment'].describe()
print("\nSentiment Statistics:", sentiment_summary, sep="\n")

In [None]:
# Visualization - Sentiment Distribution
# Histogram of the per-headline sentiment scores, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(data['sentiment'], bins=30, edgecolor='k', alpha=0.7)
ax.set(
    title='Sentiment Score Distribution',
    xlabel='Sentiment Score',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Keyword Extraction
# max_features=10 restricts the vocabulary to the ten terms with the highest
# corpus frequency, after English stop words have been removed.
vectorizer = CountVectorizer(stop_words='english', max_features=10)
X = vectorizer.fit_transform(data['headline'])
keywords = vectorizer.get_feature_names_out()

# get_feature_names_out() lists terms in vocabulary (alphabetical) order and
# carries no counts, so sum each term's column of the document-term matrix and
# rank the totals to show how common each keyword actually is.
# np.asarray(...).ravel() flattens the 1 x n_terms result of the sparse sum.
keyword_counts = pd.Series(
    np.asarray(X.sum(axis=0)).ravel(), index=keywords, name='count'
).sort_values(ascending=False)
print("\nTop Keywords:")
print(keyword_counts)

In [None]:
# Time Series Analysis - Publication Times
# Hour component (0-23) of each timestamp in the `date` column
data['hour'] = data['date'].dt.hour

# Articles per hour, padded with zeros so all 24 hours appear on the axis
hourly_distribution = (
    data['hour']
    .value_counts()
    .sort_index()
    .reindex(range(24), fill_value=0)
)

fig, ax = plt.subplots(figsize=(10, 6))
hourly_distribution.plot(kind='bar', color='orange', ax=ax)
ax.set_title('Publication Count by Hour')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Number of Articles')
plt.show()

In [None]:
# Persist the enriched DataFrame as CSV, leaving out the index column
output_path = '../data/processed_financial_news.csv'
data.to_csv(output_path, index=False)