In [6]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

# Fetch every NLTK resource this notebook relies on (tokenizer models,
# stopword list, WordNet data); already-present packages are left as-is.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Load the merged FOMC statement / market data table
df = pd.read_csv('merged_fomc_market_data.csv')

# Lazily populated cache so the stopword set and the lemmatizer are built
# once instead of on every call to preprocess_text.
_NLP_CACHE = {}


def _nlp_resources():
    """Return the shared (stop_words, lemmatizer) pair, creating it on first use."""
    if not _NLP_CACHE:
        _NLP_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _NLP_CACHE['stop_words'], _NLP_CACHE['lemmatizer']


def preprocess_text(text):
    """Turn raw text into a list of lemmatized, stopword-free tokens.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, split on whitespace, drop English stopwords, lemmatize.

    Args:
        text: The document to process. Non-string values (for example the
            NaN pandas yields for an empty CSV cell) are treated as empty.

    Returns:
        A list of token strings; empty when there is nothing to tokenize.
    """
    # Missing values arrive as float NaN / None; calling .lower() on them
    # would raise AttributeError, so treat them as empty documents.
    if not isinstance(text, str):
        return []

    # Remove special characters and numbers after lowercasing
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Simple whitespace tokenization
    tokens = cleaned.split()
    if not tokens:
        # Nothing left to filter; skip loading the NLTK resources.
        return []

    stop_words, lemmatizer = _nlp_resources()
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

# Tokenize every statement and pool the tokens into one flat list
all_tokens = [
    token
    for document in df['clean_text']
    for token in preprocess_text(document)
]

# Count how often each token occurs across the whole corpus
word_freq = Counter(all_tokens)

# Tabulate the counts: one row per word, most frequent first
freq_df = (
    pd.DataFrame.from_dict(word_freq, orient='index', columns=['frequency'])
    .rename_axis('word')
    .sort_values('frequency', ascending=False)
)

# Persist the corpus-wide frequency table
freq_df.to_csv('word_frequencies.csv')

# Create visualizations

# 1. Bar chart of the 20 most frequent words
top_words = freq_df.head(20)
top_fig, top_ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=top_words.index, y='frequency', data=top_words, ax=top_ax)
top_ax.set_title('Top 20 Most Frequent Words in FOMC Statements')
# Tilt the word labels so neighbouring ticks do not overlap
plt.setp(top_ax.get_xticklabels(), rotation=45)
top_fig.tight_layout()
top_fig.savefig('top_20_words.png')
plt.close(top_fig)

# 2. Word frequencies by market reaction
up_tokens = []
down_tokens = []

for text, reaction in zip(df['clean_text'], df['market_reaction_up_or_down']):
    tokens = preprocess_text(text)
    # NOTE(review): every label other than 'Up' (including missing values)
    # is counted as a down move -- confirm the column only holds 'Up'/'Down'.
    if reaction == 'Up':
        up_tokens.extend(tokens)
    else:
        down_tokens.extend(tokens)

up_freq = Counter(up_tokens)
down_freq = Counter(down_tokens)

# Union of each group's 20 most frequent words. A bare set has no stable
# order between interpreter runs, so sort by combined count (ties broken
# alphabetically) to make the bar order reproducible.
top_up_words = {word for word, _ in up_freq.most_common(20)}
top_down_words = {word for word, _ in down_freq.most_common(20)}
common_words = sorted(
    top_up_words | top_down_words,
    key=lambda word: (-(up_freq[word] + down_freq[word]), word),
)
comparison_df = pd.DataFrame({
    'Up': [up_freq.get(word, 0) for word in common_words],
    'Down': [down_freq.get(word, 0) for word in common_words],
}, index=common_words)

# Normalize by each group's token total. max(..., 1) guards an empty group:
# all of its counts are 0 then, so the shares come out as 0 instead of NaN.
up_total = max(sum(up_freq.values()), 1)
down_total = max(sum(down_freq.values()), 1)
comparison_df['Up'] = comparison_df['Up'] / up_total
comparison_df['Down'] = comparison_df['Down'] / down_total

# Plot comparison. DataFrame.plot() opens a brand-new figure unless it is
# given an Axes, which previously left the sized figure empty and unclosed;
# drawing on our own axes applies the figsize and lets us close the figure.
fig, ax = plt.subplots(figsize=(15, 8))
comparison_df.plot(kind='bar', ax=ax)
ax.set_title('Word Frequencies by Market Reaction (Normalized)')
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('market_reaction_comparison.png')
plt.close(fig)

# Add market reaction analysis to the frequency CSV
# Token totals are computed once rather than once per word (the per-word
# sum made this step quadratic in vocabulary size). max(..., 1) keeps an
# empty group from raising ZeroDivisionError; its shares are then all 0.
up_total = max(sum(up_freq.values()), 1)
down_total = max(sum(down_freq.values()), 1)

# Share of each group's tokens accounted for by every word in the table
freq_df['up_frequency'] = [up_freq.get(word, 0) / up_total for word in freq_df.index]
freq_df['down_frequency'] = [down_freq.get(word, 0) / down_total for word in freq_df.index]
# Positive values mean the word is relatively more common in 'Up' statements
freq_df['frequency_difference'] = freq_df['up_frequency'] - freq_df['down_frequency']
freq_df.to_csv('word_frequencies_with_market_reaction.csv')

print("Analysis complete! Check the generated CSV files and visualizations.")


[nltk_data] Downloading package punkt to /home/si295/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/si295/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/si295/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/si295/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Analysis complete! Check the generated CSV files and visualizations.


<Figure size 1500x800 with 0 Axes>