# Data Exploration Notebook

This notebook provides an initial exploration of the tweet dataset.

**Author**: Team

**Date**: 2024

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots created below.
sns.set_style('whitegrid')

## Load Data

In [None]:
# Read the cleaned tweet CSV into `df`, report how many rows were loaded,
# and show the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/processed/tweets_clean.csv')
print(f"Loaded {len(df)} tweets")
df.head(n=5)

## Basic Statistics

In [None]:
# Dataset info: print pandas' concise summary of `df`
# (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Summary statistics: descriptive statistics of `df`, shown as the cell output
# (by default pandas summarizes the numeric columns only).
df.describe()

## Temporal Analysis

In [None]:
# Parse the date column into datetimes so it can be grouped and plotted
# on a proper time axis.
df['date'] = pd.to_datetime(df['date'])

# Count tweets per calendar day. Normalizing to midnight first ensures that
# rows carrying a time-of-day component are bucketed by day rather than by
# exact timestamp; date-only values are unaffected by the normalization.
daily_volume = df.groupby(df['date'].dt.normalize()).size()

# Plot tweet volume over time
plt.figure(figsize=(14, 6))
plt.plot(daily_volume.index, daily_volume.values, linewidth=2)
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.title('Daily Tweet Volume')
plt.xticks(rotation=45)  # slanted tick labels so dates do not overlap
plt.tight_layout()
plt.show()

## Text Analysis

In [None]:
# Histogram of per-tweet word counts, followed by the mean and median.
# The whole cell is a no-op when the dataset has no 'word_count' column.
if 'word_count' in df:
    word_counts = df['word_count']

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(word_counts, bins=50, edgecolor='black')
    ax.set_xlabel('Word Count')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of Tweet Word Counts')
    plt.show()

    print(f"Average word count: {df['word_count'].mean():.1f}")
    print(f"Median word count: {df['word_count'].median():.1f}")

## Sample Tweets

In [None]:
# Display up to five randomly chosen tweets, each cut to 150 characters.
# Missing texts are dropped first so slicing never hits a NaN, and the
# sample size is capped at the number of available tweets so small
# datasets do not make `sample` raise.
tweet_texts = df['text'].dropna().astype(str)
sample_size = min(5, len(tweet_texts))

print("Sample tweets:")
for text in tweet_texts.sample(sample_size):
    # Only add an ellipsis when the tweet was actually truncated.
    suffix = "..." if len(text) > 150 else ""
    print(f"\n{text[:150]}{suffix}")

## Next Steps

1. Perform sentiment analysis
2. Identify topics/themes
3. Compare with polling data
4. Create visualizations