In [ ]:
# 01-exploration.ipynb

import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw press-release dump (a .jsonl file read by load_data).
# The path is relative, so it is resolved against the current working
# directory of the process running this notebook.
data_path = '../data/raw/doj_press_releases.jsonl'

def load_data(path):
    """Load a JSON Lines file into a list of decoded records.

    Each non-blank line of the file is parsed as one JSON document; lines
    that are empty or contain only whitespace are skipped.

    Args:
        path: Filesystem path of the JSON Lines file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number of the
            offending record; the position that follows in the message is
            relative to that single line.
    """
    records = []
    # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading BOM,
    # which json.loads would otherwise reject on the first record.
    with open(path, 'r', encoding='utf-8-sig') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which line of the file was malformed.
                raise json.JSONDecodeError(
                    f'{path}, line {line_no}: {err.msg}', err.doc, err.pos
                ) from err
    return records

# Parse every record from the raw file into memory.
press_releases = load_data(data_path)

# Convert to DataFrame for analysis
df = pd.DataFrame(press_releases)

# Display basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()
print(df.head())

# Press-release volume over time.
# The 'date' column is parsed in place, so everything after this point sees
# datetime values rather than the raw loaded ones.
# NOTE(review): the raw date format is not visible here; confirm that
# pd.to_datetime's defaults parse it as intended.
df['date'] = df['date'].pipe(pd.to_datetime)
plt.figure(figsize=(12, 6))
# histplot returns the Axes it drew on, so the title and both axis labels
# are applied to that object in a single call.
sns.histplot(data=df, x='date', bins=30, kde=True).set(
    title='Distribution of Press Release Dates',
    xlabel='Date',
    ylabel='Frequency',
)
plt.xticks(rotation=45)
plt.show()

# Entity frequency across all releases.
# explode() gives every element of a list-like cell its own row before the
# values are counted (assumes 'entities' cells hold lists of hashable
# values -- TODO confirm against the raw data).
entity_counts = (
    df['entities']
    .explode()
    .value_counts()
)
plt.figure(figsize=(12, 6))
# value_counts() sorts by descending count, so the first ten rows are the
# ten most frequent entities.
sns.barplot(
    x=entity_counts.head(10).index,
    y=entity_counts.head(10).to_numpy(),
).set(
    title='Top 10 Most Common Entities',
    xlabel='Entities',
    ylabel='Count',
)
plt.xticks(rotation=45)
plt.show()

# Further analysis can be added here
