# 01 â€“ Data Exploration and Collection

This notebook loads the keyword dataset, selects one event (Battle for Kyiv),
and collects a sample of Reddit comments related to that event using the 
Pushshift API. The goal is to build a small but relevant dataset that we can 
use later for preprocessing, sentiment analysis, and topic modeling.


In [1]:
import pandas as pd
import requests
import time
from datetime import datetime


In [3]:
# Load keyword dataset
keywords_path = "../data/processed/ukraine_project_keywords_events1to5.csv"
keywords_df = pd.read_csv(keywords_path)

keywords_df.head()


Unnamed: 0,Event,Category,Keyword
0,Event 1 - Battle for Kyiv,Proper Nouns,Kyiv
1,Event 1 - Battle for Kyiv,Proper Nouns,Kiev
2,Event 1 - Battle for Kyiv,Proper Nouns,Ukraine
3,Event 1 - Battle for Kyiv,Proper Nouns,Ukraine war
4,Event 1 - Battle for Kyiv,Proper Nouns,Russia


In [5]:
def get_event_keywords(event_name: str) -> list:
    """
    Return a list of unique keywords for a given event name.
    Example event_name: 'Event 1 - Battle for Kyiv'
    """
    subset = keywords_df[keywords_df["Event"] == event_name]
    return sorted(subset["Keyword"].dropna().unique().tolist())

event1_name = "Event 1 - Battle for Kyiv"
event1_keywords = get_event_keywords(event1_name)
event1_keywords


['64km convoy',
 'Ghost of Kyiv',
 'Kharkiv',
 'Kiev',
 'Kyiv',
 'Kyiv under attack',
 'Mariupol',
 'NATO',
 'Putin',
 'Russia',
 'Russia Ukraine',
 'Russia Ukraine news',
 'Russia nuclear',
 'Russia protest',
 'Russia sanctions',
 'Russian assault',
 'Russian convoy',
 'Russian economy',
 'Russian forces',
 'Russian invasion',
 'U.S. support',
 'Ukraine',
 'Ukraine Fox News',
 'Ukraine war',
 'Vladimir Putin',
 'Volodymyr Zelensky',
 'Western response',
 'Zelensky',
 'air raid',
 'airstrike',
 'battle for Kyiv',
 'bombardment',
 'casualties',
 'convoy',
 'death toll',
 'economic sanctions',
 'evacuation',
 'evacuees',
 'fleeing',
 'full-scale invasion',
 'ground forces',
 'humanitarian crisis',
 'hybrid war',
 'injured',
 'invasion',
 'missile strike',
 'offensive',
 'panic',
 'refugees',
 'sanctions',
 'shelling',
 'tanks']

In [7]:
def to_unix_timestamp(date_str: str) -> int:
    """
    Convert a date string 'YYYY-MM-DD' to a Unix timestamp (int).
    """
    dt = datetime.strptime(date_str, "%Y-%m-%d")
    return int(dt.timestamp())

# Example for Event 1 window
start_date = "2022-02-20"
end_date   = "2022-03-20"

start_ts = to_unix_timestamp(start_date)
end_ts   = to_unix_timestamp(end_date)

start_ts, end_ts


(1645340400, 1647756000)

In [13]:
def fetch_reddit_comments_for_keyword(keyword: str,
                                      after_ts: int,
                                      before_ts: int,
                                      size: int = 100) -> pd.DataFrame:
    """
    Fetch a small sample of Reddit comments containing the keyword
    within the given time window [after_ts, before_ts].
    Uses Pushshift Reddit comment search.
    """
    url = "https://api.pushshift.io/reddit/comment/search/"
    params = {
        "q": keyword,
        "after": after_ts,
        "before": before_ts,
        "size": size,
        "lang": "en",
        "subreddit": "politics"
    }
    print(f"Fetching comments for keyword: {keyword}")
    r = requests.get(url, params=params)
    if r.status_code != 200:
        print("Request failed with status:", r.status_code)
        return pd.DataFrame()
    
    data = r.json().get("data", [])
    if not data:
        return pd.DataFrame()
    
    df = pd.DataFrame(data)
    # Keep only useful columns for now
    keep_cols = ["id", "created_utc", "subreddit", "body", "score", "author"]
    df = df[keep_cols]
    df["keyword"] = keyword
    return df


In [15]:
sample_keywords = event1_keywords[:8]  # first 8 keywords for now

all_dfs = []

for kw in sample_keywords:
    df_kw = fetch_reddit_comments_for_keyword(
        keyword=kw,
        after_ts=start_ts,
        before_ts=end_ts,
        size=100  # up to 100 comments per keyword
    )
    if not df_kw.empty:
        all_dfs.append(df_kw)
    # be kind to the API
    time.sleep(1)

if all_dfs:
    event1_comments = pd.concat(all_dfs, ignore_index=True)
else:
    event1_comments = pd.DataFrame()

event1_comments.head()


Fetching comments for keyword: 64km convoy
Request failed with status: 403
Fetching comments for keyword: Ghost of Kyiv
Request failed with status: 403
Fetching comments for keyword: Kharkiv
Request failed with status: 403
Fetching comments for keyword: Kiev
Request failed with status: 403
Fetching comments for keyword: Kyiv
Request failed with status: 403
Fetching comments for keyword: Kyiv under attack
Request failed with status: 403
Fetching comments for keyword: Mariupol
Request failed with status: 403
Fetching comments for keyword: NATO
Request failed with status: 403
