# Imports & Setup

In [None]:
!pip install tensorflow



In [None]:
import os
import re
import json
import pickle
from datetime import datetime
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

import networkx as nx
import matplotlib.pyplot as plt

import gensim
from gensim import corpora, models

from transformers import pipeline
from textblob import TextBlob

import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import shutil

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Mount Google Drive in Colab.
# The data and output paths configured later in this notebook live under
# '/content/drive/My Drive', so this cell must run before any of them are used.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Helper Functions

In [None]:
#########################
# Preprocessing Functions
#########################

# Lazily-built cache of NLTK's English stop words. Building it reads the corpus
# from disk, so it is done once instead of on every clean_text() call.
_ENGLISH_STOP_WORDS = None


def clean_text(text):
    """
    Normalize text for bag-of-words style processing.

    Steps:
      1. Lowercase the text.
      2. Replace every run of non-word characters (regex ``\\W+``) with a
         single space. Letters, digits and underscores are kept.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop English stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    normalized = re.sub(r'\W+', ' ', text.lower())
    kept_tokens = [tok for tok in word_tokenize(normalized)
                   if tok not in _ENGLISH_STOP_WORDS]
    return " ".join(kept_tokens)


def clean_for_sentiment(text):
    """
    Minimal cleaning for sentiment analysis:
      - Remove HTML tags.
      - Remove URLs (``http://...``, ``https://...`` and ``www....``).
      - Collapse runs of whitespace and trim the ends.

    Punctuation, casing and stop words are deliberately left untouched.

    Args:
        text: Raw text. ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""
    text = re.sub(r'<[^>]+>', '', text)
    # Anchor on a word boundary and require the "://" or "www." part so that
    # ordinary words such as "awww" or "httpd" are not mistaken for URLs.
    text = re.sub(r'\b(?:https?://|www\.)\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
#########################
# Model Setup
#########################

# Use the first GPU if one is available; otherwise fall back to CPU (device=-1).
# Hard-coding device=0 makes the pipeline constructor fail on a CPU-only runtime.
try:
    import torch
    _pipeline_device = 0 if torch.cuda.is_available() else -1
except ImportError:
    # PyTorch is not installed: fall back to TensorFlow's view of the hardware.
    _pipeline_device = 0 if tf.config.list_physical_devices("GPU") else -1

zero_shot_classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=_pipeline_device,
)

def classify_ideology(text):
    """
    Use zero-shot classification to assign an ideology label to one text.

    Candidate labels are "pro-Israeli", "pro-Palestinian" and "neutral".

    Args:
        text: The text to classify.

    Returns:
        The highest-scoring candidate label. "neutral" is returned when
        `text` is not a string, is empty or whitespace-only, or when the
        classifier raises (the error is printed, not propagated).
    """
    # Missing values (e.g. JSON null) and blank strings carry no signal.
    if not isinstance(text, str) or not text.strip():
        return "neutral"
    candidate_labels = ["pro-Israeli", "pro-Palestinian", "neutral"]
    try:
        result = zero_shot_classifier(text, candidate_labels)
        return result['labels'][0]
    except Exception as e:
        # Best effort: one failing text should not abort a long run.
        print("Classification error:", e)
        return "neutral"

def get_sentiment(text):
    """
    Score the sentiment polarity of `text` with TextBlob.

    Returns:
        TextBlob's polarity score in the range [-1, 1]. If TextBlob raises,
        the error is printed and 0.0 is returned instead.
    """
    try:
        polarity = TextBlob(text).sentiment.polarity
    except Exception as err:
        print("Sentiment error:", err)
        polarity = 0.0
    return polarity


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
#########################
# Data Loading Function
#########################

def load_json(file_path):
    """Read the UTF-8 encoded JSON file at `file_path` and return the parsed object."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [None]:
# Smoke-test the zero-shot classification pipeline on two hand-written examples.
test_texts = [
    "Israel has a strong right to defend itself in a challenging geopolitical climate.",
    "Palestinians have endured decades of oppression and deserve justice."
]
# Must match the labels used by the processing functions. The third label was
# previously misspelled, so the model was scored against a nonsense class.
candidate_labels = ["pro-Israeli", "pro-Palestinian", "neutral"]

# Passing a list classifies all test texts in a single pipeline call.
results = zero_shot_classifier(test_texts, candidate_labels=candidate_labels)

for text, res in zip(test_texts, results):
    print(f"Text: {text}")
    print(f"Predicted Label: {res['labels'][0]}")
    # Scores are ordered like res['labels'] (best first), not like candidate_labels.
    print(f"Scores: {res['scores']}")
    print("-" * 40)


Text: Israel has a strong right to defend itself in a challenging geopolitical climate.
Predicted Label: pro-Israeli
Scores: [0.8635569214820862, 0.12500016391277313, 0.011442952789366245]
----------------------------------------
Text: Palestinians have endured decades of oppression and deserve justice.
Predicted Label: pro-Palestinian
Scores: [0.8346067070960999, 0.1417410522699356, 0.023652270436286926]
----------------------------------------


# Constants and Configuration

In [None]:
#########################
# Configuration: months to process and input/output locations
#########################

# Months to process, September 2023 through October 2024, as "YYYY-MM" strings.
# Each string is interpolated into the per-month input and output file names.
months = ["2023-09", "2023-10", "2023-11", "2023-12", "2024-01", "2024-02",
          "2024-03", "2024-04", "2024-05", "2024-06", "2024-07", "2024-08",
          "2024-09", "2024-10"]

# Directory holding the per-month input JSON files (in Google Drive).
# Requires Drive to be mounted at /content/drive.
data_dir = '/content/drive/My Drive/IsraelPalestine subreddit data/processed_data'


output_dir = '/content/time_series_analysis'  # Local output in Colab
drive_output_dir = '/content/drive/My Drive/time_series_analysis'  # Also save to Drive; checked first as a cache of processed months

# Create both output directories up front; exist_ok=True makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(drive_output_dir, exist_ok=True)

# Process Monthly Data

In [None]:
def process_submissions_month(month_str, data_dir, batch_size=32):
    """
    Process one month's submissions from its ``*_full_data.json`` file.

    For each submission with non-empty text:
      - Build the raw text as ``title + " " + selftext``.
      - Compute sentiment on the minimally cleaned text.
      - Record metadata (author, created_date, score, ...).
    All kept texts are then classified in one batched zero-shot call using the
    candidate labels "pro-Israeli", "pro-Palestinian" and "neutral".

    Args:
        month_str: Month as "YYYY-MM"; used to build the input file name.
        data_dir: Directory containing ``IsraelPalestine_{month_str}_full_data.json``.
        batch_size: Batch size forwarded to the zero-shot pipeline.

    Returns:
        A list of record dicts with keys "id", "author", "author_fullname",
        "type", "text", "ideology", "sentiment", "created_date" (a
        ``datetime.date`` or ``None`` if unparsable), "score", "num_comments"
        and "upvote_ratio". An empty list is returned when the input file does
        not exist.
    """
    file_path = os.path.join(data_dir, f"IsraelPalestine_{month_str}_full_data.json")
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    full_data = load_json(file_path)

    submissions = []
    for sub_id, sub_dict in full_data.items():
        submission = sub_dict.get("submission") or {}
        # `or ""` guards against explicit JSON nulls (for example a post without
        # body text), which would otherwise raise TypeError on concatenation.
        title = submission.get("title") or ""
        selftext = submission.get("selftext") or ""
        text = f"{title} {selftext}".strip()
        if not text:
            continue

        # Raw text is kept for classification; sentiment uses the cleaned text.
        sentiment = get_sentiment(clean_for_sentiment(text))
        try:
            created_date = datetime.strptime(
                submission.get("created_utc", ""), '%Y-%m-%d %H:%M:%S'
            ).date()
        except (TypeError, ValueError):
            # Missing, non-string or differently formatted timestamp.
            created_date = None

        submissions.append({
            "id": sub_id,
            "author": submission.get("author", None),
            "author_fullname": submission.get("author_fullname", None),
            "type": "submission",
            "text": text,
            "ideology": None,  # filled in by the batched classification below
            "sentiment": sentiment,
            "created_date": created_date,
            "score": submission.get("score", 0),
            "num_comments": submission.get("num_comments", 0),
            "upvote_ratio": submission.get("upvote_ratio", 0)
        })

    # Batched zero-shot classification for ideology. Every kept record has
    # non-empty text, so records and texts line up one-to-one.
    if submissions:
        candidate_labels = ["pro-Israeli", "pro-Palestinian", "neutral"]
        texts_to_classify = [record["text"] for record in submissions]
        try:
            results = zero_shot_classifier(texts_to_classify,
                                           candidate_labels=candidate_labels,
                                           batch_size=batch_size)
        except Exception as e:
            print("Ideology classification error:", e)
            print(f"Labelling all {len(texts_to_classify)} submissions of {month_str} as 'neutral'.")
            results = [{"labels": ["neutral"]} for _ in texts_to_classify]

        # Defensive: some pipeline versions return a bare dict for a single input.
        if isinstance(results, dict):
            results = [results]

        for record, res in zip(submissions, results):
            record["ideology"] = res['labels'][0] if res.get('labels') else "neutral"

    return submissions

In [None]:
#########################
# Process All Months (Saving Data After Each Month)
#########################
def process_all_months_submissions(data_dir, months, output_dir, drive_output_dir, save_each_month=True):
    """
    Process all months and aggregate their submission records.

    For each month, a processed file in `drive_output_dir` is treated as a
    cache: if it exists it is loaded (and copied to `output_dir` if missing
    there). Otherwise the month is processed from the raw data in `data_dir`
    and, when `save_each_month` is true, written to both output directories.
    Months with neither a cached file nor a raw data file are skipped.

    Args:
        data_dir: Directory with the raw ``IsraelPalestine_{month}_full_data.json`` files.
        months: Iterable of "YYYY-MM" strings.
        output_dir: Local output directory.
        drive_output_dir: Drive output directory; also the cache location.
        save_each_month: Whether to write freshly processed months to disk.

    Returns:
        all_contributions: List of processed submission records across months.
        Records are JSON-normalized (for example, dates are strings), so the
        result is the same whether a month came from the cache or was just
        processed.
    """
    all_contributions = []

    for month_str in tqdm(months, desc="Processing months"):
        tqdm.write(f"Processing month: {month_str}")

        # Expected output file paths for this month, local and on Drive.
        contrib_filename = f"processed_contributions_{month_str}.json"
        contrib_file_local = os.path.join(output_dir, contrib_filename)
        contrib_file_drive = os.path.join(drive_output_dir, contrib_filename)

        if os.path.exists(contrib_file_drive):
            # Cached result available: the raw data is not needed at all.
            with open(contrib_file_drive, "r", encoding="utf-8") as f:
                contributions = json.load(f)
            tqdm.write(f"Loaded processed data from Drive for month: {month_str}")

            # Mirror the cached file into the local output directory.
            if not os.path.exists(contrib_file_local):
                shutil.copy(contrib_file_drive, contrib_file_local)
        else:
            full_file = os.path.join(data_dir, f"IsraelPalestine_{month_str}_full_data.json")
            if not os.path.exists(full_file):
                tqdm.write(f"File not found: {full_file}")
                continue

            contributions = process_submissions_month(month_str, data_dir)
            # Round-trip through JSON so in-memory records have exactly the
            # types a later cached load would produce (e.g. date -> str).
            contributions = json.loads(json.dumps(contributions, default=str))
            tqdm.write(f"Processed data for month: {month_str}")

            if save_each_month:
                for out_path in (contrib_file_local, contrib_file_drive):
                    with open(out_path, "w", encoding="utf-8") as f:
                        json.dump(contributions, f, default=str, indent=2)
                tqdm.write(f"Updated and saved data for month: {month_str}")

        all_contributions.extend(contributions)

    return all_contributions

In [None]:
# Build the submission records for every configured month, reusing any month
# that already has a processed file on Drive and saving the rest as they finish.
all_contributions = process_all_months_submissions(
    data_dir=data_dir,
    months=months,
    output_dir=output_dir,
    drive_output_dir=drive_output_dir,
    save_each_month=True,
)

Processing months:   0%|          | 0/14 [00:00<?, ?it/s]

Processing month: 2023-09


Processing months:   7%|▋         | 1/14 [00:01<00:24,  1.85s/it]

Loaded processed data from Drive for month: 2023-09
Processing month: 2023-10


Processing months:  14%|█▍        | 2/14 [00:02<00:15,  1.29s/it]

Loaded processed data from Drive for month: 2023-10
Processing month: 2023-11


Processing months:  21%|██▏       | 3/14 [00:03<00:11,  1.06s/it]

Loaded processed data from Drive for month: 2023-11
Processing month: 2023-12


Processing months:  29%|██▊       | 4/14 [00:04<00:08,  1.17it/s]

Loaded processed data from Drive for month: 2023-12
Processing month: 2024-01


Processing months:  36%|███▌      | 5/14 [00:04<00:06,  1.33it/s]

Loaded processed data from Drive for month: 2024-01
Processing month: 2024-02


Processing months:  43%|████▎     | 6/14 [00:05<00:05,  1.47it/s]

Loaded processed data from Drive for month: 2024-02
Processing month: 2024-03


Processing months:  50%|█████     | 7/14 [00:05<00:04,  1.58it/s]

Loaded processed data from Drive for month: 2024-03
Processing month: 2024-04


Processing months:  57%|█████▋    | 8/14 [00:06<00:03,  1.74it/s]

Loaded processed data from Drive for month: 2024-04
Processing month: 2024-05


Processing months:  64%|██████▍   | 9/14 [00:06<00:02,  1.77it/s]

Loaded processed data from Drive for month: 2024-05
Processing month: 2024-06


Processing months:  71%|███████▏  | 10/14 [00:07<00:02,  1.74it/s]

Loaded processed data from Drive for month: 2024-06
Processing month: 2024-07


Processing months:  79%|███████▊  | 11/14 [00:07<00:01,  1.76it/s]

Loaded processed data from Drive for month: 2024-07
Processing month: 2024-08


Processing months:  86%|████████▌ | 12/14 [00:08<00:01,  1.93it/s]

Loaded processed data from Drive for month: 2024-08
Processing month: 2024-09


Processing months:  93%|█████████▎| 13/14 [00:08<00:00,  1.95it/s]

Loaded processed data from Drive for month: 2024-09
Processing month: 2024-10


Processing months: 100%|██████████| 14/14 [00:09<00:00,  1.52it/s]

Loaded processed data from Drive for month: 2024-10





In [None]:
# Write the combined records for all months to Drive as one JSON file.
# default=str stringifies any value json cannot encode natively.
aggregated_json_path = os.path.join(drive_output_dir, "processed_contributions.json")
with open(aggregated_json_path, "w", encoding="utf-8") as f:
    json.dump(all_contributions, f, default=str, indent=2)

# Same records as a flat table, saved next to the JSON file as CSV.
aggregated_csv_path = os.path.join(drive_output_dir, "processed_contributions.csv")
df = pd.DataFrame(all_contributions)
df.to_csv(aggregated_csv_path, index=False)

# Process Monthly Social Networks

In [None]:
def build_social_network_from_user_stats(month_str, drive_dir, batch_size=32):
    """
    Build a social network graph for a month using preprocessed user statistics
    and user activity files from drive_dir.

    Expected files:
      - IsraelPalestine_{month_str}_user_stats.json
      - IsraelPalestine_{month_str}_user_activity.json

    For each user, all texts (submission title + selftext, and comment bodies)
    are joined into one string. Overall sentiment is computed on that string,
    and overall ideology is predicted with batched zero-shot classification
    over the candidate labels ["pro-Israeli", "pro-Palestinian", "neutral"].

    Every node in the returned graph carries these attributes:
      - avg_sentiment: Sentiment of the aggregated text (0 if the user has no text).
      - ideology: Predicted label ("neutral" if the user has no text).
      - total_submissions: As given in the user stats (0 if the user is absent there).
      - total_comments: As given in the user stats (0 if the user is absent there).

    Edges are created between users if one commented on another's submission.
    (This code assumes that in the user activity file, each submission record includes a
     list "comment_authors" with the usernames of those who commented on that submission.)
    The edge attribute "weight" counts such interactions.

    Args:
        month_str: Month as "YYYY-MM"; used to build the input file names.
        drive_dir: Directory containing the two input files.
        batch_size: Batch size forwarded to the zero-shot pipeline.

    Returns:
      - G: The social network graph (undirected) for the month, or None when
        either input file is missing.
    """
    # Construct file paths:
    stats_file = os.path.join(drive_dir, f"IsraelPalestine_{month_str}_user_stats.json")
    activity_file = os.path.join(drive_dir, f"IsraelPalestine_{month_str}_user_activity.json")

    if not os.path.exists(stats_file) or not os.path.exists(activity_file):
        print(f"User stats or activity file not found for month: {month_str}")
        return None

    user_stats = load_json(stats_file)
    user_activity = load_json(activity_file)

    # Initialize the graph with nodes from user_stats.
    G = nx.Graph()
    for user, stats in user_stats.items():
        G.add_node(user,
                   total_submissions=stats.get("total_submissions", 0),
                   total_comments=stats.get("total_comments", 0),
                   avg_sentiment=None,
                   ideology=None)

    # Aggregate texts per user from user_activity.
    # `or ""` / `or []` guard against explicit JSON nulls.
    aggregated_text = {}
    for user, activities in user_activity.items():
        texts = []
        for sub in activities.get("submissions") or []:
            title = sub.get("title") or ""
            selftext = sub.get("selftext") or ""
            combined = f"{title} {selftext}".strip()
            if combined:
                texts.append(combined)
        for comm in activities.get("comments") or []:
            body = (comm.get("body") or "").strip()
            if body:
                texts.append(body)
        if texts:
            aggregated_text[user] = " ".join(texts)

    # Compute overall sentiment for each user.
    user_sentiments = {
        user: get_sentiment(clean_for_sentiment(text))
        for user, text in aggregated_text.items()
    }

    # Batched zero-shot classification for ideology using aggregated texts.
    # Every aggregated text is non-empty by construction.
    candidate_labels = ["pro-Israeli", "pro-Palestinian", "neutral"]
    user_list = list(aggregated_text.keys())
    texts_list = [aggregated_text[user] for user in user_list]
    user_ideology = {}

    if texts_list:
        try:
            results = zero_shot_classifier(texts_list, candidate_labels=candidate_labels, batch_size=batch_size)
        except Exception as e:
            print("Zero-shot classification error on aggregated texts:", e)
            results = [{"labels": ["neutral"]} for _ in texts_list]
        # Defensive: some pipeline versions return a bare dict for a single input.
        if isinstance(results, dict):
            results = [results]
        for user, res in zip(user_list, results):
            labels = res.get("labels") or ["neutral"]
            user_ideology[user] = labels[0]

    # Build edges based on interactions.
    # For each submission in user_activity, assume it includes a list "comment_authors".
    for user, activities in user_activity.items():
        for sub in activities.get("submissions") or []:
            for commenter in sub.get("comment_authors") or []:
                # networkx rejects None as a node; self-comments are not interactions.
                if commenter is None or commenter == user:
                    continue
                if G.has_edge(user, commenter):
                    G[user][commenter]["weight"] += 1
                else:
                    # May implicitly create nodes for users absent from user_stats.
                    G.add_edge(user, commenter, weight=1)

    # Set node attributes only after the edges exist, so that nodes created
    # implicitly by add_edge get the full attribute set too.
    for user in G.nodes():
        node_attrs = G.nodes[user]
        node_attrs.setdefault("total_submissions", 0)
        node_attrs.setdefault("total_comments", 0)
        node_attrs["avg_sentiment"] = user_sentiments.get(user, 0)
        node_attrs["ideology"] = user_ideology.get(user, "neutral")

    return G

In [None]:
#########################
# Process All Months (Saving Data After Each Month)
#########################
def process_all_months_networks(data_dir, months, output_dir, drive_output_dir, save_each_month=True):
    """
    Process all months, building monthly social networks.

    For each month, a pickled graph in `drive_output_dir` is treated as a
    cache: if it exists it is loaded (and copied to `output_dir` if missing
    there). Otherwise the graph is built from the files in `data_dir` and,
    when `save_each_month` is true, pickled to both output directories.

    A month whose graph cannot be built (the builder returns None because its
    input files are missing) is skipped. Nothing is saved for it, so a later
    run can retry instead of loading a cached None.

    Args:
        data_dir: Directory with the per-month user stats / activity files.
        months: Iterable of "YYYY-MM" strings.
        output_dir: Local output directory.
        drive_output_dir: Drive output directory; also the cache location.
        save_each_month: Whether to write freshly built graphs to disk.

    Returns:
      - monthly_networks: Dictionary mapping month_str -> social network graph,
        containing only the months for which a graph is available.
    """
    monthly_networks = {}

    for month_str in tqdm(months, desc="Processing months"):
        tqdm.write(f"Processing month: {month_str}")

        # Expected output file paths for this month, local and on Drive.
        network_filename = f"social_network_{month_str}.gpickle"
        network_file_local = os.path.join(output_dir, network_filename)
        network_file_drive = os.path.join(drive_output_dir, network_filename)

        G = None
        if os.path.exists(network_file_drive):
            # NOTE(security): pickle.load can execute arbitrary code. Only load
            # cache files that this notebook wrote itself.
            with open(network_file_drive, "rb") as f:
                G = pickle.load(f)
            if G is None:
                # Stale cache left by a run whose inputs were missing.
                tqdm.write(f"Cached network is empty, rebuilding month: {month_str}")
            else:
                tqdm.write(f"Loaded processed data from Drive for month: {month_str}")
                # Mirror the cached file into the local output directory.
                if not os.path.exists(network_file_local):
                    shutil.copy(network_file_drive, network_file_local)

        if G is None:
            G = build_social_network_from_user_stats(month_str, data_dir, batch_size=32)
            if G is None:
                tqdm.write(f"No network could be built, skipping month: {month_str}")
                continue
            tqdm.write(f"Processed data for month: {month_str}")

            if save_each_month:
                for out_path in (network_file_local, network_file_drive):
                    with open(out_path, "wb") as f:
                        pickle.dump(G, f)
                tqdm.write(f"Updated and saved data for month: {month_str}")

        monthly_networks[month_str] = G

    return monthly_networks

In [None]:
# Build the social network for every configured month, reusing any month
# that already has a pickled graph on Drive and saving the rest as they finish.
monthly_networks = process_all_months_networks(
    data_dir=data_dir,
    months=months,
    output_dir=output_dir,
    drive_output_dir=drive_output_dir,
    save_each_month=True,
)

Processing months:   0%|          | 0/14 [00:00<?, ?it/s]

Processing month: 2023-09


Processing months:   7%|▋         | 1/14 [00:01<00:16,  1.29s/it]

Loaded processed data from Drive for month: 2023-09
Processing month: 2023-10


Processing months:  14%|█▍        | 2/14 [00:02<00:13,  1.11s/it]

Loaded processed data from Drive for month: 2023-10
Processing month: 2023-11


Processing months:  21%|██▏       | 3/14 [00:03<00:10,  1.00it/s]

Loaded processed data from Drive for month: 2023-11
Processing month: 2023-12


Processing months:  29%|██▊       | 4/14 [2:04:01<8:09:32, 2937.25s/it]

Processed data for month: 2023-12
Updated and saved data for month: 2023-12
Processing month: 2024-01


Processing months:  36%|███▌      | 5/14 [3:24:59<9:04:27, 3629.74s/it]

Processed data for month: 2024-01
Updated and saved data for month: 2024-01
Processing month: 2024-02


Processing months:  43%|████▎     | 6/14 [4:35:54<8:32:18, 3842.26s/it]

Processed data for month: 2024-02
Updated and saved data for month: 2024-02
Processing month: 2024-03


Processing months:  50%|█████     | 7/14 [5:50:17<7:51:56, 4045.20s/it]

Processed data for month: 2024-03
Updated and saved data for month: 2024-03
Processing month: 2024-04


Processing months:  57%|█████▋    | 8/14 [7:08:28<7:05:04, 4250.80s/it]

Processed data for month: 2024-04
Updated and saved data for month: 2024-04
Processing month: 2024-05


Processing months:  64%|██████▍   | 9/14 [8:33:32<6:16:28, 4517.75s/it]

Processed data for month: 2024-05
Updated and saved data for month: 2024-05
Processing month: 2024-06


Processing months:  71%|███████▏  | 10/14 [9:38:52<4:48:52, 4333.16s/it]

Processed data for month: 2024-06
Updated and saved data for month: 2024-06
Processing month: 2024-07


Processing months:  79%|███████▊  | 11/14 [10:29:40<3:16:59, 3939.82s/it]

Processed data for month: 2024-07
Updated and saved data for month: 2024-07
Processing month: 2024-08


Processing months:  86%|████████▌ | 12/14 [11:22:44<2:03:39, 3709.80s/it]

Processed data for month: 2024-08
Updated and saved data for month: 2024-08
Processing month: 2024-09


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing months:  93%|█████████▎| 13/14 [12:23:45<1:01:35, 3695.17s/it]

Processed data for month: 2024-09
Updated and saved data for month: 2024-09
Processing month: 2024-10


Processing months: 100%|██████████| 14/14 [13:34:05<00:00, 3489.00s/it]

Processed data for month: 2024-10
Updated and saved data for month: 2024-10





In [None]:
# Save the monthly networks to the local output directory (they are normally
# already saved per month by process_all_months_networks).
# nx.write_gpickle was removed in NetworkX 3.0; plain pickle is its replacement
# and matches the pickle.load used when the cached graphs are read back.
for month, G in monthly_networks.items():
    if G is None:
        # No graph was available for this month; nothing to save.
        continue
    with open(os.path.join(output_dir, f"social_network_{month}.gpickle"), "wb") as f:
        pickle.dump(G, f)

In [None]:
# The folder name must match the configured output directories ('time_series_analysis').
print("Preprocessing complete. Processed contributions & monthly networks are saved in the 'time_series_analysis' folder and ready for further analysis.")