# Echo Chamber Index (ECI) Analysis

## Project Overview  

This project measures the presence of echo chambers in an online community.  
We build a user–user graph from interactions, apply sentiment analysis, and compute an **Echo Chamber Index (ECI)** using network metrics (homogeneity, assortativity, modularity, sentiment skew).  
Visualisations highlight community structure and sentiment patterns.

## Notebook 1: Data Collection & Sentiment Analysis

In [87]:
import praw
import pandas as pd
import numpy as np
import emoji
import unicodedata

from transformers import AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer, util
import spacy
import torch
import matplotlib.pyplot as plt

from tqdm import tqdm


reddit = praw.Reddit("project")
header = {"User-Agent": "my-bot/0.0.1 by u/Karmz0a"}

### Extracting Reddit Comment Data (Ignoring Stickied & Empty Posts)

In [88]:
topic = input()

subreddit = reddit.subreddit(topic)
count = 200 if subreddit.subscribers < 1000000 else 100

non_sticky = []
for submission in subreddit.hot(limit = (count + 50)):
    if submission.stickied or submission.num_comments == 0: #Filters pinned posts (usually not about the subreddit topic)
        continue
    else:
        non_sticky.append(submission)
        if len(non_sticky) == count:
            break

data = []
for submission in non_sticky:
    
    submission.comments.replace_more(limit = None)
    commentList = submission.comments.list()
    for comment in commentList:
        data.append({
            "author": str(comment.author) if comment.author else "[deleted]",
            "original post" : submission.id,
            "upvotes": comment.score,
            "body": emoji.demojize(comment.body, delimiters=(":", ":")).replace("\n", " ").replace("\r", " "),
        })

### Data Cleaning & log scaling upvotes

In [89]:
commentFrame = pd.DataFrame(data)

commentFrame = commentFrame[(commentFrame["author"] != "AutoModerator") & 
                            (commentFrame["author"].notna()) &
                            (commentFrame['author'] != "[deleted]") &
                            (commentFrame["body"] != "[removed]")
                            ].drop_duplicates()

commentFrame["body"] = commentFrame["body"].apply(lambda x: unicodedata.normalize("NFC", x ))

conditions = [
    commentFrame["upvotes"] == 0,
    commentFrame["upvotes"] > 0,
    commentFrame["upvotes"] < 0
]
choices = [
    0,
    np.log10(commentFrame["upvotes"]),
    -np.log10(-commentFrame["upvotes"])
]

commentFrame["upvoteScale"] = np.select(conditions, choices, default = np.nan)



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


### Using Spacy to find relevant topics discussed

In [90]:
if torch.cuda.is_available():
    device = torch.device("cuda")   # NVIDIA GPU
elif torch.backends.mps.is_available():
    device = torch.device("mps")    # Apple Silicon GPU (Metal Performance Shaders)
else:
    device = torch.device("cpu")    # fallback
nlp = spacy.load("en_core_web_sm", disable= ["parser", "tagger"])
embedder = SentenceTransformer("all-MiniLM-L6-v2", device = device)

spacy_entities = ["ORG", "PRODUCT", "PERSON", "WORK_OF_ART", "NORP", "GPE", "LOC", "LAW", "EVENT", "FAC"] # make this multiple choice for the user
entity = []
for doc in nlp.pipe(commentFrame["body"], batch_size= 100, n_process=-1):
    for ent in doc.ents:
        entity.append((ent.text, ent.label_))
        


df_entities = pd.DataFrame(entity, columns=["entity", "label"])
e = df_entities[df_entities["label"].isin(spacy_entities)].value_counts().head(18)
top_ent = [idx[0] for idx in e.index]






### Filtering out comments not relevant to the subreddits topics

In [91]:
comments = commentFrame["body"].tolist()
query_emb =  embedder.encode(top_ent, normalize_embeddings= True)
comment_emb = embedder.encode(comments, normalize_embeddings= True, show_progress_bar= True, batch_size= 64)

relevance = embedder.similarity(query_emb, comment_emb)
best_score = torch.max(relevance, dim = 0).values.tolist()

relevance = embedder.similarity(query_emb, comment_emb)
best_score = torch.max(relevance, dim = 0).values.tolist()
commentFrame["relevance"] =  best_score

cutoff = np.percentile(commentFrame["relevance"], 10)
commentFrame = commentFrame[commentFrame["relevance"] > cutoff]

Batches: 100%|██████████| 224/224 [00:15<00:00, 14.13it/s]


### Sentiment Analysis

In [92]:
comments = commentFrame["body"].tolist()
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")  
classifier = pipeline("sentiment-analysis", model = "AG6019/reddit-comment-sentiment-final", device = device)

label_map = {"LABEL_0": -1, "LABEL_1": 1}

batch_size = 64 if device.type != "cpu" else 8
results = []

for i in tqdm(range(0, len(comments), batch_size), desc = "sentiment analysis"):
    batch = comments[i:i+batch_size]
    sentiment = classifier(batch, truncation = True, max_length = 128)
    results.extend(sentiment)

commentFrame["sentimentLabel"] = [0 if r["score"] < 0.85 else label_map[r["label"]] for r in results]
commentFrame["sentimentScore"] = [r["score"] for r in results]


Device set to use mps
sentiment analysis: 100%|██████████| 201/201 [02:27<00:00,  1.36it/s]


Saving Dataframe to use in other notebooks

In [93]:
commentFrame.to_pickle("comments.pkl")