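# Checking Context Lengths

Builds one text thread per Reddit post (title, content and all of its comments) for the Titans, NSC and Preds datasets, and reports the largest whitespace-delimited word count found in each.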

In [1]:
## Load libraries
import os
import warnings
import pandas as pd
import re
import random
import cleantext
import time

# Suppress every warning category for the rest of the session. This also hides
# any warnings pandas may raise while reading the CSV files below.
warnings.filterwarnings("ignore")

## Titans

In [2]:
## Load data
# Cleaned Reddit exports for the Titans dataset: one CSV of posts, one of comments.
tt_posts, tt_comments = (
    pd.read_csv('../Reddit Data/tt_posts_clean.csv'),
    pd.read_csv('../Reddit Data/tt_comments_clean.csv'),
)

In [14]:
## Load helper functions
class Post:
    """A Reddit post: its ID, title and body text, plus a list for comments."""

    def __init__(self, post_id, title, content):
        # Identity and text of the post, stored exactly as passed in.
        self.post_id, self.title, self.content = post_id, title, content
        # Fresh list per instance; nothing in this notebook appends to it.
        self.comments = list()

class Comment:
    """A Reddit comment together with its links into the reply tree."""

    def __init__(self, comment_id, text, post_id):
        self.comment_id, self.text = comment_id, text
        # ID of the post this comment was made on.
        self.post_id = post_id
        # Tree links start out empty; they are filled in by the code that
        # populates comments_dict.
        self.parent_comment = None
        self.replies = list()

# Map post IDs to Post objects and comment IDs to Comment objects.
posts_dict = {}
comments_dict = {}

# One Post per row of the Titans posts dataframe; a repeated ID keeps the last row.
for post_row in tt_posts.itertuples():
    posts_dict[post_row.id] = Post(post_row.id, post_row.Title, post_row.Content)

# itertuples() exposes columns as attributes, so the columns whose names contain
# spaces are renamed to valid Python identifiers first. Columns whose names are
# already valid identifiers (Text, Author, Date) need no entry.
comments_df = tt_comments.rename(columns={
    'Comment ID': 'Comment_ID',
    'Parent Comment ID': 'Parent_Comment_ID',
    'Post ID': 'Post_ID'
})

# Pass 1: create every Comment and remember which parent ID (if any) it names.
pending_links = []
for comment_row in comments_df.itertuples():
    comment = Comment(comment_row.Comment_ID, comment_row.Text, comment_row.Post_ID)
    comments_dict[comment.comment_id] = comment
    if not pd.isna(comment_row.Parent_Comment_ID):
        pending_links.append((comment, comment_row.Parent_Comment_ID))

# Pass 2: link replies to parents only once all comments exist. Linking inside
# the first loop would miss any parent whose row comes after its reply in the
# CSV, leaving that reply looking like a top-level comment.
for comment, parent_comment_id in pending_links:
    parent_comment = comments_dict.get(parent_comment_id)
    # A parent ID with no matching comment row is left unlinked.
    if parent_comment is not None:
        comment.parent_comment = parent_comment
        parent_comment.replies.append(comment)

# Function to get the full thread for a given post and its comments
def get_thread_for_post(post, comments_dict):
    """Return the text of ``post`` and its comments with URLs replaced by ``<URL>``.

    The text starts with the post title and content, followed by one line per
    comment in ``comments_dict`` whose ``post_id`` equals ``post.post_id``, in
    the dictionary's iteration order. A line is prefixed ``T:`` when the comment
    has no linked parent and ``R:`` when it does.

    NOTE(review): values are interpolated as-is, so a missing title, content or
    comment text that pandas loaded as NaN would appear as the literal "nan".
    Confirm the cleaned CSVs contain none.
    """
    pieces = [f"Title: {post.title}\nContent: {post.content}\n\nComments:\n"]

    for entry in comments_dict.values():
        # Skip comments that belong to other posts.
        if entry.post_id != post.post_id:
            continue
        indicator = "R:" if entry.parent_comment is not None else "T:"
        pieces.append(f"{indicator} Comment Text: {entry.text}\n")

    # URL replacement runs once over the assembled thread text.
    return cleantext.replace_urls("".join(pieces), replace_with="<URL>")

In [15]:
## Store threads
# Group the comments by post once, so each post is matched against only its own
# comments instead of rescanning the whole comments_dict for every post.
# Insertion order is kept, so each thread lists comments in the same order as before.
comments_by_post = {}
for comment_id, comment in comments_dict.items():
    comments_by_post.setdefault(comment.post_id, {})[comment_id] = comment

# One thread string per Titans post, in posts_dict order.
tt_threads = [
    get_thread_for_post(post, comments_by_post.get(post.post_id, {}))
    for post in posts_dict.values()
]

In [16]:
# Whitespace-delimited word count of each Titans thread, in tt_threads order.
tt_word_counts = [len(thread_text.split()) for thread_text in tt_threads]
# The longest thread, in words, is shown as the cell output.
max(tt_word_counts)

30232

## NSC

In [6]:
## Load data
# Cleaned Reddit exports for the NSC dataset: one CSV of posts, one of comments.
nsc_posts, nsc_comments = (
    pd.read_csv('../Reddit Data/nsc_posts_clean.csv'),
    pd.read_csv('../Reddit Data/nsc_comments_clean.csv'),
)

In [17]:
## Load helper functions
class Post:
    """A Reddit post: its ID, title and body text, plus a list for comments.

    Same definition as in the Titans section; running this cell rebinds the name.
    """

    def __init__(self, post_id, title, content):
        # Identity and text of the post, stored exactly as passed in.
        self.post_id, self.title, self.content = post_id, title, content
        # Fresh list per instance; nothing in this notebook appends to it.
        self.comments = list()

class Comment:
    """A Reddit comment together with its links into the reply tree.

    Same definition as in the Titans section; running this cell rebinds the name.
    """

    def __init__(self, comment_id, text, post_id):
        self.comment_id, self.text = comment_id, text
        # ID of the post this comment was made on.
        self.post_id = post_id
        # Tree links start out empty; they are filled in by the code that
        # populates comments_dict.
        self.parent_comment = None
        self.replies = list()

# Map post IDs to Post objects and comment IDs to Comment objects.
# Both dictionaries are rebuilt from scratch for the NSC dataset.
posts_dict = {}
comments_dict = {}

# One Post per row of the NSC posts dataframe; a repeated ID keeps the last row.
for post_row in nsc_posts.itertuples():
    posts_dict[post_row.id] = Post(post_row.id, post_row.Title, post_row.Content)

# itertuples() exposes columns as attributes, so the columns whose names contain
# spaces are renamed to valid Python identifiers first. Columns whose names are
# already valid identifiers (Text, Author, Date) need no entry.
comments_df = nsc_comments.rename(columns={
    'Comment ID': 'Comment_ID',
    'Parent Comment ID': 'Parent_Comment_ID',
    'Post ID': 'Post_ID'
})

# Pass 1: create every Comment and remember which parent ID (if any) it names.
pending_links = []
for comment_row in comments_df.itertuples():
    comment = Comment(comment_row.Comment_ID, comment_row.Text, comment_row.Post_ID)
    comments_dict[comment.comment_id] = comment
    if not pd.isna(comment_row.Parent_Comment_ID):
        pending_links.append((comment, comment_row.Parent_Comment_ID))

# Pass 2: link replies to parents only once all comments exist. Linking inside
# the first loop would miss any parent whose row comes after its reply in the
# CSV, leaving that reply looking like a top-level comment.
for comment, parent_comment_id in pending_links:
    parent_comment = comments_dict.get(parent_comment_id)
    # A parent ID with no matching comment row is left unlinked.
    if parent_comment is not None:
        comment.parent_comment = parent_comment
        parent_comment.replies.append(comment)

# Function to get the full thread for a given post and its comments
def get_thread_for_post(post, comments_dict):
    """Return the text of ``post`` and its comments with URLs replaced by ``<URL>``.

    The text starts with the post title and content, followed by one line per
    comment in ``comments_dict`` whose ``post_id`` equals ``post.post_id``, in
    the dictionary's iteration order. A line is prefixed ``T:`` when the comment
    has no linked parent and ``R:`` when it does.

    NOTE(review): values are interpolated as-is, so a missing title, content or
    comment text that pandas loaded as NaN would appear as the literal "nan".
    Confirm the cleaned CSVs contain none.
    """
    pieces = [f"Title: {post.title}\nContent: {post.content}\n\nComments:\n"]

    for entry in comments_dict.values():
        # Skip comments that belong to other posts.
        if entry.post_id != post.post_id:
            continue
        indicator = "R:" if entry.parent_comment is not None else "T:"
        pieces.append(f"{indicator} Comment Text: {entry.text}\n")

    # URL replacement runs once over the assembled thread text.
    return cleantext.replace_urls("".join(pieces), replace_with="<URL>")

In [18]:
## Store threads
# Group the comments by post once, so each post is matched against only its own
# comments instead of rescanning the whole comments_dict for every post.
# Insertion order is kept, so each thread lists comments in the same order as before.
comments_by_post = {}
for comment_id, comment in comments_dict.items():
    comments_by_post.setdefault(comment.post_id, {})[comment_id] = comment

# One thread string per NSC post, in posts_dict order.
nsc_threads = [
    get_thread_for_post(post, comments_by_post.get(post.post_id, {}))
    for post in posts_dict.values()
]

In [19]:
# Whitespace-delimited word count of each NSC thread, in nsc_threads order.
nsc_word_counts = [len(thread_text.split()) for thread_text in nsc_threads]
# The longest thread, in words, is shown as the cell output.
max(nsc_word_counts)

10412

## Preds

In [10]:
## Load data
# Cleaned Reddit exports for the Preds dataset: one CSV of posts, one of comments.
preds_posts, preds_comments = (
    pd.read_csv('../Reddit Data/preds_posts_clean.csv'),
    pd.read_csv('../Reddit Data/preds_comments_clean.csv'),
)

In [20]:
## Load helper functions
class Post:
    """A Reddit post: its ID, title and body text, plus a list for comments.

    Same definition as in the earlier sections; running this cell rebinds the name.
    """

    def __init__(self, post_id, title, content):
        # Identity and text of the post, stored exactly as passed in.
        self.post_id, self.title, self.content = post_id, title, content
        # Fresh list per instance; nothing in this notebook appends to it.
        self.comments = list()

class Comment:
    """A Reddit comment together with its links into the reply tree.

    Same definition as in the earlier sections; running this cell rebinds the name.
    """

    def __init__(self, comment_id, text, post_id):
        self.comment_id, self.text = comment_id, text
        # ID of the post this comment was made on.
        self.post_id = post_id
        # Tree links start out empty; they are filled in by the code that
        # populates comments_dict.
        self.parent_comment = None
        self.replies = list()

# Map post IDs to Post objects and comment IDs to Comment objects.
# Both dictionaries are rebuilt from scratch for the Preds dataset.
posts_dict = {}
comments_dict = {}

# One Post per row of the Preds posts dataframe; a repeated ID keeps the last row.
for post_row in preds_posts.itertuples():
    posts_dict[post_row.id] = Post(post_row.id, post_row.Title, post_row.Content)

# itertuples() exposes columns as attributes, so the columns whose names contain
# spaces are renamed to valid Python identifiers first. Columns whose names are
# already valid identifiers (Text, Author, Date) need no entry.
comments_df = preds_comments.rename(columns={
    'Comment ID': 'Comment_ID',
    'Parent Comment ID': 'Parent_Comment_ID',
    'Post ID': 'Post_ID'
})

# Pass 1: create every Comment and remember which parent ID (if any) it names.
pending_links = []
for comment_row in comments_df.itertuples():
    comment = Comment(comment_row.Comment_ID, comment_row.Text, comment_row.Post_ID)
    comments_dict[comment.comment_id] = comment
    if not pd.isna(comment_row.Parent_Comment_ID):
        pending_links.append((comment, comment_row.Parent_Comment_ID))

# Pass 2: link replies to parents only once all comments exist. Linking inside
# the first loop would miss any parent whose row comes after its reply in the
# CSV, leaving that reply looking like a top-level comment.
for comment, parent_comment_id in pending_links:
    parent_comment = comments_dict.get(parent_comment_id)
    # A parent ID with no matching comment row is left unlinked.
    if parent_comment is not None:
        comment.parent_comment = parent_comment
        parent_comment.replies.append(comment)

# Function to get the full thread for a given post and its comments
def get_thread_for_post(post, comments_dict):
    """Return the text of ``post`` and its comments with URLs replaced by ``<URL>``.

    The text starts with the post title and content, followed by one line per
    comment in ``comments_dict`` whose ``post_id`` equals ``post.post_id``, in
    the dictionary's iteration order. A line is prefixed ``T:`` when the comment
    has no linked parent and ``R:`` when it does.

    NOTE(review): values are interpolated as-is, so a missing title, content or
    comment text that pandas loaded as NaN would appear as the literal "nan".
    Confirm the cleaned CSVs contain none.
    """
    pieces = [f"Title: {post.title}\nContent: {post.content}\n\nComments:\n"]

    for entry in comments_dict.values():
        # Skip comments that belong to other posts.
        if entry.post_id != post.post_id:
            continue
        indicator = "R:" if entry.parent_comment is not None else "T:"
        pieces.append(f"{indicator} Comment Text: {entry.text}\n")

    # URL replacement runs once over the assembled thread text.
    return cleantext.replace_urls("".join(pieces), replace_with="<URL>")

In [21]:
## Store threads
# Group the comments by post once, so each post is matched against only its own
# comments instead of rescanning the whole comments_dict for every post.
# Insertion order is kept, so each thread lists comments in the same order as before.
comments_by_post = {}
for comment_id, comment in comments_dict.items():
    comments_by_post.setdefault(comment.post_id, {})[comment_id] = comment

# One thread string per Preds post, in posts_dict order.
preds_threads = [
    get_thread_for_post(post, comments_by_post.get(post.post_id, {}))
    for post in posts_dict.values()
]

In [22]:
# Whitespace-delimited word count of each Preds thread, in preds_threads order.
preds_word_counts = [len(thread_text.split()) for thread_text in preds_threads]
# The longest thread, in words, is shown as the cell output.
max(preds_word_counts)

7784

In [24]:
# Number of Titans threads longer than 10412 words. 10412 is the maximum NSC
# thread length reported earlier in this notebook.
count_over = len([words for words in tt_word_counts if words > 10412])
count_over

9

In [25]:
# Positions in tt_word_counts of the Titans threads longer than 10412 words.
idx_over = [
    position
    for position in range(len(tt_word_counts))
    if tt_word_counts[position] > 10412
]
idx_over

[21, 42, 84, 121, 931, 1034, 1150, 1452, 1555]

In [37]:
# Word count of the Titans thread at index 1555, the last index listed in idx_over.
tt_word_counts[1555]

11358