# ABSA on Reddit Comments
- This will look at using the Reddit parent post as context for each comment

In [1]:
## Load libraries
from langchain.llms.openai import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.chains import SequentialChain
import openai
from getpass import getpass
import os
import warnings
import pandas as pd

warnings.filterwarnings("ignore")

In [2]:
nsc_posts = pd.read_csv('../Reddit Data/nsc_posts.csv')
nsc_comments = pd.read_csv('../Reddit Data/nsc_comments.csv')

In [3]:
nsc_posts.head()

Unnamed: 0.1,Unnamed: 0,id,Title,Content,Author,Post Date
0,0,ut4efu,I played the guitar riff yesterday!,,Grace-Music,2022-05-19 14:16:19
1,1,n3x5zx,Took my daughter to her first MLS game yesterd...,,JiuManji,2021-05-03 14:15:03
2,2,jy4kdq,FIRST PLAYOFF WIN UPVOTE PARTY!!,LETS GO!! What a Game!!!,BigBlueNate33,2020-11-21 04:26:24
3,3,fbuyj5,Thought this guy deserved a shoutout,,fullthrottle13,2020-03-01 14:44:32
4,4,k0isnq,ANOTHER PLAYOFF WIN UPVOTE PARTY!!,LETS. FREAKING. GO!!!!!!! MASSIVE CLUB!!! Semi...,BigBlueNate33,2020-11-25 01:56:02


In [4]:
nsc_comments.head()

Unnamed: 0.1,Unnamed: 0,Comment ID,Parent Comment ID,Text,Author,Date,Post ID
0,0,i97ewgh,,It was fantastic! You crushed it!,DirtyFlip,2022-05-19 14:26:04,ut4efu
1,1,i97eye1,,Sounded great!,Cerebralflea,2022-05-19 14:26:27,ut4efu
2,2,i97fvs9,,you did a great job!!,trillwilly69,2022-05-19 14:33:10,ut4efu
3,3,i97llqq,,Amazing job last night. Hope you can come bac...,jasonlp03,2022-05-19 15:13:11,ut4efu
4,4,i97o17n,i97llqq,Me too!,Grace-Music,2022-05-19 15:29:57,ut4efu


In [5]:
class Post:
    """A Reddit submission: its ID, title, body text, and a list for comments."""

    def __init__(self, post_id, title, content):
        # Identifying fields are stored exactly as they are passed in.
        self.post_id, self.title, self.content = post_id, title, content
        # Starts empty; this constructor does not populate it.
        self.comments = list()

class Comment:
    """A single Reddit comment with links to its parent comment and its replies."""

    def __init__(self, comment_id, text, post_id):
        self.comment_id, self.text = comment_id, text
        # ID of the post this comment was made on.
        self.post_id = post_id
        # Thread links start unset; the constructor does not wire them up.
        self.replies = list()
        self.parent_comment = None

# Map post IDs to Post objects and comment IDs to Comment objects.
posts_dict = {}
comments_dict = {}

# One Post per row of the posts dataframe, keyed by its Reddit ID.
for post_row in nsc_posts.itertuples():
    posts_dict[post_row.id] = Post(post_row.id, post_row.Title, post_row.Content)

# itertuples() exposes columns as attributes, so the column names that contain
# spaces are renamed to valid identifiers. The other columns keep their names.
comments_df = nsc_comments.rename(columns={
    'Comment ID': 'Comment_ID',
    'Parent Comment ID': 'Parent_Comment_ID',
    'Post ID': 'Post_ID',
})

# Pass 1: create every Comment before any linking is attempted.
for comment_row in comments_df.itertuples():
    comments_dict[comment_row.Comment_ID] = Comment(
        comment_row.Comment_ID, comment_row.Text, comment_row.Post_ID
    )

# Pass 2: connect replies to their parents. Linking in a separate pass means a
# reply that is listed before its parent in the CSV is still attached, instead
# of being left with parent_comment=None and treated as a top-level comment.
for comment_row in comments_df.itertuples():
    parent_comment_id = comment_row.Parent_Comment_ID
    if pd.isna(parent_comment_id):
        continue  # top-level comment: no parent to attach to

    parent_comment = comments_dict.get(parent_comment_id)
    if parent_comment is None:
        continue  # parent ID is not present in the data; leave the comment unlinked

    comment = comments_dict[comment_row.Comment_ID]
    if comment.parent_comment is not None:
        continue  # already linked (duplicate row for the same comment ID)

    comment.parent_comment = parent_comment
    parent_comment.replies.append(comment)

## Notes
- This structure should theoretically allow us to reference what we need with if statements and go through posts then comments.
- This approach will likely rely heavily on prompt engineering.
- CONCERN: This could be quite expensive. It loops through all the comments once for each post, and then again to check for replies.
- Might be best to use this structure to pass in an entire post and comments related to it and perform this in its entirety post by post

In [6]:
# Take the first entry of posts_dict and show it with its top-level comments.
first_post_id, first_post = next(iter(posts_dict.items()))

print(
    f"Post ID: {first_post_id}",
    f"Post Title: {first_post.title}",
    f"Post Content: {first_post.content}",
    sep="\n",
)

print("Comments on the First Post:")
for comment in comments_dict.values():
    # Skip replies and anything that belongs to a different post.
    if comment.parent_comment is not None or comment.post_id != first_post_id:
        continue

    print(f"  Comment ID: {comment.comment_id}")
    print(f"  Comment Text: {comment.text}")

    # Direct replies are listed underneath, indented one level further.
    for reply in comment.replies:
        print(f"    Reply ID: {reply.comment_id}")
        print(f"    Reply Text: {reply.text}")

Post ID: ut4efu
Post Title: I played the guitar riff yesterday!
Post Content: nan
Comments on the First Post:
  Comment ID: i97ewgh
  Comment Text: It was fantastic!  You crushed it!
  Comment ID: i97eye1
  Comment Text: Sounded great!
  Comment ID: i97fvs9
  Comment Text: you did a great job!!
  Comment ID: i97llqq
  Comment Text: Amazing job last night.  Hope you can come back and do it again soon.
    Reply ID: i97o17n
    Reply Text: Me too!
  Comment ID: i97gs0n
  Comment Text: Great job!!!
  Comment ID: i97gegb
  Comment Text: Badass
  Comment ID: i97hpil
  Comment Text: Sounded awesome!!
  Comment ID: i97rdeh
  Comment Text: Absolutely killed it also!
Where do you pull your inspiration for guitar?
  Comment ID: i97tznu
  Comment Text: Great job!
  Comment ID: i97u05w
  Comment Text: You crushed it!
  Comment ID: i97u5gk
  Comment Text: Absolutely crushed it!
  Comment ID: i97ulg7
  Comment Text: ABSOLUTELY SHREDDED
  Comment ID: i97ye6g
  Comment Text: Dude you killed. One of th

In [7]:
# Function to print comments and replies for a given post
def print_comments_for_post(post, comments_dict):
    """Print a post's title and content, then every comment made on that post.

    Args:
        post: object with ``post_id``, ``title`` and ``content`` attributes.
        comments_dict: mapping of comment ID to comment object; each comment
            needs ``post_id``, ``parent_comment`` and ``text`` attributes.

    Comments are printed in the iteration order of ``comments_dict``. A comment
    with no parent is prefixed "T:" (top-level), any other comment "R:" (reply).
    """
    print(f"Title: {post.title}")
    print(f"Content: {post.content}")
    print("Comments:")

    for entry in comments_dict.values():
        # Only comments made on this post are shown.
        if entry.post_id != post.post_id:
            continue
        indicator = "T:" if entry.parent_comment is None else "R:"
        print(f"{indicator} Comment Text: {entry.text}")

# Print the title, content and comments of every post in posts_dict.
# NOTE: each call scans the whole comments_dict, and the printed output covers
# every post, so this cell produces a very long output on the full dataset.
for post_id, post in posts_dict.items():
    print_comments_for_post(post, comments_dict)

Title: I played the guitar riff yesterday!
Content: nan
Comments:
T: Comment Text: It was fantastic!  You crushed it!
T: Comment Text: Sounded great!
T: Comment Text: you did a great job!!
T: Comment Text: Amazing job last night.  Hope you can come back and do it again soon.
R: Comment Text: Me too!
T: Comment Text: Great job!!!
T: Comment Text: Badass
T: Comment Text: Sounded awesome!!
T: Comment Text: Absolutely killed it also!
Where do you pull your inspiration for guitar?
T: Comment Text: Great job!
T: Comment Text: You crushed it!
T: Comment Text: Absolutely crushed it!
T: Comment Text: ABSOLUTELY SHREDDED
T: Comment Text: Dude you killed. One of the best ones yet.
T: Comment Text: Yeah you can shred that was the best one this year.
T: Comment Text: You absolutely killed it. Great work!
T: Comment Text: Hell yea! LFGNSC!!
T: Comment Text: Sounded awesome. Great job!
T: Comment Text: Great job and was fun to watch and listen to! Have a bright future in front of you!
T: Comment Text

In [8]:
# Function to get the full thread for a given post and its comments
def get_thread_for_post(post, comments_dict):
    """Build the text of a post and all of its comments as one string.

    Args:
        post: object with ``post_id``, ``title`` and ``content`` attributes.
        comments_dict: mapping of comment ID to comment object; each comment
            needs ``post_id``, ``parent_comment`` and ``text`` attributes.

    Returns:
        A string that starts with the title/content header and a "Comments:"
        line, followed by one line per comment on this post, in the iteration
        order of ``comments_dict``. Comments without a parent are prefixed
        "T:", all others "R:".
    """
    parts = [f"Title: {post.title}\nContent: {post.content}\n\nComments:\n"]

    for entry in comments_dict.values():
        # Comments that belong to other posts are left out.
        if entry.post_id != post.post_id:
            continue
        label = "T:" if entry.parent_comment is None else "R:"
        parts.append(f"{label} Comment Text: {entry.text}\n")

    return "".join(parts)

## Notes
- This indicates a top level comment with T and a reply with R.
- With proper prompt engineering, this could be passed in to look at posts as a whole.
- As long as we strictly tell the model how the structure works, it should be able to do sentiment analysis on the post as a whole.
- If not, we should be able to iterate through the dictionaries and analyze each comment separately.

## ABSA Pipeline With Full Convo

In [9]:
# OpenAI API Key
# The key is typed in at the prompt rather than written into the notebook.
OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# Hand the same key to the openai module directly.
openai.api_key = os.environ["OPENAI_API_KEY"]

# NOTE(review): with temperature 0.7 repeated runs on the same thread can give
# different aspects and scores - confirm this is intended.
llm = OpenAI(temperature=0.7)

In [27]:
# Few-shot examples for the aspect-extraction prompt. Each example pairs a
# thread (in the "Title / Content / Comments" layout with T:/R: prefixes) with
# the aspects that should be extracted from it.
# The "aspects" answers use only the aspect names that the prompt prefix lists
# (NSC team performance, stadium amenities, coaching, pricing, stadium
# atmosphere, media coverage), so the examples do not teach the model to
# return labels outside that list.
examples = [{
    "thread": '''
    Title: First NSC game!
    Content: Went to my first game this weekend and had so much fun! The crowd made the game so exiting and the team played so well. The goal scored by Hany was unreal! Are all games this fun?
    Comments: 
    T: Comment Text: Glad you had fun! Games are always fun, but definitely better when we win. We just gotta hope the team and coaches keep it up!
    T: Comment Text: It was my first game too. The stadium was so much cooler than I ever expected. The food was way too expensive though!
    R: Comment Text: I agree, prices for food are way too high! Only worth it when we are winning!
    ''',
    "aspects": "Relevant Aspects are NSC team performance, stadium atmosphere, stadium amenities, and pricing."
},
{
    "thread": '''
    Title: New coach?
    Content: I have been disappointed with our coaching lately. I feel as if Smith is not getting the most out of the players. Is it wrong for me to want to see change?
    Comments: 
    T: Comment Text: I agree it hasn't been up to our standards, but I think we have to have faith and trust Coach Smith!
    R: Comment Text: I disagree, he should be fired.
    T: Comment Text: Sometimes coaches can be used as a scapegoat. We need to give it a bit longer and let the players gel together in this system.
    T: Comment Text: Trust the process. Our coaches and players will bring it together and we can make a playoff push. It is our job as fans to pack Geodis and help motivate them to perform!
    R: Comment Text: While I agree with packing Geodis, shouldn't they be motivated already?
    R: Comment Text: The crowd has been quiet recently. Let's get loud and encourage our boys!
    ''',
    "aspects": "Relevant Aspects are NSC team performance, coaching, and stadium atmosphere."
}]

# Layout used to render each example: the thread, then its aspect answer.
prompt_template = '''
Thread: {thread}
{aspects}
'''

example_prompt = PromptTemplate(input_variables = ["thread", "aspects"], template = prompt_template)

In [28]:
# Few-shot prompt for aspect extraction: the instructions (prefix), then the
# rendered examples, then the thread to analyse (suffix).
# The prefix is a normal (non-raw) string, so the backslash in the sentence
# about the newline marker is escaped ('\\n'). Unescaped, Python turns it into
# a real line break and the sentence reaches the model split across two lines.
final_prompt = FewShotPromptTemplate(
    examples = examples,
    example_prompt = example_prompt,
    suffix = "Thread: {thread}\n",
    input_variables = ["thread"],
    prefix = '''
    I am extracting aspects from a Reddit Thread made by Nashville SC fans. Nashville SC (NSC) is a soccer team that plays at their stadium: Geodis Park in Nashville, Tennessee. Their roster consists of Joe Willis, Daniel Lovitz, Lukas MacNaughton, Nick DePuy, Jack Maher, Dax McCarty (captain), Fafà Picault, Randall Leal, Sam Surridge, Hany Mukhtar, Ethan Zubak, Teal Bunbury, Joey Skinner, Jacob Shaffelburg, Laurence Wyke, Shaq Moore, Alex Muyl, Aníbal Godoy, Ahmed Longmire, Josh Bauer, Taylor Washington, Walker Zimmerman, Luke Haakenson, Brian Anunga, Nebiyou Perry, Elliot Panicco, Sean Davis, Ben Martino, Adem Sipić, and Kemy Amiche. Their coach is Gary Smith. Any other player names that are given, assume they are on an opposing team.
For this conversational thread, please return a list of the following aspects of fan experience (if they are present in the text): NSC team performance, stadium amenities, coaching, pricing, stadium atmosphere, and media coverage. A note: stadium amenities include food and the gift shop.
The structure of the post will be as follows. Title is the general title made of the original post. Content is the text from the original post. Comments will be all comments on the post. Any comment labelled as "T: Comment Text:" is a top-level comment, so use the original post and content as the context for this comment. Any comment labelled as "R: Comment Text:" is a reply comment, so use all the comments above it until you hit a top-level comment, as well as the original post and content as the context for this comment. If the Content of a post is "nan" that means the post was an image. For any of these posts, just consider the post title and comments, ignoring the "nan" content.
Additionally, when the newline operator '\\n' is present, this means that it is the end of that respective post section. This will work as a seperator to help distinguish posts from comments and so on.
    ''')

In [29]:
# First stage: runs the few-shot aspect prompt on the LLM and exposes the
# result under the 'aspects' output key.
aspect_extraction_chain = LLMChain(llm = llm, prompt = final_prompt, output_key = 'aspects')

In [30]:
# Second-stage prompt: given a thread and its extracted aspects, ask for a
# sentiment score per aspect in "(aspect, sentiment_score)" form.
prompt_template2 = '''
Given below thread and the extracted aspects, tell me about the sentiment of those aspects. This sentiment should be on a continuous scale of -1 to 1, where -1 is the most negative, 0 is the most neutral, and 1 is the most postive. Round the score to 2 decimal places. Follow this format: (aspect, sentiment_score).
Thread: {thread}
Aspects: {aspects}
[(Aspect1, Sentiment_Score_1), (Aspect2, Sentiment_Score_2),.....]
'''

# Template with the two placeholders used above.
example_prompt2 = PromptTemplate(
    template=prompt_template2,
    input_variables=["thread", "aspects"],
)

# Chain for the scoring step; its result is exposed as "Aspects_with_sentiment".
aspect_sentiment_chain = LLMChain(
    prompt=example_prompt2,
    llm=llm,
    output_key="Aspects_with_sentiment",
)

In [31]:
# Full pipeline: aspect extraction followed by aspect sentiment scoring.
# Takes a "thread" and returns the "Aspects_with_sentiment" output.
overall_chain = SequentialChain(
    input_variables=["thread"],
    chains=[aspect_extraction_chain, aspect_sentiment_chain],
    output_variables=["Aspects_with_sentiment"],
    verbose=True,
)

## Testing on 1st conversation

In [32]:
# Build the thread text for every post, in the order posts appear in posts_dict.
threads = [
    get_thread_for_post(post, comments_dict)
    for post in posts_dict.values()
]

In [35]:
x = overall_chain({"thread": threads[0]})



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


In [36]:
x['Aspects_with_sentiment']

'\n("Played the guitar riff", 0.93), ("Crushed it", 0.93), ("Sounded great", 0.93), ("Great job", 0.93), ("Amazing job", 0.93), ("Badass", 0.85), ("Absolutely killed it", 0.93), ("Shredded", 0.93), ("Killed it", 0.93), ("Great work", 0.93), ("Sick", 0.93), ("Way to go", 0.93), ("Did great", 0.93), ("Awesome", 0.93), ("Great shred", 0.93)'

In [20]:
x = overall_chain({"thread": threads[1]})
x['Aspects_with_sentiment']



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


'(General sentiment, 0.94)'

In [18]:
import re

# One "(aspect, score)" pair. The aspect is any text without commas or
# parentheses; the score must be a plain decimal number with an optional sign.
# Whitespace around either part is allowed.
_PAIR_PATTERN = re.compile(
    r'\(\s*([^,()]+?)\s*,\s*([-+]?(?:\d+\.?\d*|\.\d+))\s*\)'
)


def parse_aspect_sentiments(raw_output):
    """Turn the model's "(aspect, score)" text into a list of tuples.

    Args:
        raw_output: the string returned under 'Aspects_with_sentiment'.

    Returns:
        A list of ``(aspect, score)`` tuples, with ``aspect`` as a string
        (surrounding quotes removed) and ``score`` as a float. Pairs whose
        second field is not a number, such as an echoed
        ``(Aspect1, Sentiment_Score_1)`` placeholder, are skipped rather than
        raising ``ValueError``. An empty or non-matching string gives ``[]``.
    """
    pairs = []
    for aspect, score in _PAIR_PATTERN.findall(raw_output):
        # The model sometimes wraps aspect names in quotes; drop them.
        pairs.append((aspect.strip('"\' '), float(score)))
    return pairs


# Extract aspects_with_sentiment value
aspects_with_sentiment = x['Aspects_with_sentiment']

# Raw (aspect, score) string pairs found in the model output
matches = _PAIR_PATTERN.findall(aspects_with_sentiment)

# Parsed list of (aspect, float score) tuples
result_list = parse_aspect_sentiments(aspects_with_sentiment)

print(result_list)

[('Stadium Atmosphere', 0.95), ('Stadium Amenities', 0.91), ('Media Coverage', 0.91)]


In [19]:
type(threads)

list

In [19]:
overall_chain({"thread": threads[2]})



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


{'thread': "Title: FIRST PLAYOFF WIN UPVOTE PARTY!!\nContent: LETS GO!! What a Game!!!\n\nComments:\nT: Comment Text: In preseason or early in the season, dax did an interview and was asked about his expectations for the team and the season. His response was basically that his goal was the MLS cup. I laughed at the time. I appreciated the optimism but also...let's be realistic here. Well...I'm not laughing anymore. No doubt we're a long shot away. But I'm not laughing.\nR: Comment Text: Dax played like a champ tonight. Defense as a whole was STOUT.\nT: Comment Text: YEET!!!\nR: Comment Text: YUH YUH YUH YUH YEEEEEEEEEEEEET\nT: Comment Text: WOOO!!!\nT: Comment Text: WOOHOO!!!\nT: Comment Text: BIG DUB\nT: Comment Text: I think I peed a little\nT: Comment Text: Wooooooo\nT: Comment Text: Fantastic win!  A statement game on national tv as “underdogs”\nT: Comment Text: [deleted]\nR: Comment Text: Fuck it, you’re hired\nT: Comment Text: Hooty HOO!!!!\nT: Comment Text: Good win last night!!

In [41]:
overall_chain({"thread": '''
Title: Are we doing well enough?
Content: I think the team has been outstanding overall, but I was dissapointed by our loss to Atlanta. 
T: Comment Text: I disagree, I think we were fine against Atlanta and great overall.
'''})



[1m> Entering new SequentialChain chain...[0m

[1m> Finished chain.[0m


{'thread': '\nTitle: Are we doing well enough?\nContent: I think the team has been outstanding overall, but I was dissapointed by our loss to Atlanta. \nT: Comment Text: I disagree, I think we were fine against Atlanta and great overall.\n',
 'Aspects_with_sentiment': '(NSC team performance, 0.67)'}

### Loop for all and storing output
- Won't run this until billing is sorted out

In [None]:
# Run the full pipeline on every thread and keep each result, in thread order.
output = list()
for thread in threads:
    chain_result = overall_chain({"thread": thread})
    output.append(chain_result)
    # Eventually add in output parsing?