# ABSA on Preds Threads

!pip install clean-text

In [1]:
## Load libraries
from langchain.llms.openai import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.chains import SequentialChain
import openai
from getpass import getpass
import os
import warnings
import pandas as pd
import re
import random
import cleantext
import time

warnings.filterwarnings("ignore")

In [2]:
## Load data
preds_posts = pd.read_csv('../Reddit Data/preds_posts_clean.csv')
preds_comments = pd.read_csv('../Reddit Data/preds_comments_clean.csv')

In [3]:
## Load helper functions
class Post:
    def __init__(self, post_id, title, content):
        self.post_id = post_id
        self.title = title
        self.content = content
        self.comments = []

class Comment:
    def __init__(self, comment_id, text, post_id):
        self.comment_id = comment_id
        self.text = text
        self.post_id = post_id  # Store the post_id
        self.parent_comment = None
        self.replies = []

# Create dictionaries to map post IDs to Post objects and comment IDs to Comment objects.
posts_dict = {}
comments_dict = {}

# Populate posts_dict and comments_dict from your dataframes.
for post_row in preds_posts.itertuples():
    post_id = post_row.id
    title = post_row.Title
    content = post_row.Content
    post = Post(post_id, title, content)
    posts_dict[post_id] = post


comments_df = preds_comments.rename(columns={
    'Comment ID': 'Comment_ID',
    'Parent Comment ID': 'Parent_Comment_ID',
    'Text': 'Text',
    'Author': 'Author',
    'Date': 'Date',
    'Post ID': 'Post_ID'
})

# Now the columns have underscores instead of spaces, making it easier to access them.

# You can use the updated column names directly in your code as follows:
for comment_row in comments_df.itertuples():
    comment_id = comment_row.Comment_ID
    text = comment_row.Text
    post_id = comment_row.Post_ID  # Store the post_id
    comment = Comment(comment_id, text, post_id)
    comments_dict[comment_id] = comment

    # Assign parent comment if it exists.
    parent_comment_id = comment_row.Parent_Comment_ID
    if not pd.isna(parent_comment_id):
        parent_comment = comments_dict.get(parent_comment_id)
        if parent_comment:
            comment.parent_comment = parent_comment
            parent_comment.replies.append(comment)

# Function to get the full thread for a given post and its comments
def get_thread_for_post(post, comments_dict):
    thread = f"Title: {post.title}\nContent: {post.content}\n\nComments:\n"
    
    for comment_id, comment in comments_dict.items():
        if comment.post_id == post.post_id:
            if comment.parent_comment is None:
                indicator = "T:"  # Top-level comment indicator
            else:
                indicator = "R:"  # Reply indicator
            # Add the comment to the thread
            thread += f"{indicator} Comment Text: {comment.text}\n"
            
    thread_no_urls = cleantext.replace_urls(thread, replace_with="<URL>")
    return thread_no_urls

In [4]:
## Set up OpenAI Key
OPENAI_API_KEY = getpass()
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")
llm = OpenAI(model_name="gpt-3.5-turbo-1106")

In [5]:
# FEWER EXAMPLES
## Aspect Extraction Chain
examples = [{
    "thread": '''
    Title: the lack of fundamentals on this team is astounding
    Content: to me the most shocking thing is how bad this team is top to bottom when it comes to fundamentals. awful decision making, penalties, etc. we’re awful across the board. it’s one thing to not have much talent, but how could a team under Brunette be so bad at the fundamentals? our older teams had great fundamentals. 
    
    what happened? this season had really put Brunette's actual coaching skills in a terrible light imo.
    Comments: 
    T: Comment Text: It starts with coaching and buy in. Brunette has lost most of the team IMO. They have no real leaders. Most guys will be gone in 2 years and they know it. A bunch of rouge players not giving a shit.

    They don't trust the Brunette way anymore.
    R: Comment Text: Moulin Rouge?
    T: Comment Text: There is nowhere else to look other than Brunette. The justification to keep him around is that he should be able to field a disciplined team that gives effort and makes smart plays even if there is a talent disparity.

    With how unprofessional and unprepared the team has been the last few years I’m not sure what coaching “advantage” he’s giving you anymore.
    R: Comment Text: Trotz has to get rid of Brunette. Period
    R: Comment Text: It's time. 2-7.

    Who can this team beat in the remaining schedule? The Sharks??
    ''',
    "aspects": "Relevant aspects are 'team performance', 'coaching and management'"
},
{
    "thread": '''
    Title: Am I the only one that thinks espn hates talking abt the preds
    Content:  I swear they talk abt everyone we beat and any other game that was on or abt to play rather than giving us any credit. From what I’ve seen they don’t want us here and they try to play down what we’ve done and have given us no credit saying that the teams we’ve beat didn’t play good or we got lucky. Let me know your thoughts
    Comments: 
    T: Surprise! We are a small market team. It’s been this way literally forever.
    R: How is Nashville a small market team when there are 1.5 million people in the city on any given day?
    R: Because half of them aren’t fans/from Nashville
    T: I think Brunette went over to ESPN and bullied them all to keep the talking about us to a minimum, to help our boys stay focused and keep that hungry underdog mentality!
    T: I just enjoy that they are talking. And I don’t think they “hate” us. Just a small market team. Hard to satisfy a national audience with a small amount of fans. They love Forsberg and the heart of the team.
    T: ESPN is a known bias propagandizing network . I've switched to fox more and more this year. They giving preds love.
    T: People need to understand the audience isn’t the same for ESPN anymore. Its not like it used to be where every man in the world watches ESPN. A lot of the higher quality viewers are just using the internet or their phones now to look up high lights. As a result I feel these sports shows have to be more entertaining to get ratings, and as a result they just talk about the popular teams. People this is entertainment. They aren’t out here to give a fair or honest assessment.
    R: Chicken and egg with that one. I stopped watching when ESPN became the Lebron/Tiger/Brady channel.
    ''',
    "aspects": "Relevant aspects are 'media coverage'"
},
{
   "thread": '''
    Title: Flyers Fans visiting the area looking for the best meal in Bridgestone Arena
    Content:  Hey all, me and a bunch of friends are visiting Nashville for a bachelor party and surprising the married man to be with tickets to the game Sunday. Seeing as we're probably gonna be hungover on death's door, what's the best meal to munch on in Bridgestone?
    
    If you can drop a name and the section in the comments I'll love you forever.
    Comments: 
    T: I don’t care what anyone says, the walking taco bowl down near the main entrance always slaps. Especially if you’re a couple beers deep.
    T: If you’re dead set on eating at Bridgestone the BBQ nachos are my guilty pleasure.
    T: The grilled cheese sandwiches (Ground floor, just off to the left of the entrance) absolutely fucking slap. I would get one right away as they can take a while, but god damn are they good.
    ''',
    "aspects": "Relevant aspects are 'stadium amenities'"
},
{
    "thread": '''
    Title: Make Bridgestone loud again!
    Content:  This is a call to arms for all fans going to the game tonight! I know things have looked bleak as of late, but we need to remind our boys of our support, and that they shouldn’t dread playing at home for us!
    
    So even if we get into a big hole, let’s all band together and be leaders in the crowd tonight and cheer our team into a win! Never give up!
    Comments: 
    T: When they jacked up the prices the crowd noise quieted down immediately. There was a precipitous drop the season after the playoff run.
    R: Prices were going up after the playoff run anyway. The only question was whether the increase would go to the Predators or to the ticket brokers.
    T: Ridiculously expensive tickets have kept the most loyal and loudest fans home.
    T: I was literally sitting at Wicked Weed watching the prices fall as we got closer to game time because I was having this exact conversation. It’s not the first time the tickets have hit $10. I posted a screenshot with a $10 lower bowl pair a couple weeks ago.
    R: $10 lower bowl seats? PM me if you ever see that again lol
    R: $20 lower bowl seats tonight. $10 in the upper bowl. Shame all the real fans are being priced out though.
    ''',
    "aspects": "Relevant aspects are 'stadium atmosphere', 'pricing'"
}]

prompt_template = '''
Thread: {thread}
{aspects}
'''

example_prompt = PromptTemplate(input_variables = ["thread", "aspects"], template = prompt_template)

final_prompt = FewShotPromptTemplate(
    examples = examples,
    example_prompt = example_prompt,
    suffix = "Thread: {thread}\n",
    input_variables = ["thread"],
    prefix = '''
    I am extracting aspects from a Reddit Thread made by Nashville Predators fans. The Nashville Predators are a hockey team that play at their stadium: Bridgestone Arena in Nashville, Tennessee. Their coach is Andrew Brunette, their general manager is Barry Trotz, and some of their key players are Juuce Saros, Roman Josi, Ryan O'Reilly, Filip Forsberg, Tyson Barrie, Alexandre Carrier, Jeremy Lauzon, Colton Sissons, Kevin Lankinen, Cole Smith, Kiefer Sherwood, Ryan McDonagh, and Gustav Nyquist. Any other names that are given, assume they are on an opposing team.
    For this conversational thread, please return a list of the following aspects of fan experience that are present in the thread: 'team performance' (specifically the Predators, ignore discussion about other teams), 'stadium amenities' (including food and the gift shop), 'coaching and management', 'pricing', 'stadium atmosphere' (including comfort, safety, and crowd atmosphere), 'media coverage', and a 'miscellaneous' category. The 'miscellaneous' category should be returned if there is general conversation that is not covered by the other aspects.
    The structure of the post will be as follows: Title is the general title made of the original post. Content is the text from the original post. Comments will be all comments on the post. Any comment labelled as "T: Comment Text" is a top-level comment, so use the original post and content as the context for this comment. Any comment labelled as "R: Comment Text" is a reply comment, so use all the comments above it until you hit a top-level comment, as well as the original post and content as the context for this comment. If the Content of a post is "nan", that means the post was an image. For any of these posts, just consider the post title and comments, ignoring the "nan" content.
    ''')

aspect_extraction_chain = LLMChain(llm = llm, prompt = final_prompt, output_key = 'aspects')

In [6]:
## Sentiment Analysis Chain
prompt_template2 = '''
The following text is a Reddit thread and the list of aspects extracted from that thread. The aspects will be in the format ['aspect1', 'aspect2'...]. For each aspect in the list, return a sentiment score. Do NOT create aspects of your own; only calculate scores for the aspects provided. This sentiment score should be on a continuous scale from -1 to 1, where -1 is the most negative sentiment, 1 represents the most postive sentiment, and relatively neutral sentiments fall in the range -0.3 to 0.3. Round the score to 2 decimal places. Your output should follow this format: [(Aspect1, Sentiment_Score_1), (Aspect2, Sentiment_Score_2),.....].
Thread: {thread}
Aspects: {aspects}
[(Aspect1, Sentiment_Score_1), (Aspect2, Sentiment_Score_2),.....]
'''

example_prompt2 = PromptTemplate(input_variables = ["thread", "aspects"], template = prompt_template2)

aspect_sentiment_chain = LLMChain(llm = llm, prompt = example_prompt2, output_key = "Aspects_with_sentiment")

In [7]:
## Full Sequential Chain
overall_chain = SequentialChain(
    chains = [aspect_extraction_chain, aspect_sentiment_chain],
    input_variables = ["thread"],
    output_variables = ["thread", "aspects", "Aspects_with_sentiment"],
    verbose = False
)

In [8]:
## Store threads
threads = []
for post_id, post in posts_dict.items():
    thread = get_thread_for_post(post, comments_dict)
    threads.append(thread)

In [10]:
output = []

In [11]:
i = 0
## Run on threads
## Note: won't store in output, need to learn how to store on to DF
for thread in threads:
    if i % 5 == 0:
        time.sleep(60)
    if i % 100 == 0:
        print(i)
    res = overall_chain({"thread": thread})
    temp = res["Aspects_with_sentiment"]
    matches = re.findall(r'\(([^,]+), ([^)]+)\)', temp)
    result_list = [(match[0], float(match[1])) for match in matches]
    output.append(result_list)
    i += 1

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [19]:
## Storing results
ex = preds_posts
ex['results'] = output

# Replace any quotation marks and force all lowercase
ex['cleaned_results'] = ex['results'].apply(lambda x: [(aspect.lower().replace("'", "").replace('"', ''), sentiment) for aspect, sentiment in x])

# List of aspects we want to find
valid_aspects = ['team performance', 'miscellaneous', 'coaching and management', 'stadium atmosphere', 'pricing', 'stadium amenities', 'media coverage']

# Filter the 'cleaned_results' column based on the list of valid aspects
junk = ex['cleaned_results'].apply(lambda x: [(aspect, sentiment) for aspect, sentiment in x if aspect not in valid_aspects])
ex['cleaned_results'] = ex['cleaned_results'].apply(lambda x: [(aspect, sentiment) for aspect, sentiment in x if aspect in valid_aspects])

# Remove any empty lists
ex = ex[ex['cleaned_results'].apply(lambda x: len(x) > 0)]

In [20]:
# Distribution of aspects
aspect_counts = ex['cleaned_results'].explode().apply(lambda x: x[0]).value_counts()

# Display the aspect counts
print(aspect_counts)

cleaned_results
team performance           15
miscellaneous              12
coaching and management     8
stadium atmosphere          7
pricing                     4
stadium amenities           3
media coverage              3
Name: count, dtype: int64


In [21]:
# Averaging sentiment scores
# Explode the 'cleaned_results' column to have one row for each aspect-sentiment pair
df_exploded = ex.explode('cleaned_results')

# Extract aspect and sentiment into separate columns
df_exploded[['aspect', 'sentiment']] = pd.DataFrame(df_exploded['cleaned_results'].tolist(), index=df_exploded.index)

# Calculate the average sentiment score for each aspect
average_sentiment = df_exploded.groupby('aspect')['sentiment'].mean()

# Display the average sentiment scores
print(average_sentiment)

aspect
coaching and management   -0.051250
media coverage             0.080000
miscellaneous              0.073333
pricing                   -0.137500
stadium amenities          0.120000
stadium atmosphere         0.160000
team performance           0.151333
Name: sentiment, dtype: float64


In [22]:
junk

0                                []
1                                []
2                                []
3                                []
4                                []
5                                []
6                                []
7                                []
8                                []
9                                []
10                               []
11                               []
12                               []
13                               []
14                               []
15                               []
16                               []
17                               []
18                               []
19                               []
20                               []
21                               []
22                               []
23    [(trade negotiations, -0.02)]
24                               []
Name: cleaned_results, dtype: object