This Notebook walks you through the steps used for the Baby Reindeer paper. 
# Table of Contents
1. Preprocessing and Cleaning of Data
2. Data Enrichment
3. Create Sub-Dataframes
4. Second Data Enrichment Stage


In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
import os
from dotenv import load_dotenv
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from detoxify import Detoxify
import random
import re
import json

## 1. Preprocessing and Cleaning of Data
In this section, we use the text column to create variables such as user name, episode, etc. We also add an ID variable and create a variable that links a comment to its parent comment.

In [None]:
#Read main df that has everything
# The cells below parse the 'text' column of this frame into separate columns.
df_all = pd.read_csv('data/bbreindeer.csv')
# Sanity check: (rows, columns) of what was read.
print(df_all.shape)
# Notebook display only; a bare .head() shows nothing when run as a script.
df_all.head()

From text, we should be able to extract the comment level (the first number, delimited by tabs, `\t`), the user name (the next string of characters and numbers, again delimited by `\t`), and the comment itself (the rest of the text).

In [None]:
# Field at position 1 of the tab-separated text (position 0 is whatever
# precedes the first tab).
def get_first_element(text):
    """Return the second tab-delimited piece of ``text`` as a string.

    Raises:
        IndexError: if ``text`` contains no tab character.
    """
    fields = text.split('\t')
    return fields[1]

# Field at position 2 of the tab-separated text.
def get_second_element(text):
    """Return the third tab-delimited piece of ``text`` as a string.

    Raises:
        IndexError: if ``text`` contains fewer than two tab characters.
    """
    fields = text.split('\t')
    return fields[2]

# Everything after the third tab, i.e. the comment body.
def get_third_element(text):
    """Return the remainder of ``text`` after its third tab character.

    The split is limited to three cuts so that a comment which itself
    contains tab characters is returned whole instead of being truncated at
    its first internal tab.

    Raises:
        IndexError: if ``text`` contains fewer than three tab characters.
    """
    return text.split('\t', 3)[3]

# Split the raw text into level, user name and comment.
# The level is cast to int right away: left as a string, the parent search
# would compare levels lexicographically ('10' < '2') and a check such as
# `Comment_level == 0` would never be true.
df_all['Comment_level'] = df_all['text'].apply(get_first_element).astype(int)
df_all['User_name'] = df_all['text'].apply(get_second_element)
df_all['Comment'] = df_all['text'].apply(get_third_element)

df_all.head()

Now let's add an id variable as well as a parent id variable, in case a comment is a response to a different comment. In such cases we would like to be able to link the comment back to the parent comment.

In [None]:
#add an id number to it, starting from 100
df_all['id'] = range(100, 100 + len(df_all))

# Link each comment to the comment it replies to: the nearest preceding row
# whose comment level is strictly lower. Levels are cast to int so the
# comparison is numeric even when the column still holds strings.
levels = df_all['Comment_level'].astype(int).tolist()
ids = df_all['id'].tolist()

# `open_ancestors` is a stack of (level, id) for the rows that can still be a
# parent of a later row. A row at level L closes every earlier row at level
# >= L, so each row is pushed and popped at most once (one linear pass
# instead of re-scanning all previous rows for every comment).
open_ancestors = []
parent_ids = []
for level, comment_id in zip(levels, ids):
    while open_ancestors and open_ancestors[-1][0] >= level:
        open_ancestors.pop()
    parent_ids.append(open_ancestors[-1][1] if open_ancestors else None)
    open_ancestors.append((level, comment_id))

# object dtype keeps the ids as integers and the "no parent" marker as None.
df_all['parent_id'] = pd.Series(parent_ids, index=df_all.index, dtype=object)

#save as clean master file
df_all.to_csv('data/bbreindeer_clean.csv', index=False)

## 2. Data Enrichment
There are various columns we need to create to make the data more useful.
* Is Martha/Fiona Mentioned
* Is Donny mentioned
* General sentiment of the comment


### 2.1 Is Martha/Fiona Mentioned?
This will be a simple binary column: 1 if Martha or Fiona is mentioned, 0 if not. The mention can either be explicit (using her name) or implicit (using pronouns like she or her, name calling, etc.). The explicit mentions are easy to detect; the implicit mentions are harder, as we will have to use GPT to figure out who the comment is talking about.

We will
* First look for explicit mentions of Martha
* Then look for implicit mentions of a female
* Then use GPT to classify the comments that do not explicitly mention Martha but implicitly mention a female --> the response should say whether the comment is about Martha, about someone else (such as Teri), or unsure.
* Then collapse the variables into a simple binary Martha variable.

In [None]:
# NOTE: df_martha is the same object as df_all (no copy is made), so the
# columns added below also show up on df_all.
df_martha = df_all
martha_words = ['martha', 'fiona']


def _names_martha(comment):
    """Return True when any entry of martha_words occurs in the lower-cased comment."""
    lowered = comment.lower()
    for name in martha_words:
        if name in lowered:
            return True
    return False


# 1 = the comment contains one of the names (case-insensitive substring), 0 = not.
df_martha['martha_explicit'] = df_martha['Comment'].apply(_names_martha).astype(int)
df_martha.head()

In [None]:
# Let's check how often she is mentioned explicitly
# (counts of 0 = not named and 1 = named in the martha_explicit column).
df_martha['martha_explicit'].value_counts()

She is mentioned explicitly 133 times; let's see how often she is mentioned indirectly.

In [None]:
# Words that may refer to Martha without naming her.
indirect_words = ['her', 'she', 'stalker']
# Whole-word alternation built from the list above.
pattern = r'\b(?:' + '|'.join(indirect_words) + r')\b'


def _has_indirect_female(comment):
    """Return True when the lower-cased comment contains one of indirect_words as a whole word."""
    return re.search(pattern, comment.lower()) is not None


# 1 = at least one indirect word is present, 0 = none.
df_martha['indirect_female'] = df_martha['Comment'].apply(_has_indirect_female).astype(int)

In [None]:
# Row masks for the two counts printed below.
is_indirect = df_martha['indirect_female'] == 1
not_explicit = df_martha['martha_explicit'] == 0

# Indirect female mentions in comments that do not name Martha/Fiona.
print(len(df_martha[is_indirect & not_explicit]))
# All comments with an indirect female mention.
print(len(df_martha[is_indirect]))

We have 235 indirect female mentions; in 171 of these Martha is not also mentioned directly, so we need to figure out whether those comments are talking about Martha or not. For this we will have GPT assist us with the classification.


In [None]:
# Prompt templates used by prompt_creator. The prompt text is sent to the
# model verbatim, so its wording is part of the method.
# Part 1: task description; the '{}' slot is filled with the comment via
# str.format.
prompt_1 = """
Your task is to evaluate whether a reddit comment is about the main character, Martha, or not. Martha is an overweight stalker of Donny. Martha also claims to be a lawyer. 
Another character who this comment might be refering to is Teri, who is transexual, and a therapist by profession. Your job is to figure out whether the comment referes to Martha, or whether it refers to someone else (such as Teri). 
The comment in question is: ###{}####. """

# Part 2 (added for replies only): the '{}' slot is filled with the parent comment.
prompt_2 = """This comment is a reply to the following comment: ###{}###.
""" 

# Part 3: answer format (1 = about Martha, 2 = not about Martha, 3 = unsure).
prompt_3 = """Respond with 1 if this comment is about martha, respond 2 if it is not, respond 3 if you are unsure. Only respond with the number. """

In [None]:
def prompt_creator(row, comments_df=None):
    """Build the classification prompt for one comment.

    The prompt is assembled from the module-level templates ``prompt_1``,
    ``prompt_2`` and ``prompt_3``. They are looked up at call time, so the
    set of templates defined most recently is the one that is used.

    Args:
        row: A dataframe row providing 'Comment', 'Comment_level' and
            'parent_id'.
        comments_df: Dataframe with 'id' and 'Comment' columns in which the
            parent comment is looked up. Defaults to the global ``df_all``,
            which keeps both columns throughout the notebook (``df_martha``
            is later reduced to ['id', 'GPT_result'] and cannot serve as
            the lookup table after that point).

    Returns:
        str: prompt_1 + prompt_3 for a top-level comment, or for a reply
        whose parent cannot be found; prompt_1 + prompt_2 + prompt_3 (with
        the parent comment filled in) for every other reply.
    """
    comment = row['Comment']
    without_context = prompt_1.format(comment) + prompt_3

    # The level is a string straight after parsing and a number after a CSV
    # round trip; int() makes the top-level check work in both cases.
    if int(row['Comment_level']) == 0:
        return without_context

    if comments_df is None:
        comments_df = df_all
    parent_matches = comments_df.loc[comments_df['id'] == row['parent_id'], 'Comment']
    if parent_matches.empty:
        # No row carries this parent_id (e.g. it is missing): fall back to
        # the prompt without parent context instead of raising IndexError.
        return without_context

    parent_comment = parent_matches.iloc[0]
    return prompt_1.format(comment) + prompt_2.format(parent_comment) + prompt_3

In [None]:
#load api key and initialize client
# The key is read from the environment variable API_KEY_ME after
# load_dotenv() has run; os.getenv returns None when it is not set.
load_dotenv()
api_key = os.getenv("API_KEY_ME")

# Module-level client; run_GPT uses this global.
client = OpenAI(api_key = api_key)

In [None]:
def run_GPT(prompt, model = 'gpt-4o', temperature = 0):
    """
    Send one user message to a chat model and return the reply text.

    Uses the module-level ``client``.

    Args:
        prompt (str): Text sent as the single user message.
        model (str, optional): Model name. Defaults to 'gpt-4o'.
        temperature (float, optional): Sampling temperature. Defaults to 0.

    Returns:
        str: The message content of the first choice in the response.
    """
    # other models are gpt-4-turbo, gpt-4, gpt-3.5-turbo-16k
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
        temperature=temperature,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
# Start with an empty label column; only ambiguous comments receive a value.
df_martha['GPT_result'] = None

# Ambiguous = indirect female reference present, but no explicit name.
needs_gpt = (df_martha['martha_explicit'] == 0) & (df_martha['indirect_female'] == 1)

for index, row in df_martha[needs_gpt].iterrows():
    reply = run_GPT(prompt_creator(row))
    # Keep only the digit characters of the reply; this also removes any
    # spaces, newlines or letters around the number.
    digits_only = ''.join(ch for ch in reply if ch.isdigit())
    df_martha.at[index, 'GPT_result'] = digits_only

In [None]:
# Check the rows where GPT_result is not None, i.e. the comments that were
# sent to GPT in the loop above.
df_martha[df_martha['GPT_result'].notnull()].head()

In [None]:
#save csv
# Checkpoint: the evaluation cells below reload this file instead of
# re-running the GPT classification.
df_martha.to_csv('data/bbreindeer_clean_with_GPT_for_Martha.csv', index=False)

Next, I will have to 
* a) evaluate how useful the responses are
* b) code the unsure manually
* c) use these new variables together with the old ones to create the variable indicating whether martha is mentioned or not. 

##### Evaluation of GPT
Here, I will code a random subsample of the data to see how well GPT is doing. I will first create a random subsample of the data, then create a function that lets me code it, and finally evaluate the results (only for the Martha / not Martha cases). Coding scheme: 1 = Martha, 2 = not Martha, 3 = unsure.

In [None]:
df_martha = pd.read_csv('data/bbreindeer_clean_with_GPT_for_Martha.csv')
random.seed(123)
# Keep only comments GPT classified with a decision
# (1 = Martha, 2 = not Martha); 3 = unsure is coded manually further down.
df_sub = df_martha[df_martha['GPT_result'].notnull()]
df_sub = df_sub[df_sub['GPT_result'] < 3]
# DataFrame.sample does not use Python's `random` module, so random.seed()
# alone does not make the draw repeatable; the seed is passed as
# random_state. The size is capped so that fewer than 50 eligible rows does
# not raise.
df_sub = df_sub.sample(min(50, len(df_sub)), random_state=123)

In [None]:
# Human validation pass: each sampled comment is shown with the prompt built
# by prompt_creator and the coder's typed answer is stored.
df_sub['Human_result'] = None
i = 0
n_to_code = len(df_sub)
for index, row in df_sub.iterrows():
    # input() displays the prompt and returns what the coder types
    df_sub.at[index, 'Human_result'] = input(prompt_creator(row))
    i = i + 1
    # progress as a fraction of the sample
    print(i / n_to_code)

In [None]:
#save to csv
# index=False is not passed here, so the row index is written as the first
# column of the file.
df_sub.to_csv('data/martha_presence_validation.csv')

Let's manually evaluate the instances GPT was unsure about.

In [None]:
#code the unsure comments
# .copy() makes df_unsure an independent frame, so the assignments below do
# not act on a slice of df_martha.
df_unsure = df_martha[df_martha['GPT_result'] == 3].copy()
df_unsure['human_lable'] = None
valid_codes = {'1', '2', '3'}
i = 0

for index, row in df_unsure.iterrows():
    prompt = prompt_creator(row)
    #get user coding
    coding = input(prompt).strip()
    # Ask again on anything other than 1/2/3 so that a typo cannot end up in
    # the label column (the labels are cast to int when the master data is
    # assembled).
    while coding not in valid_codes:
        coding = input('Please answer with 1, 2 or 3: ').strip()
    # Stored as a number to match the numeric GPT_result column read from csv.
    df_unsure.at[index, 'GPT_result'] = int(coding)
    i += 1
    print(i / len(df_unsure))

In [None]:
# Merge the manual codes back into the label table.
# From here on df_martha only holds the two label columns.
label_columns = ['id', 'GPT_result']
df_unsure = df_unsure[label_columns]

# Drop the rows that were re-coded by hand, then append their updated version.
not_recoded = ~df_martha['id'].isin(df_unsure['id'])
df_martha = df_martha.loc[not_recoded, label_columns]
df_martha = pd.concat([df_martha, df_unsure])

In [None]:
# Save the labels; this file is read again in section 2.4 when the master
# data is assembled.
df_martha.to_csv('data/martha_labels.csv', index=False)

### 2.2 Is Donny/Richard Mentioned?
Same as above but for Donny


In [None]:
donny_words = ['donny', 'richard']
# Start from the cleaned master file rather than from df_martha: by this
# point df_martha has been reduced to ['id', 'GPT_result'] and no longer has
# the 'Comment' column that is needed below.
df_donny = pd.read_csv('data/bbreindeer_clean.csv')
# 1 = the comment contains one of the names (case-insensitive substring), 0 = not.
df_donny['donny_explicit'] = df_donny['Comment'].apply(lambda x: any(word in x.lower() for word in donny_words))
df_donny['donny_explicit'] = df_donny['donny_explicit'].astype(int)
df_donny.head()

In [None]:
# Words that may refer to Donny without naming him.
indirect_words = ['he', 'his', 'him', 'dude']
# Whole-word alternation built from the list above.
pattern = r'\b(?:' + '|'.join(indirect_words) + r')\b'


def _has_indirect_male(comment):
    """Return True when the lower-cased comment contains one of indirect_words as a whole word."""
    return re.search(pattern, comment.lower()) is not None


# 1 = at least one indirect word is present, 0 = none.
df_donny['indirect_male'] = df_donny['Comment'].apply(_has_indirect_male).astype(int)

In [None]:
# Frequency tables: explicit mentions first, then implicit mentions.
for indicator in ('donny_explicit', 'indirect_male'):
    print(df_donny[indicator].value_counts())

# Implicit mentions in comments that do not name Donny/Richard; these are
# the comments that go to GPT.
implicit_only = (df_donny['indirect_male'] == 1) & (df_donny['donny_explicit'] == 0)
print(len(df_donny[implicit_only]))

He is mentioned explicitly 140 times, there are 426 indirect mentions, and in 333 of those Donny is not mentioned explicitly. Those are the ones we need to classify with GPT.

In [None]:
# Donny versions of the prompt templates. They rebind the same global names
# that prompt_creator reads, so from here on prompt_creator builds
# Donny prompts.
# Part 1: task description; the '{}' slot is filled with the comment.
prompt_1 = """
Your task is to evaluate whether a reddit comment is about the main character, Donny, or not. Donny is a bartender and standup comedian who is being stalked by a women named Martha, and who for a while dates a transexual called Teri. The comment in question is: ###{}####. """

# Part 2 (added for replies only): the '{}' slot is filled with the parent comment.
prompt_2 = """This comment is a reply to the following comment: ###{}###.
""" 

# Part 3: answer format (1 = about Donny, 2 = not about Donny, 3 = unsure).
prompt_3 = """Respond with 1 if this comment is about Donny, respond 2 if it is not, respond 3 if you are unsure. Only respond with the number. """

In [None]:
# Start with an empty label column; only ambiguous comments receive a value.
df_donny['GPT_result'] = None

# Ambiguous = indirect male reference present, but no explicit name.
needs_gpt = (df_donny['donny_explicit'] == 0) & (df_donny['indirect_male'] == 1)

for index, row in df_donny[needs_gpt].iterrows():
    reply = run_GPT(prompt_creator(row))
    # Keep only the digit characters of the reply; this also removes any
    # spaces, newlines or letters around the number.
    digits_only = ''.join(ch for ch in reply if ch.isdigit())
    df_donny.at[index, 'GPT_result'] = digits_only

In [None]:
# Checkpoint: the evaluation cells below reload this file instead of
# re-running the GPT classification.
df_donny.to_csv('data/bbreindeer_clean_with_GPT_for_Donny.csv', index=False)

##### Evaluation of GPT

In [None]:
df_donny = pd.read_csv("data/bbreindeer_clean_with_GPT_for_Donny.csv")
random.seed(123)
# Keep only comments GPT classified with a decision
# (1 = Donny, 2 = not Donny); 3 = unsure is coded manually further down.
df_sub = df_donny[df_donny['GPT_result'].notnull()]
df_sub = df_sub[df_sub['GPT_result'] < 3]
# DataFrame.sample does not use Python's `random` module, so random.seed()
# alone does not make the draw repeatable; the seed is passed as
# random_state. The size is capped so that fewer than 50 eligible rows does
# not raise.
df_sub = df_sub.sample(min(50, len(df_sub)), random_state=123)

In [None]:
# Human validation pass: each sampled comment is shown with the prompt built
# by prompt_creator and the coder's typed answer is stored.
df_sub['Human_result'] = None
i = 0
n_to_code = len(df_sub)
for index, row in df_sub.iterrows():
    # input() displays the prompt and returns what the coder types
    df_sub.at[index, 'Human_result'] = input(prompt_creator(row))
    i = i + 1
    # progress as a fraction of the sample
    print(i / n_to_code)

In [None]:
# Save the human-coded validation sample. index=False is not passed here,
# so the row index is written as the first column of the file.
df_sub.to_csv('data/Donny_presence_validation.csv')

In [None]:
#code the unsure comments
# .copy() makes df_unsure an independent frame, so the assignments below do
# not act on a slice of df_donny.
df_unsure = df_donny[df_donny['GPT_result'] == 3].copy()
df_unsure['human_lable'] = None
valid_codes = {'1', '2', '3'}
i = 0

for index, row in df_unsure.iterrows():
    prompt = prompt_creator(row)
    #get user coding
    coding = input(prompt).strip()
    # Ask again on anything other than 1/2/3 so that a typo cannot end up in
    # the label column (the labels are cast to int when the master data is
    # assembled).
    while coding not in valid_codes:
        coding = input('Please answer with 1, 2 or 3: ').strip()
    # Stored as a number to match the numeric GPT_result column read from csv.
    df_unsure.at[index, 'GPT_result'] = int(coding)
    i += 1
    print(i / len(df_unsure))

In [None]:
# Merge the manual codes back into the label table.
# From here on df_donny only holds the two label columns.
label_columns = ['id', 'GPT_result']
df_unsure = df_unsure[label_columns]

# Drop the rows of df_donny that were re-coded by hand, then append their
# updated version.
not_recoded = ~df_donny['id'].isin(df_unsure['id'])
df_donny = df_donny.loc[not_recoded, label_columns]
df_donny = pd.concat([df_donny, df_unsure])

In [None]:
# Save the Donny labels; this file is read again in section 2.4 when the
# master data is assembled.
df_donny.to_csv('data/donny_labels.csv', index=False)

### 2.3 Add Vader Sentiment
Let's get some general sentiment scores for all the text.

In [None]:
# Sentiment is computed on the cleaned master file written in section 1.
df_vader = pd.read_csv("data/bbreindeer_clean.csv")
#create object for sentiment analysis
vader_sentiment = SentimentIntensityAnalyzer()

In [None]:
#create new columns for sentiment analysis

# Score every comment once and reuse the result for all four columns,
# instead of running the analyzer four times per comment.
# An empty comment is read back from csv as NaN; it is scored as an empty
# string so the analyzer always receives text.
vader_scores = [
    vader_sentiment.polarity_scores(comment)
    for comment in df_vader['Comment'].fillna('')
]
for score_name in ('pos', 'neg', 'neu', 'compound'):
    df_vader['Vader_' + score_name] = [scores[score_name] for scores in vader_scores]

df_vader.head()

In [None]:
#save to csv
# Read again in section 2.4, where the Vader columns are merged into the
# master data by id.
df_vader.to_csv('data/bbreindeer_vader.csv', index=False)

### 2.4 Combine Data to Master Data

In [None]:
#add indicator variables and other ones for the master df
df_all = pd.read_csv('data/bbreindeer_clean.csv')
#df_martha
df_martha = pd.read_csv('data/martha_labels.csv')
#df_donny
df_donny = pd.read_csv('data/donny_labels.csv')


In [None]:
# Turn the Martha labels into a two-column lookup (id -> martha): rename the
# label column, keep only comments that have a label, store labels as int.
df_martha = df_martha.rename(columns={'GPT_result': 'martha'})
has_label = df_martha['martha'].notnull()
df_martha = df_martha.loc[has_label, ['id', 'martha']].astype({'martha': int})

In [None]:
# Turn the Donny labels into a two-column lookup (id -> donny): rename the
# label column, keep only comments that have a label, store labels as int.
df_donny = df_donny.rename(columns={'GPT_result': 'donny'})
has_label = df_donny['donny'].notnull()
df_donny = df_donny.loc[has_label, ['id', 'donny']].astype({'donny': int})

In [None]:
df_all = df_all.merge(df_martha, on = 'id', how = 'left')
df_all = df_all.merge(df_donny, on = 'id', how = 'left')

# Comments without a label are NaN after the left merge; code them as
# 2 (= not mentioned). The result is assigned back to the column: calling
# fillna(inplace=True) on a selected column updates a temporary object and
# leaves df_all unchanged when pandas Copy-on-Write is active.
df_all['martha'] = df_all['martha'].fillna(2)
df_all['donny'] = df_all['donny'].fillna(2)

# Comments that name a character explicitly were never sent to GPT, so they
# carry no label and would be coded "not mentioned" by the fill above.
# Section 2.1 defines the indicator as 1 for explicit OR implicit mentions,
# so explicit mentions are set to 1 here, using the same case-insensitive
# substring rule as the *_explicit columns.
comment_text = df_all['Comment'].fillna('').str.lower()
df_all.loc[comment_text.str.contains('martha|fiona', regex=True), 'martha'] = 1
df_all.loc[comment_text.str.contains('donny|richard', regex=True), 'donny'] = 1

In [None]:
# Attach the four Vader scores to the master data, matched on id.
vader_columns = ['id', 'Vader_pos', 'Vader_neg', 'Vader_neu', 'Vader_compound']
df_vader = pd.read_csv('data/bbreindeer_vader.csv').loc[:, vader_columns]
df_all = pd.merge(df_all, df_vader, how='left', on='id')

In [None]:
# Recode "not mentioned" from 2 to 0 in both indicator columns; all other
# values are left as they are.
for person in ('martha', 'donny'):
    df_all[person] = df_all[person].replace({2: 0})

In [None]:
#save to csv again
# Master file: cleaned comments plus the martha/donny indicators and the
# Vader scores merged in above.
df_all.to_csv('data/MasterData.csv', index = False)

### 3 Create some sub dataframes
Here we just create some dataframes that only have specific people mentioned. 

In [None]:
# Subsets by who is mentioned (1 = mentioned, 0 = not mentioned).
df_martha_only = df_all[(df_all['martha'] == 1) & (df_all['donny'] == 0)]
df_donny_only = df_all[(df_all['martha'] == 0) & (df_all['donny'] == 1)]
df_donny_and_martha = df_all[(df_all['martha'] == 1) & (df_all['donny'] == 1)]
#random subsample to code
random.seed(123)
# DataFrame.sample does not use Python's `random` module, so random.seed()
# alone does not make the draw repeatable; the seed is passed as
# random_state. The size is capped so that fewer than 100 rows does not raise.
df_to_code = df_martha_only.sample(min(100, len(df_martha_only)), random_state=123)

In [None]:
# Write the three mention subsets and the random subsample for manual coding.
df_martha_only.to_csv('data/martha_only_all.csv', index = False)
df_donny_only.to_csv('data/donny_only_all.csv', index = False)
df_donny_and_martha.to_csv('data/donny_and_martha_all.csv', index = False)
df_to_code.to_csv('data/to_code_sim_aug_11.csv', index = False)

### 4. Second Data Enrichment Stage
Now we look for the presence of specific themes in the comments about specific people. Specifically: is the comment empathetic towards the person, is Donny seen as responsible, and is Martha seen as a typical stalker?

In [None]:
# Theme prompt for comments about Donny, part 1: instructions and theme
# definitions. In section 4.1 the comment text is appended directly after
# this string. The prompt text is sent to the model verbatim.
prompt_d_1 = """
###Instructions###
You are an assistant for a content analysis project, and your job is to classify whether specific themes are present in a given reddit comment. The comments are about the main character Donny in the TV show Baby Reindeer, and the themes you should code for are ‘Empathy’, and ‘Responsibility . It is possible that none, one, or both of these themes are present in every given topics. Here a description of the themes.

###Themes 
Empathy: Assess whether the comment reflects an understanding or supportive acknowledgment of Donny’s feelings and experiences. Empathy is indicated by expressions that recognize Donny's perspective or emotional state in a way that shows sympathy or compassion. Code this theme as 1 if empathy is present, 0 if it is not present, and as 99 if it is unclear. 

Responsibility: Assess whether the comment holds Donny responsible for being stalked. This can be done by illustrating how his decisions facilitate the stalking, how he is leading Martha on, or other types of victim blaming. Code this theme as 1 if the comment talks about Donny being stalked and responsible, code it as 0 if, while acknowledging he is getting stalked, the comment doesn’t blame Donny, and as 99 if the comment does not talk about Donny being stalked at all. 

For each of those topics, assess whether they are present in the comment below: 
###Comment###
"""

# Part 2: output-format instructions, appended after the comment text.
# NOTE(review): the example below has unquoted keys and a trailing comma, so
# it is not valid JSON; a reply that copies it literally will not parse with
# json.loads.
prompt_d_2 = """

###Output instructions###
Return your response in a json with the following format: 
{
Empathy: [response number], 
Responsibility: [response number],
}
"""

# Preview of the assembled prompt with a placeholder comment.
print(prompt_d_1 + "test123" + prompt_d_2)

# Theme prompt for comments about Martha, part 1: instructions and theme
# definitions. In section 4.2 the comment text is appended directly after
# this string. The prompt text is sent to the model verbatim.
prompt_m_1 = """
 ###Instructions###
You are an assistant for a content analysis project, and your job is to classify whether specific themes are present in a given reddit comment. The comments are about the main character Martha in the TV show Baby Reindeer, and the themes you should code for are ‘Empathy’, and ‘Traditional_Stalker. It is possible that none, one, or both of these themes are present in every given topic. Here a description of the themes.

###Themes
Empathy: Assess whether the comment reflects an understanding or supportive acknowledgment of Martha’s feelings and experiences. Empathy is indicated by expressions that recognize Martha’s perspective or emotional state in a way that shows sympathy or compassion. Code this theme as 1 if empathy is present, 0 if it is not present, and as 99 if it is unclear.

Traditional_Stalker: Assess to what extent Martha is being characterised as a stalker in the traditional sense. This includes behaviours such as unwanted contact, including phone calls, texts, and contact via social media, unwanted gifts, showing up/approaching an individual or their family/friends, monitoring, surveillance, property damage, and threats. Additionally, it is often assumed that traditional stalkers have underlying mental health issues.
Code a this theme as 1 if Martha is characterised as a traditional stalker in a comment, 0 if the comment talks about her being a stalker but not in the traditional sense, and 99 if the comment is not about her stalking behaviour at all. 



For each of those topics, assess whether they are active in the comment below: 
###Comment###


"""

# Part 2: output-format instructions, appended after the comment text.
# NOTE(review): the keys in the example below are unquoted, so it is not
# valid JSON; a reply that copies it literally will not parse with json.loads.
prompt_m_2 = """
###Output instructions###
Return your response in a json with the following format: 
{
Empathy: [response number], 
Traditional_Stalker: [response number]
}

"""

In [None]:
# Same client setup as in section 2.1, repeated here.
# The key is read from the environment variable API_KEY_ME after
# load_dotenv() has run; os.getenv returns None when it is not set.
load_dotenv()
api_key = os.getenv("API_KEY_ME")

# Module-level client; run_GPT uses this global.
client = OpenAI(api_key = api_key)

In [None]:
def run_GPT(prompt, model = 'gpt-4o', temperature = 0):
    """
    Send one user message to a chat model and return the reply text.

    Same definition as the run_GPT in section 2.1, repeated here. Uses the
    module-level ``client``.

    Args:
        prompt (str): Text sent as the single user message.
        model (str, optional): Model name. Defaults to 'gpt-4o'.
        temperature (float, optional): Sampling temperature. Defaults to 0.

    Returns:
        str: The message content of the first choice in the response.
    """
    # other models are gpt-4-turbo, gpt-4, gpt-3.5-turbo-16k
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=temperature,
    )
    reply = completion.choices[0].message
    return reply.content

#### 4.1 Donny
Classify whether Donny is being seen as a legitimate victim, and whether he is shown empathy. 

In [None]:
# `df` is not defined anywhere earlier in this notebook, so the master data
# written at the end of section 2.4 is loaded under that name.
df = pd.read_csv('data/MasterData.csv')
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of df.
df_donny = df[df['donny'] == 1].copy()
df_donny['GPT_result'] = None
df_donny['Empathy'] = None
df_donny['Responsibility'] = None
# Row labels whose GPT reply could not be parsed directly in the loop below.
manual_cleaning = []

In [None]:
i = 0
for index, row in df_donny.iterrows():
    comment = row['Comment']
    prompt = prompt_d_1 + comment + prompt_d_2
    #print(prompt)
    result = run_GPT(prompt)
    # Always keep the raw reply so it can be re-parsed later.
    df_donny.at[index, 'GPT_result'] = result
    #try cleaning it already
    try:
        parsed = json.loads(result)
        empathy = parsed['Empathy']
        responsibility = parsed['Responsibility']
    except (json.JSONDecodeError, KeyError, TypeError):
        # Only parsing problems are caught; a bare `except:` would also
        # swallow KeyboardInterrupt and make this loop impossible to abort.
        manual_cleaning.append(index)
        print('Cannot clean')
        print(result)
    else:
        # Write both codes together, so a reply with only one of the keys
        # does not leave a half-filled row.
        df_donny.at[index, 'Empathy'] = empathy
        df_donny.at[index, 'Responsibility'] = responsibility

    i += 1
    print(i/len(df_donny))
    

In [None]:
#clean the output
for index,row in df_donny.iterrows():
    to_clean = row['GPT_result']
    #split by { and take second, then split by } and take first
    to_clean = to_clean.split('{')[1].split('}')[0]
    #turn to json
    to_clean = json.loads('{' + to_clean + '}')
    #print(to_clean)
    #add to df
    df_donny.at[index, 'Empathy'] = to_clean['Empathy']
    df_donny.at[index, 'Responsibility'] = to_clean['Responsibility']

In [None]:
#save df to csv
# Contains the raw GPT reply plus the parsed Empathy and Responsibility codes.
df_donny.to_csv('data/donny_empathy_labels.csv', index=False)

#### 4.2 Martha
Classify whether Martha is being seen as an understandable stalker, and whether she is shown empathy.

In [None]:
# `df` is not defined anywhere earlier in this notebook, so the master data
# written at the end of section 2.4 is loaded under that name.
df = pd.read_csv('data/MasterData.csv')
# .copy() gives an independent frame, so the column assignments below do not
# act on a slice of df.
df_martha = df[df['martha'] == 1].copy()
df_martha['GPT_result'] = None
df_martha['Empathy'] = None
df_martha['Traditional_Stalker'] = None

In [None]:
# Send every Martha comment to GPT and keep the raw reply; parsing happens
# in the cleaning cell that follows.
i = 0
n_martha = len(df_martha)
for index, row in df_martha.iterrows():
    reply = run_GPT(prompt_m_1 + row['Comment'] + prompt_m_2)
    df_martha.at[index, 'GPT_result'] = reply

    i = i + 1
    # progress as a fraction of all Martha comments
    print(i / n_martha)

In [None]:
#clean output
fix_manually = []

for index,row in df_martha.iterrows():
    to_clean = row['GPT_result']
    #split by { and take second, then split by } and take first
    to_clean = to_clean.split('{')[1].split('}')[0]
    #turn to json
    #print(to_clean)
    try: 
        to_clean = json.loads('{' + to_clean + '}')
        #print(to_clean)
        #add to df
        
        df_martha.at[index, 'Empathy'] = to_clean['Empathy']
        df_martha.at[index, 'Traditional_Stalker'] = to_clean['Traditional_Stalker']
    except: 
        fix_manually.append((index, to_clean))
        print('Cannot clean')
        print(to_clean)

In [None]:
##Fix a problem case manually
#for row 312 put empathy to 0 and traditional stalker to 99
# NOTE(review): 312 is a hard-coded index label of df_martha, presumably the
# row reported by the cleaning cell above for one specific run; confirm it
# still applies whenever the GPT replies are regenerated.
df_martha.at[312, 'Empathy'] = 0
df_martha.at[312, 'Traditional_Stalker'] = 99

In [None]:
# Contains the raw GPT reply plus the Empathy and Traditional_Stalker codes.
df_martha.to_csv('data/martha_empathy_labels.csv', index=False)
