# 1. Installing required packages

In [1]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [2]:
!pip install transformers requests beautifulsoup4 pandas numpy

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.4 MB/s[0m eta [36m0:00:0

# 2. Importing required packages

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from transformers import pipeline
import warnings
warnings.filterwarnings("ignore")

# 3. Data scraping

Scrape the link to the transcript of every episode of the show.

In [4]:
# Forum index pages that, between them, list the transcript topics of the show.
pages_arr = [
  "https://transcripts.foreverdreaming.org/viewforum.php?f=845",
  "https://transcripts.foreverdreaming.org/viewforum.php?f=845&start=78",
  "https://transcripts.foreverdreaming.org/viewforum.php?f=845&start=156",
]

# Topic links are the <a> tags whose class attribute contains "topictitle".
topic_class = re.compile('.*topictitle.*')

# One [season, episode, title, url] record per topic link. The DataFrame is
# built once from this list instead of being grown row by row.
episode_records = []

# iterating over all pages
for page in pages_arr:
  # Fail fast on a hung connection or an HTTP error page instead of silently
  # parsing an error document that contains no topic links.
  r = requests.get(page, timeout=30)
  r.raise_for_status()
  soup = BeautifulSoup(r.text,'html.parser')
  # The first matching link of every page is skipped
  # (presumably a non-episode topic such as an announcement -- TODO confirm).
  results = soup.find_all('a',{'class':topic_class})[1:]

  for topic in results:
    # extracting a link to a single episode and constructing valid url
    templink = re.findall(r"viewtopic.php\?t=\d+", topic.attrs["href"])
    link = "https://transcripts.foreverdreaming.org/" + templink[0]
    # The parsing below expects titles shaped like "<season>x<episode>-<name>".
    # Split on the FIRST hyphen only, so that a name which itself contains a
    # hyphen is kept whole instead of being cut at its first hyphen.
    number, name = re.split(r"-", topic.text, maxsplit=1)
    season, episode = re.split(r"x", number)
    episode_records.append([season, episode, name, link])

episode_nav = pd.DataFrame(episode_records, columns=["Season","Episode","Title","URL"])

# Drop the first two collected rows (presumably non-episode entries -- TODO
# confirm), reverse the listing order, and renumber the index from 0.
episode_nav = episode_nav.loc[2:].iloc[::-1].reset_index(drop=True)

# 4. Attaching sentiment to each episode

In [5]:
# Pretrained BERT emotion model wrapped in a text-classification pipeline.
# return_all_scores=True asks for the score of every emotion label, not only the top one.
classifier = pipeline(
  task="text-classification",
  model='bhadresh-savani/bert-base-uncased-emotion',
  return_all_scores=True,
)

# function to get primary emotion conveyed in text from pretrained BERT classifier model.
def emotion_detector(line):
  """Return the label of the highest-scoring emotion predicted for `line`.

  Args:
    line: text passed to the module-level `classifier`.

  Returns:
    The "label" value of the entry with the largest "score" in the first
    element of the classifier's result. On a tie the earliest entry wins.

  Raises:
    ValueError: if the classifier returns no scored labels for the input.
  """
  # truncation=True is forwarded to the tokenizer so that a line longer than
  # the model's maximum input length is cut rather than failing in the model.
  prediction = classifier(line, truncation=True)
  # Built-in max() replaces the manual scan, which shadowed `max` and left the
  # result unbound when no score was greater than 0.
  strongest = max(prediction[0], key=lambda candidate: candidate["score"])
  return strongest["label"]

# Emotion labels tracked per episode, in the column order of `episode_emo`.
EMOTION_LABELS = ['joy', 'anger', 'fear', 'sadness', 'love', 'surprise']

# dataframe that stores emotion percentage (one row per episode, same order as episode_nav)
episode_emo = pd.DataFrame(columns=EMOTION_LABELS)

# Transcript containers are the <div> tags whose class attribute contains "content".
content_class = re.compile('.*content.*')

# iterate over all episode links
for link in episode_nav["URL"]:

  # STEP 1: collect script data into dataframe format
  # scraping script for single episode; fail fast on a hung connection or an
  # HTTP error page instead of parsing an error document.
  r = requests.get(link, timeout=30)
  r.raise_for_status()
  soup = BeautifulSoup(r.text,'html.parser')
  results = soup.find_all('div',{'class':content_class})
  # The second matching div is used as the transcript
  # (presumably the first match is page chrome -- TODO confirm).
  text = results[1].text
  # get dialogues spoken by selected characters: every line that starts with
  # one of the six names, in title case or upper case
  script = re.findall(r'(\nJoey.*|\nMonica.*|\nChandler.*|\nPhoebe.*|\nRoss.*|\nRachel.*|\nJOEY.*|\nMONICA.*|\nCHANDLER.*|\nPHOEBE.*|\nROSS.*|\nRACHEL.*)',text)
  # reformat lines into speaker and dialogue spoken; lines without a ":" or ";"
  # separator are skipped
  script_rows = []
  for line in script:
    line_temp = re.split(r":|;",line,maxsplit=1)
    line_temp[0] = re.sub("\n",'',line_temp[0])
    if len(line_temp) == 2:
      script_rows.append(line_temp)
  # Build the per-episode frame once instead of growing it row by row.
  refined_script = pd.DataFrame(script_rows, columns=['character','dialogue'])

  # STEP 2: Apply sentiment analysis model on each dialogue spoken
  refined_script['sentiment'] = refined_script['dialogue'].apply(emotion_detector)

  # Percentage (rounded to a whole number) of dialogue lines per emotion.
  # reindex(..., fill_value=0) gives every emotion a value for THIS episode:
  # an emotion that never occurs is recorded as 0 instead of silently reusing
  # the previous episode's value (or raising NameError on the first episode).
  emotion_pct = (
    refined_script["sentiment"]
    .value_counts(normalize=True)
    .mul(100)
    .round()
    .reindex(EMOTION_LABELS, fill_value=0)
  )

  # append values for each sentiment; appending per episode keeps partial
  # results available if this long-running loop is interrupted
  episode_emo.loc[len(episode_emo.index)] = emotion_pct.tolist()

Downloading (…)lve/main/config.json:   0%|          | 0.00/935 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


KeyboardInterrupt: ignored

Combining the episode data with the emotion data.

In [None]:
# Place the emotion percentages next to the episode metadata, column-wise;
# pd.concat aligns the two frames on their index labels.
frames_to_join = [episode_nav, episode_emo]
friends_emotion_data = pd.concat(frames_to_join, axis="columns")

# Write the combined table to an Excel workbook in the working directory.
friends_emotion_data.to_excel("final_friends_data.xlsx")