# Imports

In [1]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import re
from typing import List
import string
import spacy
import ast
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize


In [None]:
# Load the parsed trope table; later cells read its trope_link, trope_description and trope_name columns.
df = pd.read_csv('../data_raw/parsed_tropes.csv')

In [None]:
# Print column dtypes and non-null counts as a quick sanity check of the loaded data.
df.info()

# Data Cleaning

In [None]:
# List-literal residue that commonly opens a scraped description.
_STANDARD_BEGINNING_TEXT = ", 'Go To', '&#010;&#010"

# First "Go To" header in either form. The full scraped header is listed first
# and starts earlier in the text, so it wins over the bare "Go To" it contains.
_GO_TO_HEADER = re.compile(re.escape(_STANDARD_BEGINNING_TEXT) + r"|Go To")

_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.[a-zA-Z0-9-]+(\.[a-zA-Z]{2,})+'
)

# HTML tags accidentally scraped: paired tags with their content, or lone tags.
_TAG_PATTERN = re.compile(r'<[^>]+>.*?</[^>]+>|<[^>]+/?>')

# Wiki boilerplate removed (first occurrence only) BEFORE URLs/tags are stripped.
_NOTICES_BEFORE_MARKUP = (
    "This page has been alphabetized. Please add new examples in the correct order.",
    "Thanks!",
    "Image removed per Image Pickin' thread",
    "Image selected per Image Pickin' thread",
    "Image replaced per Image Pickin' thread",
    "8212;index&#8212;&#8212;/index&#8212;",
)

# Wiki boilerplate removed (first occurrence only) AFTER URLs/tags are stripped.
_NOTICES_AFTER_MARKUP = (
    "Image selected via crowner in the Image Suggestion thread",
    "do not change or remove without starting a new thread",
    "This is based on opinion. Please don't list it on a work's trope example list",
    "This list of examples has been alphabetized. Please add your example in the proper place",
)


def _remove_first(text, phrases):
    """Delete the first literal occurrence of each phrase from *text*, in order."""
    for phrase in phrases:
        text = text.replace(phrase, '', 1)
    return text


def remove_unneeded_phrases(trope_link, description):
    """Strip scraping residue and wiki boilerplate from a trope description.

    Args:
        trope_link: Link of the page; every occurrence of it is removed from
            the description because it was commonly present in the text.
        description: Raw scraped description text.

    Returns:
        The description with the page link, the "Go To" header, known
        boilerplate notices, URLs and HTML tags removed.

    Notes:
        * The header is matched in its full scraped form before the bare
          "Go To" fallback. Removing the bare "Go To" first would leave the
          rest of the header (``, '', '&#010;&#010``) behind.
        * Boilerplate notices are matched literally, so the ``.`` in a notice
          only matches a real period.
    """
    # Link of page was commonly in the text.
    cleaned = description.replace(trope_link, '')

    # Articles often start with the "Go To" header: drop its first occurrence
    # (full or bare form), then any remaining full-form headers.
    cleaned = _GO_TO_HEADER.sub('', cleaned, count=1)
    cleaned = cleaned.replace(_STANDARD_BEGINNING_TEXT, '')

    cleaned = _remove_first(cleaned, _NOTICES_BEFORE_MARKUP)

    # URLs and tags are removed everywhere, not just once.
    cleaned = _URL_PATTERN.sub('', cleaned)
    cleaned = _TAG_PATTERN.sub('', cleaned)

    return _remove_first(cleaned, _NOTICES_AFTER_MARKUP)

In [None]:
# spaCy "en_core_web_sm" pipeline; preprocess_text reads lemmas, dependency labels and is_alpha from it.
nlp = spacy.load("en_core_web_sm")
# NLTK English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Lowercase, lemmatize and stop-word-filter *text*, tracking what is dropped.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. For each token:

    * a negation token (dependency label ``neg``) is replaced by
      ``'not_' + <lemma of its head>``;
    * an alphabetic token whose lemma is not in ``stop_words`` is kept as its
      lemma;
    * an alphabetic token whose lemma is a stop word is recorded as lost;
    * any other (non-alphabetic) token is discarded without being recorded.

    Args:
        text: Raw text to preprocess.

    Returns:
        A 3-tuple ``(tokens, token_loss, token_loss_score)``:
        the kept tokens, the lemmas dropped as stop words, and how many
        lemmas were dropped. Previously the two loss values were computed
        but two never-filled lists were returned in their place.
    """
    doc = nlp(text.lower())
    tokens = []
    token_loss = []

    for token in doc:
        if token.dep_ == 'neg':
            # Attach the negation to the word it modifies, e.g. "not_like".
            tokens.append('not_' + token.head.lemma_)
        elif not token.is_alpha:
            # Punctuation, numbers, etc. are neither kept nor counted as loss.
            continue
        elif token.lemma_.lower() not in stop_words:
            tokens.append(token.lemma_)
        else:
            token_loss.append(token.lemma_)

    return tokens, token_loss, len(token_loss)

In [None]:
# Row-wise cleanup: strip the page's own link and known boilerplate from each description.
df['trope_description_partial_clean'] = df.apply(lambda x: remove_unneeded_phrases(x['trope_link'], x['trope_description']), axis=1)

In [None]:
# preprocess_text returns a 3-tuple per row; .apply(pd.Series) expands it into the three target columns
# (kept tokens plus the two loss-tracking values).
df[['description_trad_clean','d_text_loss','d_loss_scores']] = df['trope_description_partial_clean'].apply(preprocess_text).apply(pd.Series)

In [None]:
# Join each token list back into one space-separated string, the input form used for embedding below.
df['description_clean_flattened'] = df['description_trad_clean'].apply(lambda x: ' '.join(map(str, x)))

# Embedding

In [None]:
# Sentence-transformer model shared by get_embedding and get_embedding_normalized.
model = SentenceTransformer('all-mpnet-base-v2')
def get_embedding(text):
    """Return the embedding of *text* produced by the module-level ``model``."""
    return model.encode(text)
def get_embedding_normalized(text):
    """Return the normalized embedding of *text*.

    The text is encoded with the module-level ``model`` and the resulting
    vector is passed through scikit-learn's ``normalize`` (default settings).

    Args:
        text: Text to embed.

    Returns:
        The normalized embedding vector. Previously the normalized vector was
        computed but the raw, un-normalized embedding was returned.
    """
    embedding = model.encode(text)
    # normalize() works on 2-D input, so wrap the vector in a list and unwrap the single row.
    return normalize([embedding])[0]

In [None]:
def embedding_columns(df: pd.DataFrame, column_name: str, norm: bool, output_path: str) -> None:
    """Embed one text column and write it to CSV with the name columns.

    Args:
        df: Source frame. It must already contain ``trope_name`` and
            ``n_embedding`` columns in addition to ``column_name``.
        column_name: Name of the text column to embed.
        norm: If true, embed with ``get_embedding_normalized``; otherwise
            with ``get_embedding``.
        output_path: Destination path of the CSV file.

    Raises:
        KeyError: If ``trope_name``, ``n_embedding`` or ``column_name`` is
            missing from ``df``.

    The output has the columns ``trope_name``, ``n_embedding`` and
    ``d_embedding``, where each embedding is stored as a plain list.

    Fixes over the previous version: the annotation used the non-existent
    ``pd.Dataframe``, the output frame was the ``pd.DataFrame`` class itself
    rather than an instance, and the non-normalized branch assigned to a
    misspelled variable.
    """
    # Copy so the caller's frame is not modified when d_embedding is added.
    df_embedding = df[['trope_name', 'n_embedding']].copy()
    embed = get_embedding_normalized if norm else get_embedding
    df_embedding['d_embedding'] = df[column_name].apply(lambda x: embed(x).tolist())
    df_embedding.to_csv(output_path)

In [None]:
# Embed the flattened cleaned descriptions and the trope names; .tolist() stores each vector as a plain Python list.
df['d_embedding_trad_clean'] = df['description_clean_flattened'].apply(lambda x: get_embedding(x).tolist())
df['n_embedding'] = df['trope_name'].apply(lambda x: get_embedding(x).tolist())

In [None]:
# Persist the full frame (without the index column).
# NOTE(review): the file name says "normalized", but the embedding columns above are built with
# get_embedding, not get_embedding_normalized — confirm which is intended.
df.to_csv('../results/final_data_normalized.csv', index=False)