# Imports

In [1]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [2]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import re
from typing import List
import string
import spacy
import ast
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
df = pd.read_csv('../data_raw/parsed_tropes.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29903 entries, 0 to 29902
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   trope_name                  29903 non-null  object
 1   trope_link                  29903 non-null  object
 2   trope_description           29903 non-null  object
 3   related_link_names          29903 non-null  object
 4   related_links               29903 non-null  object
 5   example_descriptions        29903 non-null  object
 6   example_related_links       29903 non-null  object
 7   example_related_link_names  29903 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


# Data Cleaning

In [5]:
def remove_unneeded_phrases(trope_link, description):
  #Link of page was commonly in the text
  modified_description = description.replace(trope_link, '')
  #Articles often started witht he below sentences in the beginning
  standard_beginning_text = ", 'Go To', '&#010;&#010"
  modified_description = re.sub(r'Go To', '', modified_description, count=1)
  modified_description = modified_description.replace(standard_beginning_text, '')
  modified_description = re.sub(r'This page has been alphabetized. Please add new examples in the correct order.','',
                                modified_description, count=1)
  modified_description = re.sub(r'Thanks!','',
                                modified_description, count=1)
  modified_description = re.sub(r"Image removed per Image Pickin' thread",'',
                                modified_description, count=1)
  modified_description = re.sub(r"Image selected per Image Pickin' thread",'',
                                modified_description, count=1)
  modified_description = re.sub(r"Image replaced per Image Pickin' thread",'',
                                modified_description, count=1)
  modified_description = re.sub(r"Image replaced per Image Pickin' thread",'',
                                modified_description, count=1)
  pattern = r'8212;index&#8212;&#8212;/index&#8212;'
  modified_description = re.sub(pattern,'',
                                modified_description, count=1)
  url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+|www\.[a-zA-Z0-9-]+(\.[a-zA-Z]{2,})+'
  modified_description = re.sub(url_pattern,'',
                                modified_description)
  # HTML tags accidently scraped
  tag_pattern = r'<[^>]+>.*?</[^>]+>|<[^>]+/?>'
  modified_description = re.sub(tag_pattern,'',
                                modified_description)
  modified_description = re.sub(r"Image selected via crowner in the Image Suggestion thread",'',
                                modified_description, count=1)
  modified_description = re.sub(r"do not change or remove without starting a new thread",'',
                                modified_description, count=1)
  modified_description = re.sub(r"This is based on opinion. Please don't list it on a work's trope example list",'',
                                modified_description, count=1)
  modified_description = re.sub(r"This list of examples has been alphabetized. Please add your example in the proper place",'',
                                modified_description, count=1)
  modified_description = re.sub(r'&#\d+;', '', modified_description)
  return modified_description

In [10]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

In [11]:
def preprocess_text(text):
 
  # Lowercasing words
  text = text.lower()
  # Pass to lemmatizer
  doc = nlp(text)
  tokens = []
  token_loss = []
  token_loss_score = 0

  # process each token
  for token in doc:
      if token.dep_ == 'neg':
          head = token.head.lemma_
          neg_token = 'not_' + head
          tokens.append(neg_token)
      elif token.lemma_.lower() not in stop_words and token.is_alpha:
          tokens.append(token.lemma_)
      elif token.is_alpha:
        token_loss.append(token.lemma_)
        token_loss_score += 1
  return tokens, token_loss, token_loss_score

In [12]:
#Do cleaning only enough to get rid of fragments
df['trope_description_partial_clean'] = df.apply(lambda x: remove_unneeded_phrases(x['trope_link'], x['trope_description']), axis=1)

In [13]:
#Do more traditional cleaning with getting rid of stopwords
df[['trope_description_trad_clean','d_token_loss','d_loss_scores']] = df['trope_description_partial_clean'].apply(preprocess_text).apply(pd.Series)

In [14]:
#Flatten from the traditional cleaning
df['trope_description_trad_clean_flattened'] = df['trope_description_trad_clean'].apply(lambda x: ' '.join(map(str, x)))

In [15]:
df.head()

Unnamed: 0,trope_name,trope_link,trope_description,related_link_names,related_links,example_descriptions,example_related_links,example_related_link_names,trope_description_partial_clean,trope_description_trad_clean,d_token_loss,d_loss_scores,trope_description_trad_clean_flattened
0,Aardvark Trunks,http://tvtropes.org/pmwiki/pmwiki.php/Main/Aar...,['http://tvtropes.org/pmwiki/pmwiki.php/Main/A...,"['/pmwiki/pmwiki.php/Main/SmallTaxonomyPools',...","['are more rarely seen', 'Real Life', 'sucking...","[""Princess Tutu : Anteaterina's snout is rathe...","['/pmwiki/pmwiki.php/Anime/PrincessTutu', '/pm...","['Princess Tutu', ""Boner's Ark"", 'Cerebus the ...","['', '', ' ', '', ""Anteaters and aardvarks are...","[anteater, aardvark, somewhat, common, fiction...","[and, be, in, while, other, such, as, and, be,...",77,anteater aardvark somewhat common fiction ant ...
1,Abandoned Area,http://tvtropes.org/pmwiki/pmwiki.php/Main/Aba...,['http://tvtropes.org/pmwiki/pmwiki.php/Main/A...,"['/pmwiki/pmwiki.php/Main/SceneryGorn', '/pmwi...","['Scenery Gorn', 'After the End', 'explain', '...",['MD Geist has plenty of these areas on Planet...,"['/pmwiki/pmwiki.php/Anime/MDGeist', '/pmwiki/...","['MD Geist', 'Spirited Away', 'Alabaster: The ...","['', '', ' Image from Pripyat, a ghost town ne...","[image, pripyat, ghost, town, near, chernobyl,...","[from, a, the, for, because, the, an, in, the,...",165,image pripyat ghost town near chernobyl nuclea...
2,Abandoned Camp Ruins,http://tvtropes.org/pmwiki/pmwiki.php/Main/Aba...,['http://tvtropes.org/pmwiki/pmwiki.php/Main/A...,"['/pmwiki/pmwiki.php/Main/DontGoInTheWoods', '...","['the woods', 'wolf', 'Grizzly bear', 'some ma...","['Quizzical : From Chapter 4, ""Heavy Traffic I...","['/pmwiki/pmwiki.php/Fanfic/Quizzical', '/pmwi...","['Quizzical', 'Annihilation (2018)', 'Anaconda...","['', '', ' ', '', ""You're lost in the woods , ...","[lose, wood, middle, night, creepy, sound, bra...","[you, in, the, it, the, of, the, and, all, the...",160,lose wood middle night creepy sound branch cre...
3,Abandoned Catchphrase,http://tvtropes.org/pmwiki/pmwiki.php/Main/Aba...,['http://tvtropes.org/pmwiki/pmwiki.php/Main/A...,['/pmwiki/pmwiki.php/Main/CharacterCatchphrase...,"['Character Catchphrases', 'unwanted connotati...",['Birdie the Early Bird of McDonaldland had a ...,['/pmwiki/pmwiki.php/Advertising/McDonaldland'...,"['McDonaldland', 'Bakemonogatari', 'Fist of th...","['', '', ' ', ""Character Catchphrases are a gr...","[character, catchphrase, great, simple, way, h...","[be, a, and, to, a, but, do, the, be, with, on...",56,character catchphrase great simple way help ch...
4,Abandoned Hospital,http://tvtropes.org/pmwiki/pmwiki.php/Main/Aba...,['http://tvtropes.org/pmwiki/pmwiki.php/Main/A...,"['/pmwiki/pmwiki.php/Main/BedlamHouse', '/pmwi...","['mental institution', 'grisly', 'medical', 'e...",['The 6th episode of Bleach was set in an aban...,"['/pmwiki/pmwiki.php/Manga/Bleach', '/pmwiki/p...","['Bleach', 'Don Kanonji', 'Death Note', 'Doubt...","['', '', ' ', '', '', 'The creepy abandoned ho...","[creepy, abandon, mental, institution, common,...","[the, be, a, in, in, and, be, all, other, and,...",224,creepy abandon mental institution common setti...


In [16]:
# Ouptut 
df.to_csv('../data_clean/parsed_tropes_clean.csv')

# Embedding

In [17]:
model = SentenceTransformer('all-mpnet-base-v2')
def get_embedding(text):
  embedding = model.encode(text)
  return embedding
def get_embedding_normalized(text):
  embedding = model.encode(text)
  normalized_embedding = normalize([embedding])[0]
  return embedding

In [18]:
df['n_embedding'] = df['trope_name'].apply(lambda x: get_embedding(x).tolist())
df['n_embedding_normalized'] = df['trope_name'].apply(lambda x: get_embedding_normalized(x).tolist())

In [19]:
def embedding_columns(df:pd.DataFrame, column_name: str, norm: bool):
    df_embedding = pd.DataFrame()
    df_embedding['trope_name'] = df['trope_name']
    
    if norm:
        print('In If statement, norm true')
        df_embedding['n_embedding'] = df['n_embedding_normalized']
        df_embedding['d_embedding'] = df[column_name].apply(lambda x: get_embedding_normalized(x).tolist())
    else:
        print('In If statement, norm false')
        df_embedding['n_embedding'] = df['n_embedding']
        df_embedding['d_embedding'] = df[column_name].apply(lambda x: get_embedding(x).tolist())
        
    return df_embedding
    

In [24]:
df_test = pd.DataFrame()
df_test['trope_name'] = df['trope_name']

In [20]:
embeddings_wanted = {'partial_clean_embeddings':'trope_description_partial_clean', 'partial_clean_embeddings_normalized': 'trope_description_partial_clean',
                     'trad_clean_embeddings': 'trope_description_trad_clean', 'trad_clean_embeddings_normalized': 'trope_description_trad_clean'}
output_path = '../data_clean/embeddings/'

In [26]:
for key, value in embeddings_wanted.items():
    full_path = output_path + key
    if 'normalized' in key:
        print('Normalized')
        df_embedding = pd.DataFrame()
        df_embedding['trope_name'] = df['trope_name']
        print('Past trope_name')
        df_embedding['n_embedding'] = df['n_embedding_normalized']
        print('Past name embedding')
        df_embedding['d_embedding'] = df[value].apply(lambda x: get_embedding_normalized(x).tolist())
        
        df_embedding.to_csv(full_path)
        print(f'Printed {key} at {full_path}.')
    else:
        print('Not Normalized')
        df_embedding = pd.DataFrame()
        df_embedding['trope_name'] = df['trope_name']
        print('Past trope_name')
        df_embedding['n_embedding'] = df['n_embedding']
        print('Past name embedding')
        df_embedding['d_embedding'] = df[value].apply(lambda x: get_embedding_normalized(x).tolist())
        
        df_embedding.to_csv(full_path)
        print(f'Printed {key} at {full_path}.')

Not Normalized
Past trope_name
Past name embedding
Printed partial_clean_embeddings at ../data_clean/embeddings/partial_clean_embeddings.
Normalized
Past trope_name
Past name embedding
Printed partial_clean_embeddings_normalized at ../data_clean/embeddings/partial_clean_embeddings_normalized.
Not Normalized
Past trope_name
Past name embedding


ValueError: Found array with dim 3. the normalize function expected <= 2.

In [22]:
for key, value in embeddings_wanted.items():
    full_path = output_path + key
    if 'normalized' in key:
        print('Normalized')
        df_embedding = embedding_columns(df, value, True)
        df_embedding.to_csv(full_path)
        print(f'Printed {key} at {full_path}.')
    else:
        print('Not Normalized')
        df_embedding = embedding_columns(df, value, False)
        df_embedding.to_csv(full_path)
        print(f'Printed {key} at {full_path}.')

Not Normalized


KeyboardInterrupt: 