# Get Character quotes

To perform sentiment analysis, we propose using the quotes of each character. For that, we need to compile all the quotes of each character.

That's fine, because inside the wiki each character has a 'Quotes' page which can be accessed at `Category:{Name}/Quotes` (e.g. https://marvel.fandom.com/wiki/Category:Peter_Parker_(Earth-616)/Quotes). Easy, right? Well, even though that page exists, if you try to query it directly you will most likely just get `{{Quotes}}`. Instead, we list the members of that category and then fetch the content of each member page.

In [1]:
import json
import urllib.request

import re

import pandas as pd
import numpy as np

import utils
import os

from concurrent.futures import ThreadPoolExecutor, as_completed

from tqdm.notebook import tqdm
tqdm.pandas()

In [2]:
# Load the character table produced earlier; the first CSV column is used as
# the DataFrame index. The trailing expression previews the first five rows.
char_df = pd.read_csv(
  "../data/marvel_characters.csv",
  index_col=0,
)
char_df.head(n=5)

Unnamed: 0,title,pageid,imglink,gender,links
0,'Spinner (Earth-616),322600,'Spinner (Earth-616) from Marvel Comics Presen...,['Male'],[]
1,01100010 01110010 01110101 01110100 01100101 (...,1053805,01100010 01110010 01110101 01110100 01100101 (...,[''],"['Cosmo (Dog) (Earth-616)', 'Rocket Raccoon (E..."
2,107 (Earth-616),116257,107 (Earth-616) from Justice Four Balance Vol ...,['Male'],"['Vance Astrovik (Earth-616)', 'Vance Astrovik..."
3,11-Ball (Earth-616),543479,11-Ball (Earth-616) from Sleepwalker Vol 1 2 0...,['Male'],"['Jeff Hagees (Earth-616)', 'Jeff Hagees (Eart..."
4,115 (Legion Personality) (Earth-616),624448,115 (Legion Personality) (Earth-616) from New ...,['Female'],"['David Haller (Earth-616)', 'David Haller (Ea..."


In [3]:
def search_quotes(titles: list):
  """Fetch the latest revision content of several wiki pages in one request.

  Args:
    titles: Page titles to look up. In each title spaces are replaced by
      underscores and the result is percent-encoded; the encoded titles are
      then joined with ``|`` into a single ``titles=`` parameter.

  Returns:
    The JSON document returned by the wiki's ``action=query`` endpoint,
    parsed with ``json.loads``.

  Raises:
    urllib.error.URLError: If the HTTP request fails.
    json.JSONDecodeError: If the response body is not valid JSON.
  """
  baseurl = "https://marvel.fandom.com/api.php?"
  encoded_titles = "|".join(
    urllib.parse.quote_plus(page_title.replace(" ", "_"))
    for page_title in titles
  )
  params = [
    "action=query",
    "prop=revisions&rvprop=content&rvslots=*",
    f"titles={encoded_titles}",
    "format=json",
  ]
  query = baseurl + "&".join(params)

  # Use the response as a context manager so the connection is released as
  # soon as the body has been read; this function is called from many
  # worker threads and would otherwise leave sockets open until GC.
  with urllib.request.urlopen(query) as wikiresponse:
    wikitext = wikiresponse.read().decode("utf-8")
  return json.loads(wikitext)

# Downloading all quotes from the characters in  ``data/marvel_characters.csv``

To do this we use multiple threads to run simultaneous queries, and we query for 50 titles at a time, as that is the maximum number of titles allowed per request and the wiki's backend database is much faster than the internet connection between our machine and theirs.

In [4]:
quote_path = "../data/character_quotes/"

def get_character_quotes(name: str):
  """Download all quotes of one character and save them as a JSON list.

  The function first lists every member of ``Category:{name}/Quotes``
  (following ``cmcontinue`` until the listing is exhausted), then fetches
  the wikitext of those pages in chunks via ``search_quotes`` and extracts
  the quote text with a regular expression. The result is written to
  ``{quote_path}{utils.generate_filename(name)}.json``; a file containing an
  empty list is written when no quotes are found.

  Args:
    name: Character page title, e.g. ``"Peter Parker (Earth-616)"``.

  Raises:
    urllib.error.URLError: If one of the HTTP requests fails.
    KeyError: If the category listing response has no
      ``query.categorymembers`` entry (e.g. an API error payload).
  """
  baseurl = "https://marvel.fandom.com/api.php?"
  category = urllib.parse.quote_plus(name.replace(" ", "_"))
  base_args = [
    "action=query&list=categorymembers",
    f"cmtitle=Category:{category}/Quotes",
    "prop=revisions&rvprop=content&rvslots=*",
    "format=json",
    "cmlimit=max",
  ]

  quote_titles = []
  cmcontinue_text = None

  # Keep requesting pages of the listing until the API stops returning a
  # "continue" block.
  while True:
    args = list(base_args)
    if cmcontinue_text is not None:
      # The continuation token is opaque and may contain characters that
      # are not valid in a URL (e.g. "|"), so it must be escaped.
      args.append("cmcontinue=" + urllib.parse.quote_plus(cmcontinue_text))
    query = baseurl + "&".join(args)

    # Context manager closes the connection once the body has been read.
    with urllib.request.urlopen(query) as wikiresponse:
      wiki_json = json.loads(wikiresponse.read().decode("utf-8"))

    quote_titles += [page["title"] for page in wiki_json["query"]["categorymembers"]]

    if "continue" not in wiki_json:
      break
    cmcontinue_text = wiki_json["continue"]["cmcontinue"]

  quotes = []
  for chunk in utils.generate_chunks(quote_titles):
    quote_data = search_quotes(chunk)
    # Tolerate responses without a "query"/"pages" section (nothing found).
    pages = quote_data.get("query", {}).get("pages", {})
    for page in pages.values():
      revisions = page.get("revisions")
      if not revisions:
        # Pages the wiki could not resolve carry no revision content.
        continue
      content = revisions[-1]["slots"]["main"]["*"]
      # Capture what follows "= " up to the end of the line, on lines that
      # contain "Quotation" (presumably the quote template's fields).
      quotes += re.findall(r"Quotation.*?= (.*?)\n", content)

  # Make sure the target directory exists before writing the result.
  os.makedirs(quote_path, exist_ok=True)
  filename = utils.generate_filename(name)
  with open(f"{quote_path}{filename}.json", "w", encoding="utf-8") as f:
    json.dump(quotes, f, indent = 4)

def get_chunk_quotes(chunk: list):
  """Download and store the quotes of every character name in ``chunk``.

  Names are processed one after another with ``get_character_quotes``.
  Nothing is returned.
  """
  for character_name in chunk:
    get_character_quotes(character_name)
  return None

def get_quotes(names: list, max_workers=16):
  """Download the quotes of every character that has no quote file yet.

  A character counts as already downloaded when
  ``{utils.generate_filename(name)}.json`` exists in ``quote_path``. The
  remaining names are split into chunks with ``utils.generate_chunks`` and
  each chunk is handled by ``get_chunk_quotes`` on a thread pool, with a
  progress bar advancing once per finished chunk.

  Args:
    names: Character page titles to process.
    max_workers: Maximum number of worker threads.

  Returns:
    None. Chunks whose worker raised are counted and reported on stdout
    instead of being dropped silently; calling the function again retries
    them, because their output files are still missing.
  """
  # Listing a non-existent directory would raise, so create it up front.
  os.makedirs(quote_path, exist_ok=True)
  files = set(os.listdir(quote_path))
  missing_names = [
    name for name in names
    if f"{utils.generate_filename(name)}.json" not in files
  ]

  if not missing_names:
    print("No missign quotes found 😊")
    return

  chunks = utils.generate_chunks(missing_names)
  print (f"Generated {len(chunks)} chunks!")
  failures = []
  with tqdm(total=len(chunks)) as pbar:
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
      futures = [ex.submit(get_chunk_quotes, chunk)
                  for chunk in chunks]
      for future in as_completed(futures):
        # A future stores any exception raised by its worker; without
        # inspecting it the failure would go completely unnoticed.
        error = future.exception()
        if error is not None:
          failures.append(error)
        pbar.update(1)

  if failures:
    print(f"{len(failures)} of {len(chunks)} chunks failed; "
          f"first error: {failures[0]!r}")


# Fetch the quotes for every character title in the table, using 24 threads.
get_quotes(names=char_df.title.values, max_workers=24)

Generated 600 chunks!


  0%|          | 0/600 [00:00<?, ?it/s]

https://marvel.fandom.com/api.php?action=query&list=categorymembers&cmtitle=Category:Peter_Parker_(Earth-616)/Quotes&cmlimit=500&prop=revisions&rvprop=content&rvslots=*&format=json