# Appearance Counts, Character Pictures, and Name Variations

## Author: Oliver Gladfelter

## Date: October 8th, 2018

### Purpose: Collect the data needed for the end-product design. Appearance counts set the size of each point in the beeswarm plot, name variations add flexibility to the 'character look-up' interactive, and character pictures are used throughout to highlight specific examples.

In [2]:
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# Load the character table and discard the unnamed column
# ('Unnamed: 0') that the CSV carries alongside the real fields.
data = pd.read_csv("normalHeightData.csv")
data = data.drop(columns=['Unnamed: 0'])

# Function to scrape number of appearances for each character

In [47]:
def getAppearanceCount(link):
    """
    Scrape the number of appearances listed on a character's Wikia page.

    The page is searched for <ul> elements whose text contains
    "Appearances of"; the count is parsed from the first <li> of the last
    such list.

    Parameters:
        link: URL of the character's page.

    Returns:
        The appearance count as an int, or a negative sentinel:
        -1 if the request did not return HTTP 200,
        -2 if no appearance count could be parsed from the page.
    """

    r = requests.get(link)
    if r.status_code != 200:
        return -1

    soup = BeautifulSoup(r.content, "lxml")

    # Keep the last matching list. Starting from None lets us detect pages
    # that have no "Appearances of" list at all instead of hitting an
    # unbound variable.
    appearanceItem = None
    for ul in soup.findAll("ul"):
        if re.search("Appearances of", ul.text) is not None:
            appearanceItem = ul

    if appearanceItem is None:
        return -2

    listItems = appearanceItem.findAll("li")
    if not listItems:
        return -2

    # Capture everything between optional leading whitespace and " App..."
    match = re.search(r"\s?(.{,})\sApp", listItems[0].text)
    if match is None:
        return -2

    appearanceString = match.group(1)

    # Some pages put a label such as "First" (or nothing) where the count
    # is expected.
    if appearanceString in ('First', ''):
        return -2

    # Counts of a thousand or more contain a comma, which must be removed
    # before converting to int. Any other non-numeric label is reported as
    # unparseable rather than raising.
    try:
        return int(appearanceString.replace(",", ""))
    except ValueError:
        return -2

In [110]:
# Scrape the appearance count for every character page, one request per row.
data['appearances'] = [getAppearanceCount(pageLink) for pageLink in data['link']]

# Function to scrape picture link for each character's featured image

In [241]:
def getPic(link):
    """
    Scrape the link to the featured image on a character's page.

    Parameters:
        link: URL of the character's page.

    Returns:
        The href of the first <a> inside the page's
        <figure class="pi-item pi-image"> element, or a negative sentinel:
        -1 if the request did not return HTTP 200,
        -2 if the page has no such figure, anchor or href.
    """

    r = requests.get(link)
    if r.status_code != 200:
        return -1

    soup = BeautifulSoup(r.content, "lxml")

    # Not all characters have a picture. Check each step explicitly so that
    # only a missing element maps to -2 and unrelated errors (or a keyboard
    # interrupt) are not silently swallowed.
    figure = soup.find('figure', {'class': 'pi-item pi-image'})
    if figure is None:
        return -2

    anchor = figure.find('a')
    if anchor is None:
        return -2

    href = anchor.get('href')
    return href if href is not None else -2

In [295]:
def getImageFromGoogle(name,universe):
    """
    For characters who don't have a picture on their Wikia page, we need a
    workaround. This function runs a Google Image search for the character's
    name and publisher (Marvel or DC) and returns the image link of the
    first result found in the response.

    Parameters:
        name: the character's name.
        universe: the publisher, appended to the search query.

    Returns:
        The `src` of the first <img> element that has one, or a negative
        sentinel:
        -1 if the request did not return HTTP 200,
        -2 if the response contains no usable <img>.
    """
    from urllib.parse import quote_plus

    # quote_plus turns spaces into "+" (as before) and also escapes
    # characters such as "&", "#" or "'" that would otherwise corrupt the
    # query string.
    name = quote_plus(str(name))
    universe = quote_plus(str(universe))

    link = "https://www.google.com/search?q={}+{}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwiixLLTvfDdAhXyYN8KHXbaAQoQ_AUIDygC&biw=1366&bih=626#imgrc=npqj4x8uQAFjVM:".format(name,universe)

    r = requests.get(link)
    if r.status_code != 200:
        return -1

    soup = BeautifulSoup(r.content, "lxml")

    # Guard against a response without images instead of raising IndexError.
    firstImage = soup.find('img', src=True)
    if firstImage is None:
        return -2

    return firstImage['src']

In [None]:
# Scrape the featured-image link for every character page, one request per row.
data['picLink'] = [getPic(pageLink) for pageLink in data['link']]

In [None]:
# Characters whose page had no picture are marked with -2 in 'picLink';
# fall back to a Google Image search for those rows.
# Rows are addressed by their actual index label and written through .loc:
# chained indexing (data['picLink'][num] = ...) can assign into a temporary
# copy and leave `data` unchanged.
for label in data.index[data['picLink'] == -2]:
    data.loc[label, 'picLink'] = getImageFromGoogle(data.loc[label, 'name'],
                                                    data.loc[label, 'publisher'])

# Scrape Superhero Identity & Nicknames

In [386]:
def getSuperheroName(link):
    """
    All character pages list the character's birth/legal name as their title.
    For example, Captain America's main page is titled "Steven Rogers".
    This function collects each individual's superhero name instead, read
    from the page's <h2 class="pi-item pi-item-spacing pi-title"> element.

    Parameters:
        link: URL of the character's page.

    Returns:
        The text of that element, or a negative sentinel:
        -1 if the request did not return HTTP 200,
        -2 if the page has no such element.
    """

    r = requests.get(link)
    if r.status_code != 200:
        return -1
    soup = BeautifulSoup(r.content, "lxml")

    # Pass the class as an explicit attrs dict, matching the other scrapers
    # in this notebook.
    title = soup.find('h2', {'class': 'pi-item pi-item-spacing pi-title'})
    if title is None:
        return -2

    return title.text

def getFullName(link):
    """
    Several characters have nicknames. For example, we all know Iron Man
    to be the superhero name of Tony Stark, but Stark's birth/legal name is
    "Anthony Stark", which is the name we originally collected. This function
    collects characters' full names, which include their nickname.

    The name is read from the first <div class="pi-data-value pi-font">
    element on the page.

    Parameters:
        link: URL of the character's page.

    Returns:
        The full name with citation markers removed, or -1 if the request
        did not return HTTP 200 or the page has no such element. (-1 is the
        only sentinel that nickName passes through unchanged.)
    """

    r = requests.get(link)
    if r.status_code != 200:
        return -1
    soup = BeautifulSoup(r.content, "lxml")

    field = soup.find('div', {'class': 'pi-data-value pi-font'})
    if field is None:
        return -1

    # Remove citation markers such as "[1]" or " [12]" (any number, with any
    # whitespace in front of them), then trim what is left.
    return re.sub(r"\s*\[\d+\]", "", field.text).strip()

def nickName(fullName):
    """
    Parses a nickname out of a full name, if a character has a nickname.

    A nickname is the first part of the name wrapped in a matching pair of
    single or double quotes, e.g. 'Anthony Edward "Tony" Stark'.

    Parameters:
        fullName: the full name string, or a non-string error sentinel
            (such as -1) produced when the name could not be scraped.

    Returns:
        "<nickname> <last name>" if a quoted nickname is present,
        "" if the name contains no nickname,
        or fullName unchanged when it is not a string.
    """

    # Error sentinels from scraping are passed through untouched.
    if not isinstance(fullName, str):
        return fullName

    # The opening quote must start a word and the closing quote must be the
    # same character and end a word, so apostrophes inside names (O'Neil,
    # T'Challa) are not mistaken for nickname delimiters. The lazy match
    # stops at the first closing quote.
    match = re.search(r"""(?<!\S)(["'])(.+?)\1(?!\w)""", fullName)

    # No listed nickname in full name
    if match is None:
        return ""

    # Return nickname + last name; split() ignores stray surrounding spaces.
    lastName = fullName.split()[-1]
    return match.group(2) + " " + lastName

In [None]:
# Scrape both name variants from every character page, then derive the
# nickname column from the scraped full names.
data['superName'] = [getSuperheroName(pageLink) for pageLink in data['link']]
data['fullName'] = [getFullName(pageLink) for pageLink in data['link']]
data['nickName'] = [nickName(scrapedName) for scrapedName in data['fullName']]