# Marvel Height, Weight, & Gender

## Author: Oliver Gladfelter

## Date: Sep 4, 2018

### Purpose: Collecting name, universe, gender, height, weight, and introduction year of Marvel characters

In [1]:
import io
import re
import time
from urllib.parse import unquote

import lxml.html as lh
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data Collection
## Marvel Weight Category 
This wikia category contains every single character with a listed weight, as well as a link to each character's individual page. The category only lists 200 characters per page, meaning to access all ~5k character links, we need to iterate over all 26 pages. The URL below links to the first page of the Marvel weight category.

In [2]:
# Entry point for the crawl: first page of the wiki's Weight category.
weightPage1 = 'http://marvel.wikia.com/wiki/Category:Weight'

## Function to Web Scrape Links to Every Character Within a Marvel Wiki Category
This function takes a single URL to a Marvel category page - ideally the first page - and scrapes character links from the page and all following pages within the category. Returns a list containing links to every character's wiki page.

In [36]:
def getLinksFromCategory(url, maxRetries=3, retryDelay=1.0):
    """Collect the character-page links from every page of a wiki category.

    Starting at ``url``, the character links on the page are scraped, then the
    'next 200' pagination link is followed until a page no longer offers one.

    Parameters
    ----------
    url : str
        URL of a category page, ideally the first one.
    maxRetries : int, optional
        Number of consecutive non-200 responses tolerated for one page before
        giving up.
    retryDelay : float, optional
        Seconds to wait before re-requesting a page that failed.

    Returns
    -------
    list of str
        Absolute URLs of the character pages, in the order they were found.

    Raises
    ------
    RuntimeError
        If a page still does not answer with HTTP 200 after ``maxRetries``
        consecutive attempts.
    """
    siteRoot = 'http://marvel.wikia.com'
    links = []
    failedAttempts = 0

    while True:
        r = requests.get(url)

        if r.status_code != 200:
            # Retry the same page, but only a bounded number of times: an
            # unconditional retry would spin forever on a permanent failure.
            failedAttempts += 1
            if failedAttempts >= maxRetries:
                raise RuntimeError(
                    "Giving up on {} after {} attempts (last status {})".format(
                        url, failedAttempts, r.status_code))
            time.sleep(retryDelay)
            continue
        failedAttempts = 0

        soup = BeautifulSoup(r.content, "lxml")

        # Every class-less <ul> except the first one is treated as a list of
        # character entries; save the wiki link of each entry that has one.
        for unorderedList in soup.find_all("ul", {'class': ''})[1:]:
            for character in unorderedList.find_all('li'):
                anchor = character.find('a')
                if anchor is None:
                    continue
                links.append(siteRoot + anchor['href'])

        # Pagination anchors ('previous 200' / 'next 200'). Only hyperlinked
        # text shows up here, so on the last page no 'next' anchor is found.
        # NOTE: the anchors are located by title, which is hard-coded to the
        # Weight category.
        previousNextLinks = soup.find_all('a', {'title': 'Category:Weight'})

        # Stop when there is no pagination at all or the final anchor is not
        # a 'next' link (i.e. we are on the last page).
        if not previousNextLinks or 'next' not in previousNextLinks[-1].text:
            break

        # Follow the very anchor whose text was just checked.
        url = siteRoot + previousNextLinks[-1]['href']

    return links

## Function to Web Scrape Marvel Character Pages
When given a list of links to character pages, this function will scrape the gender, height, weight, and first appearance of each character and return a data frame containing a row for each character

In [160]:
def getDataFromCharacterPage(listOfURLs):
    """Scrape gender, height, weight and first appearance from character pages.

    Parameters
    ----------
    listOfURLs : iterable of str
        Links to individual character pages.

    Returns
    -------
    pandas.DataFrame
        One row per link with the columns 'link', 'gender', 'height',
        'weight' and 'firstAppearance'. A value that could not be found is
        stored as an empty string; a page that did not answer with HTTP 200
        yields a row whose four scraped columns are all empty.
    """
    columns = ['link', 'gender', 'height', 'weight', 'firstAppearance']

    # output column -> (label searched for in the banner text, extraction regex)
    statPatterns = {
        'gender': ('Gender', 'Gender\n(.{,})'),
        'height': ('Height', 'Height\n(.{,})\n'),
        'weight': ('Weight', 'Weight\n(.{,})\n'),
    }

    rows = []

    for link in listOfURLs:
        row = {'link': link, 'gender': '', 'height': '', 'weight': '', 'firstAppearance': ''}

        r = requests.get(link)

        if r.status_code != 200:
            # Record the failure as an all-empty row instead of parsing a
            # stale (or not yet defined) soup from a previous iteration.
            print("Error", r.status_code, link)
            rows.append(row)
            continue

        soup = BeautifulSoup(r.content, "lxml")

        # each banner holds a measure name and its value in one string
        stats = soup.findAll("div", {"class": "pi-item pi-data pi-item-spacing pi-border-color"})

        # regEx to get only the vital info; a later matching banner overrides an earlier one
        for stat in stats:
            text = stat.text
            for column, (label, pattern) in statPatterns.items():
                if label not in text:
                    continue
                match = re.search(pattern, text)
                if match is not None:
                    row[column] = match.group(1)

        # the first-appearance cell is absent on some pages; keep '' in that case
        firstAppearance = soup.find("td", {"class": "pi-horizontal-group-item pi-data-value pi-font pi-border-color pi-item-spacing"})
        if firstAppearance is not None:
            row['firstAppearance'] = firstAppearance.text

        rows.append(row)

    return pd.DataFrame(rows, columns=columns)

## Scrape Links to ~5k Character Pages

In [None]:
# Crawl the Weight category starting at its first page and gather every character-page URL
CharacterPageLinks = getLinksFromCategory(weightPage1)

## Scrape Each Character Page for Weight, Height, Gender Data

In [161]:
# One HTTP request per character link; the result is the raw, uncleaned data frame
data = getDataFromCharacterPage(CharacterPageLinks)

# Data Cleaning

## Removing characters who don't have all variables of interest listed on their Wiki page

In [10]:
# Keep only the characters for which gender, height and weight were all scraped,
# and renumber the surviving rows from 0.
data2 = data[
    (data['gender'] != '')
    & (data['height'] != '')
    & (data['weight'] != '')
].reset_index(drop=True)

## Parsing Out First Appearance Dates From The 'firstAppearance' Column
The 'firstAppearance' value includes both the first publication in which the character was introduced as well as a date (in varying formats). It is the date, particularly the year, that we are interested in.

In [11]:
def getDate(value):
    """Pull the date portion out of a 'firstAppearance' string.

    Parameters
    ----------
    value : str
        Text such as 'Excalibur Vol 3 #1(July, 2004)'.

    Returns
    -------
    str or int
        The text inside the last parenthesised group that ends in a
        four-digit year (e.g. 'July, 2004'); otherwise the first four-digit
        number found anywhere in the text; otherwise the integer 0, which
        callers use as the "no date found" sentinel.
    """
    # Raw strings keep the regex escapes (\(, \s, \d) out of Python's own
    # string-escape processing.
    parenthesised = re.search(r'.{,}\((.{3,14}\s?\d{4})\)', value)
    if parenthesised is not None:
        return parenthesised.group(1)

    bareYear = re.search(r'\d{4}', value)
    if bareYear is not None:
        return bareYear.group(0)

    return 0
    
# Rows where getDate finds no date receive the integer sentinel 0; this happens
# for the few characters whose page gave no first-appearance date.
data2['firstAppearanceDate'] = data2['firstAppearance'].apply(getDate)

In [78]:
# Spot-check the last rows, including the newly parsed firstAppearanceDate column
data2.tail(3)

Unnamed: 0,link,gender,height,weight,firstAppearance,firstAppearanceDate
5082,http://marvel.wikia.com/wiki/Wicked_(Earth-616),Female,"5' 1""",99 lbs (45 kg),"Excalibur Vol 3 #1(July, 2004)","July, 2004"
5083,http://marvel.wikia.com/wiki/Oberon_(Earth-616),Male,"15' 4"" (Variable)",990 lbs (449 kg) (Variable),"Wisdom #1(January, 2007)","January, 2007"
5084,http://marvel.wikia.com/wiki/Reptyl_(Earth-616),Male,"6' 8"" 15'6"" as Reptyl Prime","990 lbs (449 kg) 420 lbs (original form), 5200...","Silver Surfer Vol 3 #11(May, 1988)","May, 1988"


## Scraping Comic Book Pages for Release Dates
A publication date wasn't listed in some cases, but the name of the introducing medium (comic book, TV show, video game, etc.) always is. In those cases, we need to scrape that medium's own page on the Marvel Wiki to get the publication date.

In [14]:
# Rows still carrying the 0 sentinel have no date yet; pull them out and renumber from 0
missingDates = data2[data2['firstAppearanceDate'] == 0].reset_index(drop=True)

In [23]:
linkBase = 'http://marvel.wikia.com/wiki/'

# For every row without a date, look up the page of the medium that introduced
# the character and read the publication date from it.
for num in range(len(missingDates)):

    firstAppearance = missingDates.loc[num, 'firstAppearance']
    newDate = ""

    # Uses the linkBase and comic book name to construct a URL
    url = linkBase + firstAppearance.replace(" ", "_")
    r = requests.get(url)

    if r.status_code != 200:
        # This 1986 book accounts for several missing dates, although the Marvel Wiki doesn't have a page for it
        if 'Gates of What If' in firstAppearance:
            newDate = '1986'
        else:
            newDate = 'Needs manual check'
    else:
        soup = BeautifulSoup(r.content, "lxml")

        # There are two different main formats for pages for mediums in which characters may have been introduced
        # Scrape the publication date of the media, being flexible of multiple page formats
        candidates = soup.find_all("div", {'style': 'width:160px;float:left;text-align:right;'})
        if len(candidates) > 1:
            newDate = candidates[-1].text
        else:
            candidates = soup.find_all('div', {'style': "font-size:12px;text-align:center;line-height:2em;"})
            if len(candidates) > 1:
                newDate = candidates[1].text

    # Single .loc write: chained indexing (frame[col][row] = ...) may operate
    # on a temporary copy and leave the frame untouched.
    missingDates.loc[num, 'firstAppearanceDate'] = newDate

In [None]:
# Hand-entered first-appearance years, keyed by row label in missingDates.
# An empty string marks a row for which no date was determined; those rows
# are removed right after. The labels are tied to the row order of the scrape
# they were written for.
manualDates = {
    3: '2009',
    4: '2006',
    16: '',
    24: '2002',
    26: '2009',
    27: '2009',
    31: '1999',
    38: '2008',
    43: '2013',
    46: '',
    48: '2003',
}

# .loc writes straight into the frame; chained indexing (frame[col][row] = ...)
# may modify a temporary copy instead.
for rowLabel, manualDate in manualDates.items():
    missingDates.loc[rowLabel, 'firstAppearanceDate'] = manualDate

missingDates = missingDates[missingDates['firstAppearanceDate'] != '']

In [79]:
# Spot-check the last rows after the scraped and manual date fixes
missingDates.tail(3)

Unnamed: 0,link,gender,height,weight,firstAppearance,firstAppearanceDate
58,http://marvel.wikia.com/wiki/Death%27s_Head_(E...,Male,"6' 10"" ; Formerly 30'; 7'",605 lbs (274 kg) ; Formerly 25 tons; 650 lbs,High Noon Tex,1987
59,http://marvel.wikia.com/wiki/Michael_Collins_(...,Male,"7' 2""",825 lbs (374 kg),Deeds of Doom,"July, 1992"
60,http://marvel.wikia.com/wiki/Aimi_Yoshida_(Ear...,Female,"5' 1""",95 lbs (43 kg),X-Men: Destiny,"September, 27, 2011"


## Reattaching Updated Rows To Main Dataframe

In [110]:
# Rows that had a date from the start, followed by the repaired rows, renumbered from 0
notMissingDates = data2[data2['firstAppearanceDate'] != 0]
data3 = pd.concat([notMissingDates, missingDates]).reset_index(drop=True)

In [111]:
# Discard every row that still holds a missing (NaN) value in any column
data3 = data3.dropna()

## Derive Normalized Year, Gender, Height, and Weight Columns

In [119]:
def getYear(value):
    """Return the first four-digit number in ``value`` as an int.

    Parameters
    ----------
    value : str or int
        A first-appearance date such as 'July, 2004' or '1987'.

    Returns
    -------
    int
        The year, e.g. 2004.

    Raises
    ------
    ValueError
        If ``value`` contains no four-digit number (for example a leftover
        'Needs manual check' marker), naming the offending value.
    """
    match = re.search(r'\d{4}', str(value))
    if match is None:
        raise ValueError("no four-digit year found in {!r}".format(value))
    return int(match.group(0))
    
# Reduce each first-appearance date to its four-digit year, stored as an int
data3['yearFirstAppeared'] = data3['firstAppearanceDate'].apply(getYear)

In [113]:
def MaleFemale(value):
    """Pass 'Male' and 'Female' through unchanged; map anything else to -1."""
    return value if value in ('Male', 'Female') else -1

data3['gender'] = data3['gender'].apply(MaleFemale)

# For analytical purposes, keep only Male/Female rows: anything else (missing,
# agender, fluid, ...) was mapped to -1 above. Renumber the rows afterwards.
data3 = data3[data3['gender'] != -1].reset_index(drop=True)

In [None]:
def calcHeight(value):
    """Convert a feet-and-inches height string to total inches.

    Only the first feet/inches pair in the string is used, so trailing notes
    such as '(Variable)' or alternate-form heights are ignored.

    Parameters
    ----------
    value : str
        Height text such as 5' 1" or 15' 4" (Variable).

    Returns
    -------
    int
        feet * 12 + inches, or -1 when no feet/inches pair is present.
    """
    # (\d+) captures the whole feet figure: a single \d would read 15' 4" as
    # 5' 4". The separator class is kept as-is: straight quote, comma, or
    # curly quote.
    match = re.search(r"(\d+)[',’]\s?(\d{1,2})", value)
    if match is None:
        return -1

    feet = int(match.group(1))
    inch = int(match.group(2))
    return (feet * 12) + inch

# Heights that calcHeight cannot read as feet and inches come back as -1
data3['heightInInches'] = data3['height'].apply(calcHeight)

In [None]:
# Some manual conversions (all of these characters' heights were listed in centimeters, meters, and even miles).
# Keys are row labels in data3; they are tied to the row order of the data they were written for.
manualHeights = {
    1001: 67.992126,
    1239: 4724.41,
    1741: 3543.31,
    3101: 31680,
    4085: 11.811,
    4086: 12,
    4087: 12,
    4863: 54.3307,
    4867: 9,
}

# Several corrections are fractional, so make the column float before writing them.
data3['heightInInches'] = data3['heightInInches'].astype(float)

# .loc writes straight into the frame; chained indexing (frame[col][row] = ...)
# may modify a temporary copy instead.
for rowLabel, inches in manualHeights.items():
    data3.loc[rowLabel, 'heightInInches'] = inches

In [194]:
# Remove the rows still marked -1 (9 characters whose heights were listed only
# as 'Variable') and renumber what is left.
data3 = data3[data3['heightInInches'] != -1].reset_index(drop=True)

In [219]:
def calcWeight(value):
    """Read the first whole number in a weight string.

    The number is taken at face value; no unit conversion is attempted, so a
    string whose first figure is not in pounds (e.g. '25 tons') yields that
    figure unchanged.

    Parameters
    ----------
    value : str
        Weight text such as '99 lbs (45 kg)' or '1,200 lbs (544 kg)'.

    Returns
    -------
    int
        The first number in the string, with thousands separators removed,
        or -1 when the string contains no digits.
    """
    # Accept either a comma-grouped number (1,200) or a plain run of digits
    # (1200); matching at most three digits would truncate both.
    match = re.search(r'\d{1,3}(?:,\d{3})+|\d+', value)
    if match is None:
        return -1

    return int(match.group(0).replace(',', ''))

# Parse a number out of each weight string with calcWeight; strings without digits become -1
data3['weightInPounds'] = data3['weight'].apply(calcWeight)

In [6]:
# Replace data3 with the contents of a saved CSV snapshot. The 'Unnamed: 0'
# column is dropped; presumably it is the row index written by an earlier export.
data3 = pd.read_csv("MarvelData9-6-18.csv", encoding = 'latin').drop(columns=['Unnamed: 0'])

## Parsing 'link' Column For Character Name and Universe

In [36]:
def _splitWikiLink(value):
    """Split a character link into (name, universe), or return None.

    The link must look like '...wikia.com/wiki/<Name>_(<Universe>)'.
    Underscores become spaces and percent-encoded characters (e.g. %27) are
    decoded in both parts.
    """
    match = re.search(r'wikia\.com/wiki/(.{,})_\((.{,})\)', value)
    if match is None:
        return None
    return tuple(unquote(part.replace("_", " ")) for part in match.groups())


def getName(value):
    """Return the character name encoded in a wiki link, or -1 if the link
    does not follow the 'Name_(Universe)' pattern."""
    parts = _splitWikiLink(value)
    if parts is None:
        return -1
    return parts[0]


def getUniverse(value):
    """Return the universe encoded in a wiki link, or -1 if the link does not
    follow the 'Name_(Universe)' pattern."""
    parts = _splitWikiLink(value)
    if parts is None:
        return -1
    return parts[1]

data3['name'] = data3['link'].apply(getName)
data3['universe'] = data3['link'].apply(getUniverse)

# Links that did not follow the Name_(Universe) pattern were given the name -1;
# remove those rows and renumber the rest.
data3 = data3[data3['name'] != -1].reset_index(drop=True)

In [63]:
# Spot-check the last rows with the derived name and universe columns
data3.tail(2)

Unnamed: 0,link,gender,height,weight,firstAppearance,firstAppearanceDate,yearFirstAppeared,heightInInches,weightInPounds,name,universe
4981,http://marvel.wikia.com/wiki/Michael_Collins_(...,Male,"7' 2""",825 lbs (374 kg),Deeds of Doom,"July, 1992",1992,86,825,Michael Collins,Earth-TRN564
4982,http://marvel.wikia.com/wiki/Aimi_Yoshida_(Ear...,Female,"5' 1""",95 lbs (43 kg),X-Men: Destiny,"September, 27, 2011",2011,61,95,Aimi Yoshida,Earth-TRN064


In [162]:
# Write the finished dataset next to the notebook. A relative path keeps the
# notebook runnable on any machine, unlike an absolute path into one user's
# home directory.
data3.to_csv("MarvelDataComplete.csv")