# DC Height, Weight, & Gender

## Author: Oliver Gladfelter

## Date: Sep 6, 2018

### Purpose: Collecting name, universe, gender, height, weight, and introduction year of DC characters

In [8]:
import lxml.html as lh
import io
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Data Collection
## DC Weight Category 
This wikia category contains every single character with a listed weight, as well as a link to each character's individual page. The category only lists 200 characters per page, meaning to access all ~5k character links, we need to iterate over all 26 pages. The URL below links to the first page of the DC weight category.

In [6]:
# Starting URL for the crawl: the first page of the DC wiki "Weight" category.
weightPage1 = 'http://dc.wikia.com/wiki/Category:Weight'

## Function to Web Scrape Links to Every Character Within a DC Wiki Category
This function takes a single URL to a DC category page - ideally the first page - and scrapes character links from the page and all following pages within the category. Returns a list containing links to every character's wiki page.

In [2]:
def getLinksFromCategory(url, maxRetries=3, categoryTitle='Category:Weight'):
    """Collect links to every character page listed in a DC wiki category.

    Starting at ``url``, the function downloads a category page, stores the
    link found in each list item, and then follows the page's 'next 200'
    link. It stops on the first page that has no hyperlinked 'next' entry.

    Parameters
    ----------
    url : str
        URL of a category page, ideally the first page of the category.
    maxRetries : int, optional
        Number of times a single page is requested before giving up.
        Previously a non-200 response re-requested the same URL forever.
    categoryTitle : str, optional
        Value of the ``title`` attribute carried by the previous/next
        pagination anchors. Defaults to the Weight category used in this
        notebook.

    Returns
    -------
    list of str
        Absolute URLs of the character pages, in the order encountered.

    Raises
    ------
    RuntimeError
        If a category page does not return HTTP 200 within ``maxRetries``
        attempts.
    """
    baseUrl = 'http://dc.wikia.com'
    links = []

    while url is not None:
        # Bounded retry: a persistent failure must end the crawl rather than
        # spin on the same URL indefinitely.
        lastStatus = None
        for _ in range(maxRetries):
            r = requests.get(url)
            lastStatus = r.status_code
            if lastStatus == 200:
                break
        else:
            raise RuntimeError(
                'Could not fetch category page %s (last status: %s)' % (url, lastStatus)
            )

        soup = BeautifulSoup(r.content, "lxml")

        # Every <ul> matched by the empty-class filter except the first one.
        # Per the original author these are the lists holding the character
        # links -- TODO confirm against the current page layout.
        unorderedLists = soup.find_all("ul", {'class': ''})[1:]

        for unorderedList in unorderedLists:
            for character in unorderedList.find_all('li'):
                anchor = character.find('a')
                # list items without a usable hyperlink carry no character page
                if anchor is None or anchor.get('href') is None:
                    continue
                links.append(baseUrl + anchor['href'])

        # Pagination anchors ('previous 200' / 'next 200'). Only the hyperlinked
        # ones are found, so on the last page no anchor mentions 'next'; a
        # category that fits on one page has no pagination anchors at all.
        previousNextLinks = soup.find_all('a', {'title': categoryTitle})
        nextLink = next(
            (anchor for anchor in previousNextLinks if 'next' in anchor.text),
            None,
        )

        # Advance to the following page, or stop when there is none.
        url = baseUrl + nextLink['href'] if nextLink is not None else None

    return links

## Function to Web Scrape DC Character Pages
When given a list of links to character pages, this function will scrape the gender, height, weight, and first appearance of each character and return a data frame containing a row for each character

In [5]:
def getDataFromCharacterPage(listOfURLs):
    """Scrape gender, height, weight and first appearance from character pages.

    Parameters
    ----------
    listOfURLs : iterable of str
        Links to individual character pages.

    Returns
    -------
    pandas.DataFrame
        One row per input link with the columns ``link``, ``gender``,
        ``height``, ``weight`` and ``firstAppearance``. A value that could not
        be found on the page, or any value of a page that failed to download,
        is the empty string.
    """
    columns = ['link', 'gender', 'height', 'weight', 'firstAppearance']

    statClass = "pi-item pi-data pi-item-spacing pi-border-color"
    firstAppearanceClass = (
        "pi-horizontal-group-item pi-data-value pi-font pi-border-color pi-item-spacing"
    )

    # Each infobox entry's text holds the measure name on one line and its
    # value on the next; capture that value line.
    statPatterns = {
        'gender': re.compile(r'Gender\n(.*)'),
        'height': re.compile(r'Height\n(.*)\n'),
        'weight': re.compile(r'Weight\n(.*)\n'),
    }

    records = []

    for link in listOfURLs:
        record = {'link': link, 'gender': '', 'height': '', 'weight': '', 'firstAppearance': ''}
        records.append(record)

        r = requests.get(link)

        if r.status_code != 200:
            # Keep the blank row and move on. Previously execution fell through
            # and parsed the *previous* page's soup (or raised NameError on the
            # first link), recording another character's data under this link.
            print("Error")
            continue

        soup = BeautifulSoup(r.content, "lxml")

        # every infobox entry; each one's text contains a measure name and value
        for stat in soup.findAll("div", {"class": statClass}):
            statText = stat.text
            for field, pattern in statPatterns.items():
                match = pattern.search(statText)
                if match is not None:
                    record[field] = match.group(1)

        # The first-appearance cell is absent on some pages; leave it blank
        # rather than failing on None.text.
        firstAppearance = soup.find("td", {"class": firstAppearanceClass})
        if firstAppearance is not None:
            record['firstAppearance'] = firstAppearance.text

    return pd.DataFrame(records, columns=columns)

## Scrape Links to ~1.5k Character Pages

In [9]:
# Crawl the category starting at weightPage1 and keep the list of character-page links.
# This performs one HTTP request per category page.
CharacterPageLinks = getLinksFromCategory(weightPage1)

## Scrape Each Character Page for Weight, Height, Gender Data

In [13]:
# Download every character page (one HTTP request per link) and build the raw data frame.
data = getDataFromCharacterPage(CharacterPageLinks)

# Data Cleaning

## Removing characters who don't have all variables of interest listed on their Wiki page

In [17]:
# Keep only the rows where gender, height and weight were all found
# (the scraper stores '' for a missing value), then renumber the rows from 0.
data2 = data.loc[
    data['gender'].ne('') & data['height'].ne('') & data['weight'].ne('')
].reset_index(drop=True)

## Parsing Out First Appearance Dates From The 'firstAppearance' Column
The 'firstAppearance' value includes both the first publication in which the character was introduced as well as a date (in varying formats). It is the date, particularly the year, that we are interested in.

In [19]:
def getDate(value):
    """Extract the first-appearance date from a 'firstAppearance' string.

    Parameters
    ----------
    value : str
        Text such as ``'Whiz Comics #2(February, 1940)'``.

    Returns
    -------
    str or int
        The text inside a trailing parenthesised date that ends in a
        four-digit year (e.g. ``'February, 1940'``); otherwise the first
        four-digit number found anywhere in ``value``; otherwise the integer
        ``0`` as a "no date found" marker.
    """
    # Raw strings keep the regex escapes (\(, \s, \d) from being read as
    # invalid string escapes, and each pattern is searched only once.
    parenthesised = re.search(r'.{,}\((.{3,14}\s?\d{4})\)', value)
    if parenthesised is not None:
        return parenthesised.group(1)

    bareYear = re.search(r'\d{4}', value)
    if bareYear is not None:
        return bareYear.group(0)

    return 0
    
# Derive the date text for every row. getDate returns the integer 0 when it finds no
# date, so this will result in a few 0s, where the first appearance date was missing
# from the character's page.
data2['firstAppearanceDate'] = data2['firstAppearance'].apply(getDate)

In [20]:
# Spot-check the last three rows, including the new firstAppearanceDate column.
data2.tail(3)

Unnamed: 0,link,gender,height,weight,firstAppearance,firstAppearanceDate
1460,http://dc.wikia.com/wiki/Thaddeus_Sivana_(Eart...,Male,"5' 2""",98 lbs (44 kg),"Whiz Comics #2(February, 1940)","February, 1940"
1461,http://dc.wikia.com/wiki/Cyrus_Gold_(New_Earth),Male,"6' 7"" [2]",980 lbs (445 kg) [2],"All-American Comics #61(October, 1944)","October, 1944"
1462,http://dc.wikia.com/wiki/Imperiex-Prime_(New_E...,Male,"6' 7""",986 lbs (447 kg),"Superman Vol 2 #153(February, 2000)","February, 2000"


## Filling In Missing Release Dates
A publication date wasn't listed in some cases, but the name of the introducing medium (comic book, TV show, video game, etc.) is always listed. In those cases, we need to scrape the comic book's page on the DC Wiki to get the publication date.

In [21]:
# Rows for which getDate found no date (marked with the integer 0), renumbered from 0.
missingDates = data2.loc[data2['firstAppearanceDate'] == 0].reset_index(drop=True)

In [36]:
# Hand-entered first-appearance years, one per row of missingDates in row order.
# -1 marks rows that are dropped below (presumably no year could be found -- TODO confirm).
dates = ['2012',-1,-1,'2001','1989','2015',-1,'1988',-1,'1993','1993','2011']

# The list is positional, so it must cover every row that needs a date.
if len(dates) < len(missingDates):
    raise ValueError(
        'dates has %d entries but missingDates has %d rows' % (len(dates), len(missingDates))
    )

# Assign the whole column at once. The previous element-wise form,
# missingDates['firstAppearanceDate'][num] = ..., is chained assignment: it can
# write to a temporary copy and is a no-op under pandas Copy-on-Write.
missingDates['firstAppearanceDate'] = dates[:len(missingDates)]

missingDates = missingDates[missingDates['firstAppearanceDate'] != -1]

In [37]:
# Spot-check the last three rows after filling in the dates.
missingDates.tail(3)

Unnamed: 0,link,gender,height,weight,firstAppearance,firstAppearanceDate
9,http://dc.wikia.com/wiki/Percival_Edmund_Chang...,Male,"5' 3""",250 lbs (113 kg),Deathmate Black,1993
10,http://dc.wikia.com/wiki/Caitlin_Fairchild_(Wi...,Female,"6' 4""",300 lbs (136 kg),Deathmate Black,1993
11,http://dc.wikia.com/wiki/Chaselon_(Green_Lante...,Male,"10' 0""",720 lbs (327 kg),[[Green Lantern (Movie)]],2011


## Reattaching Updated Rows To Main Dataframe

In [38]:
# Rows whose date was parsed automatically (anything other than the 0 marker) ...
notMissingDates = data2.loc[data2['firstAppearanceDate'] != 0]
# ... stacked on top of the manually completed rows, with a fresh 0..n-1 row index.
data3 = pd.concat([notMissingDates, missingDates], ignore_index=True)

In [39]:
# Drop every row that still contains a missing (NaN) value in any column.
# The row index is not renumbered here.
data3 = data3.dropna()

## Derive Normalized Year, Gender, Height, and Weight Columns

In [None]:
def getYear(value):
    """Return the first four-digit number in ``value`` as an int.

    Parameters
    ----------
    value : str or int
        A date such as ``'February, 1940'`` or ``'1993'``. Non-string input
        (e.g. an integer year read back from a CSV) is converted with
        ``str()`` first.

    Returns
    -------
    int
        The year.

    Raises
    ------
    ValueError
        If ``value`` contains no four-digit number. Previously this surfaced
        as an opaque AttributeError on ``None``.
    """
    match = re.search(r'\d{4}', str(value))
    if match is None:
        raise ValueError('no four-digit year found in %r' % (value,))
    return int(match.group(0))
    
# Integer year parsed from each row's firstAppearanceDate.
data3['yearFirstAppeared'] = data3['firstAppearanceDate'].apply(getYear)

In [41]:
def MaleFemale(value):
    """Pass 'Male' and 'Female' through unchanged; map anything else to -1."""
    return value if value in ('Male', 'Female') else -1

data3['gender'] = data3['gender'].apply(MaleFemale)

# For analytical purposes, discard characters whose gender is missing or listed as
# agender or fluid (marked -1 by MaleFemale), then renumber the rows from 0.
data3 = data3.loc[data3['gender'] != -1].reset_index(drop=True)

In [42]:
def calcHeight(value):
    """Convert a feet-and-inches height string to total inches.

    Parameters
    ----------
    value : str
        Height text such as ``6' 4"`` or ``10' 0"``. The feet and inches may
        be separated by a straight apostrophe, a comma or a typographic
        apostrophe, optionally followed by one whitespace character.

    Returns
    -------
    int
        ``feet * 12 + inches``, or ``-1`` when no feet/inches pair is found.
    """
    # (\d+) rather than (\d): with a single-digit group, heights of ten feet
    # or more lost their leading digit, so 10' 0" was parsed as 0' 0" = 0.
    match = re.search(r"(\d+)[',’]\s?(\d{1,2})", value)
    if match is None:
        return -1

    feet = int(match.group(1))
    inch = int(match.group(2))
    return (feet * 12) + inch

# Height in inches for every row; calcHeight returns -1 where the text could not be parsed.
data3['heightInInches'] = data3['height'].apply(calcHeight)

In [None]:
# Some manual fixes: hand-entered heights (in inches) keyed by row label.
# NOTE(review): the labels are tied to one particular scrape -- re-verify them
# whenever the data is collected again.
manualHeights = {196: 67, 1203: 84, 1213: 125, 1309: 72, 1316: 74, 1367: 15, 1415: 86}

for rowLabel, inches in manualHeights.items():
    # A missing label would make .loc append a new, mostly empty row.
    if rowLabel not in data3.index:
        raise KeyError('row %r not found in data3; manual height fix is stale' % (rowLabel,))
    # Single .loc write into data3 itself. data3['heightInInches'][label] = ...
    # is chained assignment and may only modify a temporary copy.
    data3.loc[rowLabel, 'heightInInches'] = inches

In [49]:
def calcWeight(value):
    """Return the first number in a weight string as an int.

    Parameters
    ----------
    value : str
        Weight text such as ``'986 lbs (447 kg)'`` or ``'1,200 lbs (544 kg)'``.

    Returns
    -------
    int
        The first run of digits in ``value`` (thousands separators removed),
        or ``-1`` when the text contains no digit.
    """
    # Either a comma-grouped number (1,200) or a plain run of digits. The old
    # pattern \d{1,3} cut four-digit weights short: '1,200' -> 1, '1200' -> 120.
    match = re.search(r'\d{1,3}(?:,\d{3})+|\d+', value)
    if match is None:
        return -1

    return int(match.group(0).replace(',', ''))

# Leading number of each weight string; calcWeight returns -1 where the text has no digits.
data3['weightInPounds'] = data3['weight'].apply(calcWeight)

In [6]:
# NOTE(review): this replaces the data3 built by the cells above with the contents of
# "MarvelData9-6-18.csv". The file name looks like it belongs to a Marvel data set and the
# cell's execution count is out of order -- confirm this is the intended checkpoint file
# for the DC data before relying on a top-to-bottom run.
data3 = pd.read_csv("MarvelData9-6-18.csv", encoding = 'latin')
# Drop the 'Unnamed: 0' column (presumably the row index saved by an earlier to_csv).
del data3['Unnamed: 0']

## Parsing 'link' Column For Character Name and Universe

In [None]:
def _parseCharacterLink(value):
    """Split a character link into (name, universe), or return None if it does not match.

    The link is expected to look like ``.../wiki/<Name>_(<Universe>)``.
    Underscores in both parts are replaced with spaces.
    """
    # Raw string so \( and \) reach the regex engine untouched; the dot in
    # 'wikia.com' is escaped so it only matches a literal dot.
    match = re.search(r'wikia\.com/wiki/(.*)_\((.*)\)', value)
    if match is None:
        return None
    return tuple(part.replace("_", " ") for part in match.groups())


def getName(value):
    """Return the character name parsed from a wiki link, or -1 if the link has no
    ``<Name>_(<Universe>)`` form."""
    parsed = _parseCharacterLink(value)
    return -1 if parsed is None else parsed[0]


def getUniverse(value):
    """Return the universe parsed from a wiki link, or -1 if the link has no
    ``<Name>_(<Universe>)`` form."""
    parsed = _parseCharacterLink(value)
    return -1 if parsed is None else parsed[1]

data3['name'] = data3['link'].apply(getName)
data3['universe'] = data3['link'].apply(getUniverse)

# Hand-entered names keyed by row label (presumably links without a "(Universe)"
# suffix, for which getName returned -1 -- TODO confirm).
# NOTE(review): the labels are tied to one particular data file.
manualNames = {915: 'Telos', 1210: 'Santa Claus', 1239: 'Clark Savage, Jr.'}

for rowLabel, fixedName in manualNames.items():
    # A missing label would make .loc append a new, mostly empty row.
    if rowLabel not in data3.index:
        raise KeyError('row %r not found in data3; manual name fix is stale' % (rowLabel,))
    # Single .loc write into data3 itself. data3['name'][label] = ... is chained
    # assignment and may only modify a temporary copy.
    data3.loc[rowLabel, 'name'] = fixedName

# Drop the rows whose name could not be determined and renumber the rows from 0.
data3 = data3[data3['name'] != -1].reset_index(drop=True)

In [61]:
# Spot-check the last two rows of the finished table.
data3.tail(2)

Unnamed: 0,link,gender,height,weight,firstAppearance,firstAppearanceDate,yearFirstAppeared,heightInInches,weightInPounds,name,universe
1454,http://dc.wikia.com/wiki/Caitlin_Fairchild_(Wi...,Female,"6' 4""",300 lbs (136 kg),Deathmate Black,1993,1993,76,300,Caitlin Fairchild,Wildstorm Universe
1455,http://dc.wikia.com/wiki/Chaselon_(Green_Lante...,Male,"10' 0""",720 lbs (327 kg),[[Green Lantern (Movie)]],2011,2011,0,720,Chaselon,Green Lantern Movie


In [62]:
# Write the finished table to disk. to_csv also writes the row index by default, which
# shows up as an unnamed first column when the file is read back.
# NOTE(review): this is an absolute, machine-specific Windows path -- consider a path
# relative to the notebook so the cell runs on other machines.
data3.to_csv("C:\\Users\\glol7001\\Documents\\DCDataComplete.csv")