## THE GOAL OF THIS SCRIPT:

Given a link to a city table page on https://www.citypopulation.de/,  
return a pandas dataframe of the cities with their populations and coordinates.  
`returncoords` sub-function: given a citypopulation.de city page, return its (longitude, latitude) coordinates.


In [87]:
# Dependencies:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [88]:
# Base URL & Extensions
# baseurl is the site root; buildCoordDF prefixes it to the relative city
# links collected by buildDF.
baseurl = 'https://www.citypopulation.de'
# Path of the USA cities listing page, relative to baseurl.
ext_usa_cities = '/en/usa/cities/'
# Path of the Canadian listing page, relative to baseurl.
# NOTE(review): the path segment is 'agglo', not 'cities' - presumably
# agglomerations; confirm this page has the same table layout.
ext_can_cities = '/en/canada/agglo/'

In [83]:
def buildDF(tableURL):
  """
  Build a table of cities from a citypopulation.de listing page.

  Fetches tableURL and extracts one row for every element whose itemtype
  is "http://schema.org/City".

  Parameters:
    tableURL: URL of the listing page to fetch.

  Returns:
    pandas DataFrame with the columns
      "citystate" - text of the element's "rname" cell,
      "pop"       - text of the "rpop prio1" cell, commas removed, as int,
      "link"      - href of the anchor inside the element's last <td>
                    (a site-relative link in the sample output).
    The frame is empty (but has these columns) when no city elements are found.

  Raises:
    AttributeError / TypeError / ValueError / IndexError if a city element
    lacks one of the expected cells or its population is not an integer.
  """
  r = requests.get(tableURL)
  soup = BeautifulSoup(r.text, 'html.parser')

  # Collect plain dicts and build the frame once at the end:
  # DataFrame.append was removed in pandas 2.0, and appending row by row
  # copied the whole frame on every iteration.
  rows = []
  for city in soup.find_all(itemtype="http://schema.org/City"):
    pop_text = city.find("td", class_="rpop prio1").text
    cells = city.find_all("td")
    rows.append({
      'citystate': city.find("td", class_="rname").text,
      'pop': int(pop_text.replace(",", '')),  # "123,420" -> 123420
      'link': cells[-1].find('a')['href'],    # anchor lives in the last cell
    })
  return pd.DataFrame(rows, columns=["citystate", "pop", "link"])

### Coordinate Extraction

In [84]:
def returncoords(cityurl):
  """
  Given URL of citypopulation.de city page
  Return tuple form, decimal notation (longitude, latitude) coordinates of city

  The coordinates are read from the first <script> tag of the page: its
  markup is split on ';' and the values after the last '=' in pieces 6 and 7
  are parsed as floats (taken as longitude and latitude respectively).

  Returns None when
    - the page has no <script> tag,
    - the first <script> tag has no inline content, or
    - the inline script contains 'window.location.href' (redirected page).

  Raises:
    IndexError / ValueError if the script does not have the expected layout.

  !!! TO DO: re-write fn using 'http://schema.org/City' geo attr rather than string splitting
  """
  r = requests.get(cityurl)
  soup = BeautifulSoup(r.text, 'html.parser')
  mainscriptTag = soup.find("script")

  # find() returns None when nothing matches; treat that as "no coordinates"
  # instead of crashing on the attribute access below.
  if mainscriptTag is None:
    return None

  # A <script src=...> tag has no children, so there is nothing to parse.
  scriptparts = list(mainscriptTag.children)
  if not scriptparts:
    return None

  # Redirect stubs only set window.location.href and carry no coordinates.
  if 'window.location.href' in scriptparts[0]:
    return None

  # Split once and reuse; positions 6 and 7 are fixed by the page layout.
  statements = str(mainscriptTag).split(';')
  longi = float(statements[6].split('=')[-1].strip())
  lat = float(statements[7].split('=')[-1].strip())
  return (longi, lat)

In [85]:
def buildCoordDF(tableURL):
  """
  Build the city table for a citypopulation.de listing page and add coordinates.

  Calls buildDF(tableURL), then looks up every row's link (prefixed with
  baseurl) through returncoords and stores the result in a new "coords"
  column.

  Returns the resulting pandas DataFrame with every row containing a missing
  value removed; this is how cities whose page yielded no coordinates
  (returncoords returned None, e.g. redirected pages) are dropped.
  """
  def coords_for(link):
    # Links in the table are site-relative, so prefix the site root.
    return returncoords(baseurl + link)

  cities = buildDF(tableURL)
  cities['coords'] = cities['link'].apply(coords_for)
  return cities.dropna()

In [86]:
# Build the coordinate table for the USA cities listing page; the rows shown
# below are the resulting dataframe.
buildCoordDF(baseurl+ext_usa_cities)

Unnamed: 0,citystate,pop,link,coords
0,Abilene,123420,/en/usa/texas/taylor_jones/4801000__abilene/,"(-99.733, 32.449)"
1,Akron,197597,/en/usa/ohio/summit/3901000__akron/,"(-81.517, 41.085)"
2,Albuquerque,560513,/en/usa/newmexico/bernalillo/3502000__albuquer...,"(-106.652, 35.084)"
3,Alexandria,159428,/en/usa/virginia/alexandria/5101000__alexandria/,"(-77.047, 38.805)"
4,Allen,105623,/en/usa/texas/collin/4801924__allen/,"(-96.67, 33.103)"
...,...,...,...,...
326,Wichita Falls,104683,/en/usa/texas/wichita/4879000__wichita_falls/,"(-98.492, 33.912)"
327,Wilmington,123744,/en/usa/northcarolina/new_hanover/3774440__wil...,"(-77.949, 34.235)"
328,Winston-Salem,247945,/en/usa/northcarolina/forsyth/3775000__winston...,"(-80.244, 36.095)"
329,Worcester,185428,/en/usa/massachusetts/worcester/2582000__worce...,"(-71.802, 42.263)"
