### Using way-back-machine-scraper to retrieve historical text of wikipedia pages
https://github.com/sangaline/wayback-machine-scraper

In [None]:
# - importing packages
import pandas as pd
import numpy as np
import datetime
import os

In [None]:
os.chdir('/content/drive/MyDrive/QMSS-Columbia/Data Visualization Project/')

In [None]:
# - get all new york city areas and their wikipedia pages
# - copy and read in csv file with nyc neighborhood names
df = pd.read_csv("nyc_neighborhoods.csv")
manhattan = df[ df['borough']=='Manhattan'].reset_index(drop=True)
manhattan.head()

Unnamed: 0,neighborhood,borough
0,Battery Park City,Manhattan
1,Beekman Place,Manhattan
2,Carnegie Hill,Manhattan
3,Chelsea,Manhattan
4,Chinatown,Manhattan


In [None]:
manhattan['wiki_url'] = "https://en.wikipedia.org/wiki/" + manhattan['neighborhood'].str.replace(" ", "_")+",_Manhattan"

In [None]:
manhattan.head()

Unnamed: 0,neighborhood,borough,wiki_url
0,Battery Park City,Manhattan,https://en.wikipedia.org/wiki/Battery_Park_Cit...
1,Beekman Place,Manhattan,"https://en.wikipedia.org/wiki/Beekman_Place,_M..."
2,Carnegie Hill,Manhattan,"https://en.wikipedia.org/wiki/Carnegie_Hill,_M..."
3,Chelsea,Manhattan,"https://en.wikipedia.org/wiki/Chelsea,_Manhattan"
4,Chinatown,Manhattan,"https://en.wikipedia.org/wiki/Chinatown,_Manha..."


In [None]:
manhattan['command_line_script'] = "!wayback-machine-scraper " + manhattan['wiki_url'] + " -a '" + manhattan['wiki_url'] + "$' " + "-f 20100101 -t 20210404"

### Compile wayback-machine-scraper command into .sh file for bulk run

In [None]:
with open("command_line.sh", 'a') as f:
    f.write(
        manhattan['command_line_script'].to_string(index = False, header=False)
    )

### Way Back Machine Scrape

In [None]:
# - install and import wayback-machine-scraper
!pip install wayback-machine-scraper

In [None]:
# - to remove all files under directory
#!rm -rfv website/en.wikipedia.org/

In [169]:
# - single scraper command
manhattan['command_line_script'][44]

"!wayback-machine-scraper https://en.wikipedia.org/wiki/Yorkville,_Manhattan -a 'https://en.wikipedia.org/wiki/Yorkville,_Manhattan$' -f 20100101 -t 20210404"

In [None]:
!wayback-machine-scraper https://en.wikipedia.org/wiki/Yorkville,_Manhattan -a 'https://en.wikipedia.org/wiki/Yorkville,_Manhattan$' -f 20100101 -t 20210404

### Extract HTML text data via beautiful soup 

In [None]:
def scraper(snapshot_path):
  from bs4 import BeautifulSoup
  import re
  
  snap = open(snapshot_path)
  soup = BeautifulSoup(snap, 'html.parser')
  tmp_text = soup.findAll('p')
  tmp_text = [word.text for word in tmp_text]
  tmp_text = ' '.join(tmp_text)
  tmp_text = re.sub('\W+', ' ', re.sub('xa0', ' ', tmp_text))

  return tmp_text

In [None]:
test = scraper("/content/drive/MyDrive/QMSS-Columbia/Data Visualization Project/website/en.wikipedia.org/wiki/Battery_Park_City,_Manhattan/20091215230507.snapshot")

In [None]:
test

'Battery Park City is a 92 acre 0 4 km² planned community at the southwestern tip of lower Manhattan in New York City United States The land upon which it stands was created on the Hudson River using 1 2 million cubic yards 917 000 m3 of dirt and rocks excavated during the construction of the World Trade Center and certain other construction projects as well as from sand dredged from New York Harbor off Staten Island 1 The neighborhood which is the site of the World Financial Center along with numerous housing commercial and retail buildings is named for adjacent Battery Park Battery Park City is owned and managed by the Battery Park City Authority BPCA a public benefit corporation created by New York State under the authority of the Urban Development Corporation 2 Excess revenue from the area was to be contributed to other housing efforts typically low income projects in the Bronx and Harlem Under the 1989 agreement between the BPCA and the City of New York 600 million was transferred

### Scraping in Bulk

In [107]:
def scraper(snapshot_path, name):
  from bs4 import BeautifulSoup
  import re
  import os
  
  df = pd.DataFrame()

  for file in os.listdir(snapshot_path):
    with open(os.path.join(snapshot_path, file)) as text_file:
      soup = BeautifulSoup(text_file, 'html.parser')
      tmp_text = soup.findAll('p')
      tmp_text = [word.text for word in tmp_text]
      tmp_text = ' '.join(tmp_text)
      tmp_text = re.sub('\W+', ' ', re.sub('xa0', ' ', tmp_text))

      df = df.append({'neighborhood': name,
                      'date': str(file),
                      'text': tmp_text}, ignore_index=True)

  return df

In [108]:
test = scraper("website/en.wikipedia.org/wiki/Battery_Park_City,_Manhattan", "Battery Park City")

In [171]:
test.head()

Unnamed: 0,date,neighborhood,text
0,20210125025513.snapshot,Battery Park City,Battery Park City is a mainly residential 92 ...
1,20200919065735.snapshot,Battery Park City,Battery Park City is a mainly residential 92 ...
2,20170613223016.snapshot,Battery Park City,Coordinates 40 42 46 N 74 00 56 W 40 712687 N ...
3,20161207201745.snapshot,Battery Park City,Coordinates 40 42 46 N 74 00 56 W 40 712687 N ...
4,20161023021820.snapshot,Battery Park City,Coordinates 40 42 46 N 74 00 56 W 40 712687 N ...


In [174]:
manhattan['path'] = "website/en.wikipedia.org/wiki/" + manhattan['neighborhood'].str.replace(" ", "_")+",_Manhattan"
manhattan.head()

Unnamed: 0,neighborhood,borough,wiki_url,command_line_script,path
0,Battery Park City,Manhattan,https://en.wikipedia.org/wiki/Battery_Park_Cit...,!wayback-machine-scraper https://en.wikipedia....,website/en.wikipedia.org/wiki/Battery_Park_Cit...
1,Beekman Place,Manhattan,"https://en.wikipedia.org/wiki/Beekman_Place,_M...",!wayback-machine-scraper https://en.wikipedia....,"website/en.wikipedia.org/wiki/Beekman_Place,_M..."
2,Carnegie Hill,Manhattan,"https://en.wikipedia.org/wiki/Carnegie_Hill,_M...",!wayback-machine-scraper https://en.wikipedia....,"website/en.wikipedia.org/wiki/Carnegie_Hill,_M..."
3,Chelsea,Manhattan,"https://en.wikipedia.org/wiki/Chelsea,_Manhattan",!wayback-machine-scraper https://en.wikipedia....,"website/en.wikipedia.org/wiki/Chelsea,_Manhattan"
4,Chinatown,Manhattan,"https://en.wikipedia.org/wiki/Chinatown,_Manha...",!wayback-machine-scraper https://en.wikipedia....,"website/en.wikipedia.org/wiki/Chinatown,_Manha..."


In [178]:
df_out = pd.DataFrame()
for i in range(len(manhattan)):
  try:
    print(manhattan['neighborhood'][i])
    df = scraper(manhattan['path'][i], manhattan['neighborhood'][i])
    df_out = df_out.append(df)
    print(df_out.shape)
  except FileNotFoundError:
    pass

Battery Park City
(53, 3)
Beekman Place
Carnegie Hill
(62, 3)
Chelsea
(208, 3)
Chinatown
(485, 3)
Civic Center
(563, 3)
Clinton
(567, 3)
East Harlem
(581, 3)
East Village
(786, 3)
Financial District
(982, 3)
Flatiron
(985, 3)
Gramercy
(1019, 3)
Greenwich Village
(1053, 3)
Hamilton Heights
(1120, 3)
Harlem (Central)
Herald Square
Hudson Square
Inwood
Lenox Hill
Lincoln Square
(1189, 3)
Little Italy
(1362, 3)
Lower East Side
(1488, 3)
Manhattan Valley
Manhattanville
(1557, 3)
Midtown South
Midtown
(1578, 3)
Morningside Heights
(1645, 3)
Murray Hill
(1749, 3)
NoHo
(1792, 3)
Roosevelt Island
(1795, 3)
SoHo
(1950, 3)
South Village
Stuyvesant Town
Sutton Place
(2027, 3)
Times Square
Tribeca
(2043, 3)
Tudor City
Turtle Bay
(2117, 3)
Union Square
(2212, 3)
Upper East Side
(2230, 3)
Upper West Side
(2241, 3)
Wall Street
Washington Heights
(2423, 3)
West Village
(2442, 3)
Yorkville
(2529, 3)


In [180]:
df_out.to_csv("TimeMatchine.csv")