<a href="https://colab.research.google.com/github/TetianaHrunyk/NeuralMetaphorsDetection/blob/master/McMillanScrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries and define constants

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import string
import time
import re
import random
import pandas as pd

In [None]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK "stopwords" corpus; stopwords.words('english') is read in the
# configuration cell below and would fail if the corpus were not available.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Page from which the very first crawl starts.
start_url = "https://www.macmillandictionary.com/dictionary/british/splendid"
# HTTP headers sent with every request. The User-Agent value must be the bare
# product string: the header name is supplied by the dict key, so repeating
# "User-Agent: " inside the value would send a malformed browser signature.
HEADERS = {'Connection': 'keep-alive',
           'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:62.0) Gecko/20100101 Firefox/62.0'
          }
# Common prefix of every dictionary entry URL; link slugs are relative to it.
BASE_URL = "https://www.macmillandictionary.com/dictionary/british/"
# Maximum number of follow-up pages visited by a single scrape() call.
MAX_ITER = 400
# Number of scrape() calls performed by the run cell.
OUTER_ITER = 1000
# English stopwords; these words are never tagged with the "M_" marker.
stwrds = stopwords.words('english')

# Entry slugs used as a new start page when a crawl leaves no links to follow.
backup_links = ["passion", "lust_1", "desire_1", "success", "smile_1", "king", "courtesy_1", "dome", "pea", "princess"]

# Define helper functions

In [None]:
def process_page(soup):
  """Return the sentences of the page's "Metaphor" box with metaphor words tagged.

  The function looks for the box header labelled "Metaphor", takes the first
  following "hidden-closed" sibling of the header's parent and reads every
  <li> inside it. In each item the text of the first <a> element is treated
  as the metaphorical expression: each of its space-separated words is
  prefixed with "M_", except punctuation and words listed in `stwrds`.

  Args:
    soup: parsed page (BeautifulSoup object).

  Returns:
    list of str, one tagged sentence per usable list item. The list is empty
    when the page has no Metaphor box, when the box has no content sibling,
    or when no item contains a non-empty anchor.
  """
  # `string=` is the current name of the bs4 text filter (`text=` is deprecated).
  divs = soup.find_all("div", class_="toggle-toggle entry-bold flex-extend ONEBOX-HEAD", string="Metaphor")
  output = []
  if not divs:
    return output
  parent = divs[0].find_parent()
  boxes = parent.find_next_siblings("div", class_="hidden-closed") if parent is not None else []
  if not boxes:
    # Header without a content block: nothing to extract (previously IndexError).
    return output
  for item in boxes[0].find_all("li"):
    anchor = item.a
    if anchor is None:
      # No highlighted expression in this item (previously AttributeError,
      # which aborted the whole crawl batch in the caller).
      continue
    text = item.text
    met = anchor.text
    if not met.strip():
      # An empty anchor would yield a sentence without any "M_" tag.
      continue
    met_ind = text.find(met)
    temp = text[:met_ind]
    for word in met.split(" "):
      # Substring test against string.punctuation: also true for "" (double spaces).
      if (word in string.punctuation) or (word in stwrds):
        temp += " "+word
        continue
      temp += " M_"+word
    temp += " "+text[met_ind+len(met):]
    output.append(temp)
  return output


In [None]:
def process_page_get_all(soup):
  """Return tagged metaphor sentences plus all example sentences of a page.

  Pages without a "Metaphor" box contribute nothing. For pages that have one,
  the result is the output of `process_page(soup)` followed by the text of
  every <p class="EXAMPLE"> element, shuffled in place with `random.shuffle`
  so that tagged and untagged sentences are mixed.

  Args:
    soup: parsed page (BeautifulSoup object).

  Returns:
    list of str in random order; empty when the page has no Metaphor box.
  """
  # `string=` is the current name of the bs4 text filter (`text=` is deprecated).
  divs = soup.find_all("div", class_="toggle-toggle entry-bold flex-extend ONEBOX-HEAD", string="Metaphor")
  if not divs:
    # Examples are only collected from pages that also have a Metaphor box.
    return []
  # Reuse the single tagging implementation instead of a copy of its body.
  output = process_page(soup)
  output.extend(example.get_text() for example in soup.find_all("p", class_="EXAMPLE"))
  random.shuffle(output)
  return output

In [None]:
def get_links(soup, base_url=None):
  """Collect the entry slugs linked from the page's synonym blocks.

  Every <a> inside a <div class="synonyms row"> is inspected. Its `href`
  attribute is read directly (rather than slicing the tag's string form at a
  fixed offset, which broke for slugs containing "title" and for anchors
  without a title attribute). Links outside `base_url` are ignored, the
  `#fragment` part is dropped and duplicates are removed.

  Args:
    soup: parsed page (BeautifulSoup object).
    base_url: URL prefix that is stripped from each href; defaults to the
      module-level BASE_URL.

  Returns:
    list of str: unique, non-empty slugs relative to `base_url`, in order of
    first appearance.
  """
  if base_url is None:
    base_url = BASE_URL
  slugs = []
  for div in soup.find_all("div", class_="synonyms row"):
    for anchor in div.find_all("a"):
      href = anchor.get("href")
      if not href or not href.startswith(base_url):
        continue
      slug = href[len(base_url):].split("#")[0]
      if slug:
        slugs.append(slug)
  # dict.fromkeys de-duplicates while preserving insertion order.
  return list(dict.fromkeys(slugs))

In [None]:
def scrape(start_url, page_process_func=process_page, timeout=30):
  """Random-walk crawl starting at `start_url`, collecting sentences on the way.

  The start page is fetched first; its extracted sentences and synonym links
  seed the crawl. Then, up to MAX_ITER times, a random queued link is removed
  from the queue, fetched and processed, and links that were never queued
  before are added. A two-second pause follows every visited page.

  Args:
    start_url: absolute URL of the first page.
    page_process_func: callable taking the parsed page and returning a list
      of sentences (process_page or process_page_get_all).
    timeout: seconds to wait for each HTTP request before giving up.

  Returns:
    (data, links): the collected sentences and the slugs still queued.

  Raises:
    requests.RequestException: only if the start page cannot be fetched.
      Failures on follow-up pages are reported and skipped so that the data
      gathered so far is not lost.
  """
  # requests has no default timeout; without one a stalled connection hangs forever.
  page = requests.get(start_url, headers=HEADERS, timeout=timeout)
  soup = bs(page.content, 'html.parser')
  data = page_process_func(soup)
  links = get_links(soup)
  # Every slug that was ever queued; replaces rebuilding hist+queue per candidate.
  seen = set(links)
  itr = 0
  while links and itr < MAX_ITER:
    itr += 1
    ind = random.randint(0, len(links)-1)
    url = BASE_URL+links[ind]
    if itr%100 == 0:
      print("Iter: {}, scrapping {}, data: {}, links: {}".format(itr, url, len(data), len(links)))
    links.pop(ind)
    try:
      page = requests.get(url, headers=HEADERS, timeout=timeout)
      soup = bs(page.content, 'html.parser')
      page_data = page_process_func(soup)
      page_links = get_links(soup)
    except (requests.RequestException, AttributeError, IndexError) as err:
      # Network trouble or unexpected markup on one page must not discard
      # the sentences already collected from the other pages.
      print("Skipping {}: {}".format(url, err))
    else:
      data += page_data
      for link in page_links:
        if link not in seen:
          seen.add(link)
          links.append(link)
    time.sleep(2)  # be polite to the server
  return data, links


# Run the scraper and save the data

In [None]:
# The output file lives on Google Drive, so the drive has to be mounted before
# the first write. Mounting again when it is already mounted is harmless.
from google.colab import drive
drive.mount('/content/drive')

DATA_len = 0
links = []  # defined up front so a failure in the very first crawl is handled cleanly
for i in range(OUTER_ITER):
  DATA = []
  print("Iter: {}, DATA: {}".format(i, DATA_len))
  try:
    # About 40% of the crawls keep only metaphor sentences; the others also
    # collect the plain example sentences of pages that have a Metaphor box.
    if random.random() < 0.4: 
      more_data, links = scrape(start_url)
    else:
      more_data, links = scrape(start_url, process_page_get_all)
    # Store the result before choosing the next start page: picking from an
    # empty link list used to raise and silently drop the scraped batch.
    DATA += more_data
  except Exception as e:
    # Deliberately broad: one failed crawl must not stop a run that takes hours.
    print(e)
  if links:
    # After a failed crawl this reuses the links left by the previous one.
    start_url = BASE_URL+random.choice(links)
  else:
    print("No links to continue from, restarting from a backup page")
    start_url = BASE_URL+random.choice(backup_links)
  if DATA:
    # Append each batch immediately so an interrupted run keeps its results.
    df = pd.DataFrame(DATA)
    df.to_csv("/content/drive/MyDrive/MsThesis/McMillanMetaphorsAll.csv", mode="a", index=False, header=False)
  DATA_len += len(DATA)

Iter: 0, DATA: 0
Iter: 100, scrapping https://www.macmillandictionary.com/dictionary/british/individual_2, data: 12, links: 389
Iter: 200, scrapping https://www.macmillandictionary.com/dictionary/british/painstaking, data: 31, links: 787
Iter: 300, scrapping https://www.macmillandictionary.com/dictionary/british/acid_2, data: 86, links: 1071
Iter: 400, scrapping https://www.macmillandictionary.com/dictionary/british/absorbance, data: 125, links: 1353
Iter: 1, DATA: 125
Iter: 100, scrapping https://www.macmillandictionary.com/dictionary/british/come-out, data: 0, links: 615
Iter: 200, scrapping https://www.macmillandictionary.com/dictionary/british/think_1, data: 93, links: 976
Iter: 300, scrapping https://www.macmillandictionary.com/dictionary/british/attenuated, data: 132, links: 1362
Iter: 400, scrapping https://www.macmillandictionary.com/dictionary/british/harmless, data: 158, links: 1528
Iter: 2, DATA: 283
Iter: 100, scrapping https://www.macmillandictionary.com/dictionary/british

# Mount Google Drive and inspect the saved data

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used in this notebook
# point below that directory.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# df = pd.DataFrame(DATA)
# df.to_csv("/content/drive/MyDrive/MsThesis/McMillanMetaphors5.csv")

In [None]:
# Count the lines of the accumulated CSV file.
!wc -l /content/drive/MyDrive/MsThesis/McMillanMetaphorsAll.csv

In [None]:
# Show the first 30 lines of the accumulated CSV file.
!head -30 /content/drive/MyDrive/MsThesis/McMillanMetaphorsAll.csv