# Proverbs, idioms, expressions and sayings

dataset location: `dataset/sayings, proverbs, idioms/English_phrases_and_sayings.csv`

Scraped from [phrases.org.uk](https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html).

In [1]:
# Import and prepare
import os
from urllib.parse import urljoin

import pandas as pd
import numpy as np
import requests as req
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

from django.conf import settings

In [2]:
# Download the phrase index page and parse it
url = "https://www.phrases.org.uk/meanings/phrases-and-sayings-list.html"
# A timeout keeps the cell from hanging forever if the server never answers.
response = req.get(url, timeout=30)
# Fail fast on 4xx/5xx: otherwise the error page would be parsed below and
# every later cell would silently work on a page without any phrases.
response.raise_for_status()
print("Success!")
html_content = BeautifulSoup(response.text, "html.parser")

Success!


In [3]:
# Collect every phrase paragraph, then derive two parallel lists from it:
# the whitespace-normalised phrase text and the link to the phrase's page.
quotes = html_content.find_all("p", class_="phrase-list")
size = len(quotes)
cleaned_quotes = [" ".join(phrase_tag.text.split()) for phrase_tag in quotes]
href_quotes = [phrase_tag.a["href"] for phrase_tag in quotes]

In [None]:
# Base URL that the hrefs collected from the phrase list are resolved against
# when fetching individual phrase pages (keep the trailing slash).
BASE_LINK = "https://www.phrases.org.uk/meanings/"


def get_explanations(url, timeout=30):
    """Fetch one phrase page and return the text of its meaning paragraph.

    Parameters
    ----------
    url : str
        The href of a phrase page as found in the phrase list. It is resolved
        against ``BASE_LINK``, so relative, root-relative and absolute hrefs
        are all handled.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so one stalled page cannot hang the whole scraping loop.

    Returns
    -------
    str
        The whitespace-normalised text of the first ``<p class="meanings-body">``
        element, or ``"NO INFORMATION"`` when the server answers with an error
        status or the page has no such paragraph.
    """
    # urljoin instead of string concatenation: an absolute or root-relative
    # href would otherwise produce a malformed URL.
    response = req.get(urljoin(BASE_LINK, url), timeout=timeout)
    if not response.ok:
        # Do not parse an error page as if it were a phrase page.
        return "NO INFORMATION"

    html_soup = BeautifulSoup(response.text, "html.parser")
    meaning = html_soup.find("p", class_="meanings-body")
    if meaning is None:
        return "NO INFORMATION"

    return " ".join(meaning.text.split())


# number_of_quotes caps how many phrase pages are fetched; it is currently
# len(quotes), so every row is scraped.
# TODO: Remove number_of_quotes to scrap all rows
number_of_quotes = len(quotes)
explanations = list(
    map(
        get_explanations,
        tqdm(href_quotes[:number_of_quotes], position=0, leave=True),
    )
)

In [5]:
# Constructing the final dataframe.
# The text column uses cleaned_quotes (whitespace-normalised) rather than the
# raw tag text, so stray newlines and repeated spaces from the HTML do not end
# up in the dataset. The frame is built in one step from plain Python lists;
# the scalar "English" is broadcast to every row.
quotes_dataframe = pd.DataFrame(
    {
        "text": cleaned_quotes[:number_of_quotes],
        "explanation": explanations,
        "origin": "English",
    }
)

quotes_dataframe.head()

Unnamed: 0,text,explanation,origin
0,A bird in the hand is worth two in the bush,The proverb 'A bird in the hand is worth two i...,English
1,A bolt from the blue,"A complete surprise, like a bolt of lightning ...",English
2,A bunch of fives,"'A bunch of fives' is a slang term for a fist,...",English
3,A chain is only as strong as its weakest link,The proverb 'A chain is only as strong as its ...,English
4,A change is as good as a rest,A change is as good as a rest is a proverb tha...,English


In [45]:
# Write the scraped phrases to a CSV file located relative to the Django
# project's BASE_DIR.
sayings = os.path.join(settings.BASE_DIR, "../dataset/sayings, proverbs, idioms/new.csv")
quotes_dataframe.to_csv(path_or_buf=sayings)