In [142]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import ast

In [138]:
df_path = "data/british_council/articles.csv"

# Import Data

In [134]:
def import_britishcouncil_articles():
    base_url = 'http://learnenglishteens.britishcouncil.org'
    url = base_url + '/study-break/graded-reading?utm_source=facebook&utm_medium=social&utm_campaign=bc-learnenglishteens'
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    page = requests.get(url, headers=headers)
    
    soup = BeautifulSoup(page.content, 'html.parser')
    filters = soup.select("div.views-row a")
    
    articles = []
    art_set = set()
    print("Select Articles...")
    for article in filters:
        article_href = article.get_attribute_list("href")[0]
        article_name = article_href.split("/")[-1][:-2]
        article_level = article_href[-1]
        if "1" == article_level and article_name not in art_set: 
            articles += [(article_href[:-2],article_name)]
            art_set.update([article_name])
            
    texts = []
    print("Get Texts...")
    for article in tqdm(articles):
        text_levels = []
        for level in ["-1","-2","-3"]:
            article_page = requests.get(base_url + article[0] + level, headers=headers)
            article_soup = BeautifulSoup(article_page.content, 'html.parser')
            text_parts = [text_part.get_text() for text_part in article_soup.select("div.field-name-body p")]
            text_levels += [text_parts]
        texts += [(article[1],*text_levels)]
        
    return pd.DataFrame(data=texts,columns=["article_name","raw1","raw2","raw3"])

In [165]:
df = import_britishcouncil_articles()

  0%|          | 0/22 [00:00<?, ?it/s]

Select Articles...
Get Texts...


100%|██████████| 22/22 [00:10<00:00,  2.15it/s]


In [166]:
df.head()

Unnamed: 0,article_name,raw1,raw2,raw3
0,walk-forest-level,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...
1,amazing-adventurers-level,[Do you ever dream about climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...
2,animals-city-level,"[One night in December 2011, a bear came into ...","[Recently, there have been many reports in new...","[Small animals like birds, squirrels, mice and..."
3,bully-level,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...
4,cheat-level,[Mo was waiting outside her class. She was fee...,[Mo was waiting in the corridor outside her cl...,[Mo was waiting in the corridor with her class...


In [183]:
def save_df(path,df):
    df.to_csv(path,sep="|",index=False)

In [167]:
save_df(df_path,df)

# Prepare Data

In [140]:
def load_df(path,list_cols):
    df = pd.read_csv(path,sep="|")
    df[list_cols] = df[list_cols].applymap(
    lambda x: ast.literal_eval(x))
    return df

In [175]:
df = load_df(df_path,["raw1","raw2","raw3"])
df.head()

Unnamed: 0,article_name,raw1,raw2,raw3
0,walk-forest-level,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...,[Going through the forest is my favourite part...
1,amazing-adventurers-level,[Do you ever dream about climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...,[Have you ever dreamt of climbing Mount Everes...
2,animals-city-level,"[One night in December 2011, a bear came into ...","[Recently, there have been many reports in new...","[Small animals like birds, squirrels, mice and..."
3,bully-level,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...,[Kay got another message as she was leaving fo...
4,cheat-level,[Mo was waiting outside her class. She was fee...,[Mo was waiting in the corridor outside her cl...,[Mo was waiting in the corridor with her class...


In [179]:
replace_strings = ["\xa0\n\t","\n\t","\xa0"]

In [180]:
def clean_data(replace_strings,raw_text):
    for string in replace_strings:
        raw_text = [part.replace(string," ") for part in raw_text]
    return raw_text

In [182]:
df[["cleaned1","cleaned2","cleaned3"]] = df[["raw1","raw2","raw3"]].applymap(lambda x: clean_data(replace_strings,x))

In [184]:
df_cleaned_path = "data/british_council/cleaned_articles.csv"
save_df(df_cleaned_path,df[df])

TypeError: save_df() missing 2 required positional arguments: 'path' and 'df'