# Scope labeling

In [18]:
import matplotlib.pyplot as plt
import numpy as np
import time
from deep_translator import GoogleTranslator
from datetime import datetime, timedelta
import parsedatetime as pdt
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import requests
import pandas as pd
import unicodedata
from selenium import webdriver
import re

In [19]:
# User settings: edit these three values before running the rest of the notebook.
my_name = "Lucas"  # <-- Modify here: labeler name, used in the exported file names
subject = "Flood"  # <-- Modify here: search query for the news scrape
year = 2019  # <-- Modify here: calendar year the search is restricted to

In [20]:
def get_lists_from_subject(subject, num_pages, date_limits = None):
    """Collect article links and publication dates from Google News result pages.

    A Chrome session is driven through Selenium: the search page is opened,
    the consent button is clicked when present, and up to ``num_pages`` result
    pages are parsed with BeautifulSoup.

    Parameters
    ----------
    subject : str
        Search query. It is percent-encoded before being placed in the URL.
    num_pages : int
        Maximum number of result pages to read. Scraping stops early when the
        "next page" control cannot be clicked.
    date_limits : tuple, optional
        ``(lower_date, higher_date)``; only the ``day``, ``month`` and ``year``
        attributes of each bound are read, to build a custom date-range filter.
        When ``None`` the search is not restricted by date.

    Returns
    -------
    tuple of (list, list)
        ``link_list`` holds the ``href`` of the first anchor of every result
        card; ``date_list`` holds, at the same index, the ``date`` parsed from
        the text of the card's last ``<span>``.
    """
    # Imported locally so the function does not depend on extra top-level imports.
    from urllib.parse import quote_plus
    from selenium.common.exceptions import NoSuchElementException, WebDriverException

    translator = GoogleTranslator(source='fr', target='en')

    cal = pdt.Calendar()
    now = datetime.now()

    PATH = "./chromedriver_win32/chromedriver.exe"

    # Percent-encode the query so spaces, '&', '#', ... cannot break out of the q= parameter.
    query = quote_plus(str(subject))

    # Build the URL before starting the browser: bad date_limits then fail
    # without leaving a Chrome process behind.
    if date_limits is not None:
        lower_date, higher_date = date_limits
        ld, lm, ly = str(lower_date.day), str(lower_date.month), str(lower_date.year)
        hd, hm, hy = str(higher_date.day), str(higher_date.month), str(higher_date.year)
        url = "https://www.google.com/search?q="+query+"&rlz=1C1CHBF_frFR863FR863&biw=1920&bih=880&sxsrf=APq-WBuYthkpiHNrhk_0YwH1w70zP27Xgg%3A1643812260630&source=lnt&tbs=cdr%3A1%2Ccd_min%3A"+lm+"%2F"+ld+"%2F"+ly+"%2Ccd_max%3A"+hm+"%2F"+hd+"%2F"+hy+"&tbm=nws&hl=en"
    else:
        url = "https://www.google.com/search?q="+query+"&rlz=1C1CHBF_frFR863FR863&biw=1920&bih=880&sxsrf=AOaemvI0XcPZB9YWw9GUVGwWTEXPDVqRxQ:1638967714934&source=lnms&tbm=nws&sa=X&ved=2ahUKEwjGlsnDntT0AhWTTcAKHeyuDk4Q_AUoAXoECAEQAw&hl=en"

    link_list = []
    date_list = []

    s = Service(PATH)
    driver = webdriver.Chrome(service=s)
    try:
        driver.get(url)

        # Accept the Google policy dialog; it is simply skipped when the button is absent.
        try:
            driver.find_element(By.XPATH, "//button[@class='VfPpkd-LgbsSe VfPpkd-LgbsSe-OWXEXe-k8QpJ VfPpkd-LgbsSe-OWXEXe-dgl2Hf nCP5yc AjY5Oe DuMIQc qfvgSe']").click()
        except NoSuchElementException:
            pass

        for i in range(num_pages):
            if i != 0:
                # No usable "next" control means the last result page was reached.
                try:
                    driver.find_element(By.ID, "pnnext").click()
                except WebDriverException:
                    break

            html_source = driver.page_source

            soup = BeautifulSoup(html_source, 'lxml')

            # Getting all g-card
            g_card_list = soup.find_all("g-card")

            for g_card in g_card_list:
                anchor = g_card.find("a")
                link = anchor.get('href') if anchor is not None else None
                spans = g_card.find_all("span")
                # Skip cards that carry no link or no date text instead of crashing.
                if link is None or not spans:
                    continue

                translated_date = translator.translate(spans[-1].text)
                parsed_date = cal.parseDT(translated_date, now)[0].date()

                # Append both values together so the two lists stay index-aligned.
                link_list.append(link)
                date_list.append(parsed_date)
    finally:
        # Always close the browser, even when scraping raised.
        driver.quit()

    print("Successfully scraped : ", len(link_list), " links")

    return link_list, date_list

def get_df_from_link_list(link_list, date_list):
    """Download every linked page and collect its title and article text.

    Parameters
    ----------
    link_list : list of str
        URLs to download.
    date_list : list
        Value stored in the "Date" column for the link at the same index.

    Returns
    -------
    pandas.DataFrame
        One row per link, always with the columns "Title", "Link", "Date" and
        "Content". A field that could not be extracted is left missing, and a
        page whose download or parsing failed yields a row of missing values.
    """
    my_timeout = 10  # passed to requests.get as the per-request timeout

    # Fixed schema: the columns exist even when link_list is empty or no page
    # exposes an <article>, so later df["Content"] lookups cannot raise KeyError.
    columns = ["Title", "Link", "Date", "Content"]

    data = []

    for i, link in enumerate(link_list):
        d = {}

        try:
            html_text = requests.get(link, timeout=my_timeout).text

            soup = BeautifulSoup(html_text, 'lxml')

            title = soup.find('title')
            if title is not None:
                d["Title"] = title.text

            d["Link"] = link
            d["Date"] = date_list[i]

            article = soup.find('article')
            if article is not None:
                # Every paragraph is followed by a space; the trailing one is stripped below.
                big_p = "".join(p.text + " " for p in article.find_all('p'))

                if big_p != "":
                    d["Content"] = unicodedata.normalize("NFKD", big_p).rstrip()
        except Exception:
            # Best effort: a slow or broken page must not abort the whole run.
            # `Exception` (not a bare except) lets Ctrl-C still interrupt the loop.
            print('Could not scrap page number ' + str(i) + ', try again another time.')

        data.append(d)

    return pd.DataFrame(data, columns=columns)

def clean(text):
    """Clean the text input.

    ``text`` is converted with ``str()`` and the substitutions below are
    applied in order. Returns the cleaned string.
    """
    # Raw strings throughout: "\." and "\(" are invalid escapes in normal literals.
    substitutions = (
        # paragraph numbers such as "12.<TAB>"; the dot is escaped so a plain
        # number followed by a tab (e.g. "2019<TAB>") is no longer deleted
        (r'[0-9]+\.\t', ''),
        # a newline followed by a space is dropped, any other newline becomes a space
        (r'\n ', ''),
        (r'\n', ' '),
        # possessive 's
        (r"'s", ''),
        # hyphens become spaces; an em dash followed by a space is dropped
        (r'-', ' '),
        (r'— ', ''),
        # double quotation marks
        (r'"', ''),
        # the period after the salutations Mr. / Mrs.
        (r'Mr\.', 'Mr'),
        (r'Mrs\.', 'Mrs'),
        # any reference to outside text: (...) or [...] including the brackets
        (r'[\(\[].*?[\)\]]', ''),
    )

    text = str(text)
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text

def get_df_from_subject(subject,num_pages, date_limits = None):
    """Scrape the links and dates for ``subject`` and return the article DataFrame.

    The arguments are forwarded unchanged to ``get_lists_from_subject``; the
    two lists it returns are passed straight to ``get_df_from_link_list``.
    """
    scraped = get_lists_from_subject(subject, num_pages, date_limits)
    links, dates = scraped
    article_df = get_df_from_link_list(links, dates)
    return article_df

In [21]:
# Search window: from January 1st to December 31st of `year`.
early_date, end_date = datetime(year, 1, 1), datetime(year, 12, 31)

# Scrape 4 result pages, drop the rows with a missing value, clean the article
# text and rename the cleaned column.
df = (
    get_df_from_subject(subject, 4, date_limits=(early_date, end_date))
    .dropna()
    .assign(Content=lambda frame: frame["Content"].apply(clean))
    .rename(columns={"Content": "Clean_content"})
)
print(f'There are {len(df.index)} usable articles')

# Label column, empty until the articles are labeled.
df['Scope'] = None

Successfully scraped :  40  links
Could not scrap page number 22, try again another time.
There are 25 usable articles


In [22]:
# You will be asked to label the articles one by one in this cell.
# Enter 1 if the article mentions a specific past event linked to your subject.
# Enter 0 otherwise (for example, if the article only contains predictions about the future).
# Press Enter without typing anything to leave an article unlabeled; it is
# offered again the next time this cell is run.

# Address the label column by name rather than assuming it is the last column.
scope_col = df.columns.get_loc("Scope")

for i in range(len(df)):
    # pd.isna is true for both None and NaN, so already-labeled rows are skipped.
    if pd.isna(df.iloc[i, scope_col]):
        print("Article", i+1, "/", len(df), ':')
        print(df.iloc[i].Clean_content)
        print()
        label = input("In Scope ? (0 or 1)").strip()
        # Re-ask on anything that is neither a label nor an empty answer, so a
        # typo cannot end up in the dataset.
        while label not in ("0", "1", ""):
            print("Invalid answer:", repr(label))
            label = input("In Scope ? (0 or 1)").strip()
        if label != "":
            df.iloc[i, scope_col] = int(label)
        print('############')
        print()
Article 1 / 25 :
By Jason Hanna and Marlena Baldacci, CNN  Updated 1235 GMT  March 27, 2019    Farmers in parts of Nebraska and Iowa had precious little time to move themselves from the floodwaters that rushed over their lands last week, so many left their livestock and last year harvest behind. CNN Paul P. Murphy and Katie Lobosco contributed to this report.

############

Article 2 / 25 :

############

Article 3 / 25 :

############

Article 4 / 25 :
By Julia Hollingsworth, CNN  Updated 2126 GMT  November 13, 2019   The worst flooding to hit Venice in more than 50 years has brought the historic city to its knees, its mayor said on Wednesday. Situazione drammatica pic.twitter.com/gS63ZK2j3Q CNN Gianluca Mezzofiore, Laura Perez Maestro and Livia Borghese contributed to this report.

############

Article 5 / 25 :
Rightwing parties reject proposals as lagoon city faces worst flooding in 53 years  Veneto’s regional council rejected a plan to combat climate change minutes before its offi

In [23]:
# Keep only the cleaned text and its label, renamed to "text" and "label",
# then write them to a CSV named after the labeler, the subject and the year.
df2 = df.rename(columns={'Clean_content': 'text', 'Scope': 'label'})[['text', 'label']]
df2.to_csv(f'scope_dataset_{my_name}_{subject}_{year}.csv', index=False)

In [24]:
# Preview the first five rows of the labeled dataset.
df2.head(n=5)

Unnamed: 0,text,label
1,"By Jason Hanna and Marlena Baldacci, CNN Upda...",1.0
3,"‘This is result of climate change,’ says Venic...",1.0
4,This video can not be played Severe flooding i...,
5,"By Julia Hollingsworth, CNN Updated 2126 GMT ...",1.0
7,Rightwing parties reject proposals as lagoon c...,1.0


### (Optional) Split your data into train and test sets

In [17]:
# Fraction of the rows that goes to the training set.
train_test_ratio = 0.6
# Number of training rows: len(df2) * ratio rounded up (the -0.01 keeps a
# product that is already a whole number from gaining an extra row).
x = int(len(df2)*train_test_ratio - 0.01) + 1
# Cut df2 at one position so both parts come from the same frame, never
# overlap and together contain every row. The rows keep their current order.
train = df2.iloc[:x]
test = df2.iloc[x:]

In [18]:
# Write each split to its own CSV file, without the index column.
train.to_csv(f'train_{my_name}_{subject}.csv', index=False)
test.to_csv(f'test_{my_name}_{subject}.csv', index=False)