# Necessary import for getting directory and filenames

In [1]:
import os
import requests
import pandas as pd
import html
import feedparser
import re
import textacy
import spacy
import textacy.preprocessing as tprep
from bs4 import BeautifulSoup

# Use BeautifulSoup to extract, for each news article, the title/headline, the publication time, and the text. 

In [2]:
## URL from where all the news articles need to be downloaded
feed_reuters = feedparser.parse(
    'http://web.archive.org/web/20200613003232if_/http://feeds.reuters.com/Reuters/worldNews')

## directory in where all the html files are going to be piled up
CA_DIR = os.getcwd() + "/Sultana-CA03" 

info = [] #empty list for saving title, publication time and content of each news article

for url in feed_reuters.entries[:]:
    file_name = url.id.split("/")[-1] + ".html"
    html = requests.get(url.id)
    path = os.path.join(CA_DIR, file_name)
    
    with open(path, "w+") as f: ### w+ opens a file for both writing and reading
        f.write(html.text) 
        f.close()
    soup = BeautifulSoup(html.text, 'html.parser') ### read in an HTML file and work on it
    
    for title in soup.find_all('title'):
        tag_info = title.get_text() #extract title name
        
        ptime = soup.find("meta", { 'property': "og:article:published_time"})['content'] #extract publication time
            
        text = soup.get_text() #extract text
        
        details = {"Title":tag_info,
                    "Pucblication Time":ptime,
                  "Content":text}
        info.append(details)

# Store the extracted information into a Pandas DataFrame so that each news article is an entry and the columns include Title, Time, Content. 

In [3]:
data_raw= pd.DataFrame (info)
print (data_raw) #print the dataframe with raw contents

## Raw DF is then converted to a .csv file for the convenience of further analysis
data_raw.to_csv("./data_raw.csv") 
root_dir = os.getcwd()
file_dir = root_dir + "/data_raw.csv"
df = pd.read_csv(file_dir)

                                                Title     Pucblication Time  \
0   Mexico City to begin gradual exit from lockdow...  2020-06-13T00:25:34Z   
1   Mexico reports record tally of 5,222 new coron...  2020-06-13T00:16:17Z   
2   Venezuela top court names new electoral counci...  2020-06-12T23:38:23Z   
3   One-fifth of Britain's coronavirus patients we...  2020-06-12T23:02:17Z   
4   France to lift border controls for EU travelle...  2020-06-12T22:58:37Z   
5   Brazil's COVID-19 deaths surge past UK, WHO sa...  2020-06-12T20:03:58Z   
6   Canada's Trudeau calls arrest video of indigen...  2020-06-12T19:26:13Z   
7   Egypt registers highest daily rise in coronavi...  2020-06-12T22:10:31Z   
8   Artists around the world pay tribute to George...  2020-06-12T21:45:49Z   
9   Brazil's COVID-19 death toll passes Britain, w...  2020-06-12T21:56:06Z   
10  Lebanon protesters burn roads, clash with secu...  2020-06-12T21:13:32Z   
11  'Stop buying social peace at our expense', Fre..

# On the above dataset, cleanup the data using the clean and normalize functions 

In [4]:
if textacy.__version__ < '0.11':
    def normalize(text):
        text = tprep.normalize_hyphenated_words(text)
        text = tprep.normalize_quotation_marks(text)
        text = tprep.normalize_unicode(text)
        text = tprep.remove_accents(text)
        return text
else:
    # adjusted to textacy 0.11. Note, function names are changed
    def normalize(text):
        text = tprep.normalize.hyphenated_words(text)
        text = tprep.normalize.quotation_marks(text)
        text = tprep.normalize.unicode(text)
        text = tprep.remove.accents(text)
        return text

In [5]:
# ### Standard cleaning function
def clean(text):
    # convert html escapes like & to characters.
    try: 
        text = html.unescape(text) 
    except AttributeError:
        print("Attribute error: ignored plz")
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    # removing mentions 
    text = re.sub("@\S+", "", text)
    text = re.sub('[-%!@#$]', '', text)
    text = re.sub("@[A-Za-z0-9]+","",text)
    #Removing numerical data
    text = re.sub(r'\d+','',text)
    #Removing currencies 
    text = re.sub(r'[\$\d+\d+\$]','',text)
    #Handling all date formats
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    #Removing a hyperlink
    text = re.sub(r'https?:\/\/.*[\r\n]*', '',text)
    #Extracting the main domain name of a URL
#     text = re.search(r'[\.\/]+(.*)\.',text)
#     #Removing all punctuation
    text = re.sub(r'[^a-z0-9A-Z_]',' ',text)
    return text.strip()

In [6]:
df["Content"] = df["Content"].map(lambda x: clean(x))       #Clean the Content column
df["Content"] = df["Content"].map(lambda x: normalize(x))   #Normalise the Content column

##Store the contents in a list named 'txt' for further processing
txt = []
for row in df["Content"]:
    txt.append(row)

Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz
Attribute error: ignored plz


# for each article, lemmas, nounds, noun-phrases, and entity pairs. Combine them into a dictionary and save it to a file in a format of your choice

In [7]:
nlp = spacy.load("en_core_web_sm") ##English pipeline optimized

In [8]:
##Convert each article content into spacy defined pipeline
doc = []
for j in range(0,len(txt)):
    d = nlp(txt[j])
    doc.append(d)

In [9]:
##Function for generating Lemma, POS, Noun_Phrase and Entity Pairs for each article and 
##save them in a dict called 'row' then this dict is converted into dataframe df_cln

def display_nlp(doc, include_punct=False):
    """Generate data frame for visualization of spaCy tokens."""
    rows = []
    for i, t in enumerate(doc):
        if not t.is_punct or include_punct:
            row = {'Content':doc,
                   'token':i,
                   'text': t.text, 'lemma_': t.lemma_, 
                   'Part of Speech': t.pos_, "Noun Phrase":{chunk.text for chunk in doc.noun_chunks}, 
                   "Entity Pairs":{ent.text for ent in doc.ents}}
            rows.append(row)
    
    df_cln = pd.DataFrame(rows).set_index('token')
    df_cln.index.name = "Index"
    return df_cln

In [10]:
##Pass each content into the above function and save it into a .csv file. For overwrite mode = 'a' is chosen
for t in range (len(doc)):
    df_up = display_nlp(doc[t])
    df_up.to_csv('./data_category.csv',mode='a',index=False )

In [11]:
##Visualize the .csv file
categorized_file = root_dir + "/data_category.csv" #file_location
df_segmented = pd.read_csv(categorized_file) #read the file

In [12]:
df_segmented.head(5) #Show top 5 rows

Unnamed: 0,Content,text,lemma_,Part of Speech,Noun Phrase,Entity Pairs
0,Mexico City to begin gradual exit from lockdow...,Mexico,Mexico,PROPN,"{'gradual exit', 'Friday', 'very orderly trans...","{'Thursday', 'Friday', 'minutes', 'Raul Cortes..."
1,Mexico City to begin gradual exit from lockdow...,City,City,PROPN,"{'gradual exit', 'Friday', 'very orderly trans...","{'Thursday', 'Friday', 'minutes', 'Raul Cortes..."
2,Mexico City to begin gradual exit from lockdow...,to,to,PART,"{'gradual exit', 'Friday', 'very orderly trans...","{'Thursday', 'Friday', 'minutes', 'Raul Cortes..."
3,Mexico City to begin gradual exit from lockdow...,begin,begin,VERB,"{'gradual exit', 'Friday', 'very orderly trans...","{'Thursday', 'Friday', 'minutes', 'Raul Cortes..."
4,Mexico City to begin gradual exit from lockdow...,gradual,gradual,ADJ,"{'gradual exit', 'Friday', 'very orderly trans...","{'Thursday', 'Friday', 'minutes', 'Raul Cortes..."


In [13]:
df_segmented.tail(5) #show last 5 rows

Unnamed: 0,Content,text,lemma_,Part of Speech,Noun Phrase,Entity Pairs
8281,Coronavirus hitting the Americas hardest says ...,,,SPACE,"{'exchanges', 'other parts', 'the global hotsp...","{'Americas', 'Friday', 'minutes', 'four', 'Nor..."
8282,Coronavirus hitting the Americas hardest says ...,All,all,DET,"{'exchanges', 'other parts', 'the global hotsp...","{'Americas', 'Friday', 'minutes', 'four', 'Nor..."
8283,Coronavirus hitting the Americas hardest says ...,Rights,Rights,PROPN,"{'exchanges', 'other parts', 'the global hotsp...","{'Americas', 'Friday', 'minutes', 'four', 'Nor..."
8284,Coronavirus hitting the Americas hardest says ...,Reserved,Reserved,PROPN,"{'exchanges', 'other parts', 'the global hotsp...","{'Americas', 'Friday', 'minutes', 'four', 'Nor..."
8285,Coronavirus hitting the Americas hardest says ...,forphoneonlyfortabletportraitupfortabletlandsc...,forphoneonlyfortabletportraitupfortabletlandsc...,NOUN,"{'exchanges', 'other parts', 'the global hotsp...","{'Americas', 'Friday', 'minutes', 'four', 'Nor..."
