In [1]:
import pandas as pd
import urllib.request,sys,time
from bs4 import BeautifulSoup
import requests
import csv

In [2]:
# Load the list of article URLs to scrape from the CSV file; the trailing
# bare `df` renders the loaded frame as the cell output.
df = pd.read_csv('cna_urls_combined.csv')
df

Unnamed: 0,Page,URL
0,0,https://www.channelnewsasia.com/news/singapore...
1,0,https://www.channelnewsasia.com/news/sport/ten...
2,0,https://www.channelnewsasia.com/news/singapore...
3,0,https://www.channelnewsasia.com/news/lifestyle...
4,0,https://www.channelnewsasia.com/news/singapore...
...,...,...
230,23,https://www.channelnewsasia.com/news/cnainside...
231,23,https://www.channelnewsasia.com/news/commentar...
232,23,https://www.channelnewsasia.com/news/commentar...
233,23,https://www.channelnewsasia.com/news/cnainside...


In [3]:
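# Fields extracted per article by the helpers below: title, date, text,
# covid (True/False), tags, sg (True/False). The URL comes from the input CSV;
# reader comments are not scraped.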

def get_title(soup):
    """Return the text of the page's ``<title>`` element.

    Parameters
    ----------
    soup : parsed HTML document exposing ``find`` (e.g. a BeautifulSoup object).

    Returns
    -------
    str
        The title text exactly as found in the markup, or ``''`` when the
        document has no ``<title>`` element.
    """
    title_tag = soup.find('title')
    # find() yields None when nothing matches; dereferencing .text on None
    # would raise AttributeError and abort the whole scraping run.
    if title_tag is None:
        return ''
    return title_tag.text

def get_date(soup):
    """Extract the publication date of a page as a ``DD/MM/YYYY`` string.

    The first ``<time>`` element matching ``time.article__details-item`` is
    used; if there is none, the first ``time.video-stage__details-item``
    element is used instead.

    Parameters
    ----------
    soup : parsed HTML document exposing ``select`` (e.g. a BeautifulSoup object).

    Returns
    -------
    str or None
        The date formatted with ``%d/%m/%Y``, or ``None`` when no matching
        ``<time>`` element exists or its text is empty.

    Raises
    ------
    Whatever ``pd.to_datetime`` raises when the text cannot be parsed as a date.
    """
    # Selectors are tried in priority order; each is queried only once.
    for selector in ('time.article__details-item', 'time.video-stage__details-item'):
        matches = soup.select(selector)
        if matches:
            article_date = matches[0].text.strip()
            break
    else:
        # Neither selector matched: report "no date" rather than crashing
        # with IndexError in the middle of a long scraping run.
        return None

    # `infer_datetime_format` is deprecated in pandas 2.x (format inference is
    # the default there), so it is no longer passed.
    parsed = pd.to_datetime(article_date)
    if pd.isna(parsed):
        # Empty text parses to NaT, which does not support strftime.
        return None
    return parsed.strftime("%d/%m/%Y")
        
def get_text(soup):
    """Return the article body as newline-terminated paragraphs.

    The body container is the first ``<div>`` with class ``c-rte--article``;
    if the page has none, the first ``<div>`` with class ``c-rte--light`` is
    used instead.

    Parameters
    ----------
    soup : parsed HTML document exposing ``find`` (e.g. a BeautifulSoup object).

    Returns
    -------
    str
        The text of every ``<p>`` inside the container, each followed by
        ``'\\n'``. Returns ``''`` when neither container is present or the
        container holds no paragraphs.
    """
    article = soup.find("div", attrs={"class": "c-rte--article"})
    if article is None:
        article = soup.find("div", attrs={"class": "c-rte--light"})
    if article is None:
        # Unknown page layout: return empty text instead of raising
        # AttributeError on None.find_all.
        return ''
    return ''.join(paragraph.text + '\n' for paragraph in article.find_all("p"))

def get_tags(soup):
    """Collect the topic tags attached to an article page.

    Parameters
    ----------
    soup : parsed HTML document exposing ``find_all`` (e.g. a BeautifulSoup object).

    Returns
    -------
    list of str
        Text of each tag link. When the page has exactly one
        ``ul.link-list__list``, the links are every ``<a>`` in the document
        whose class is ``"link-list__link text-white"``; when it has two or
        more, they are the ``<a>`` elements inside the second list. An empty
        list is returned when the page has no such ``<ul>`` at all.
    """
    # Query once and reuse; `find_all` is the current name of the legacy
    # `findAll` alias.
    tag_lists = soup.find_all('ul', attrs={'class': "link-list__list"})
    if not tag_lists:
        # No link list on the page: previously this hit IndexError on [1].
        return []

    if len(tag_lists) == 1:
        anchors = soup.find_all('a', attrs={'class': "link-list__link text-white"})
    else:
        # NOTE(review): the second list is presumably the tag list and the
        # first some other navigation list — confirm against the page markup.
        anchors = tag_lists[1].find_all("a")
    return [anchor.text for anchor in anchors]

def covid_related(title, content):
    """Tell whether an article mentions any COVID-related keyword.

    Matching is a case-insensitive substring search over the title first and
    then the content; the content is only inspected when the title does not
    match.

    Returns True if any keyword occurs in either string, otherwise False.
    """
    keywords = ['covid-19','pandemic','restriction','social distancing','circuit breaker','pre-covid','coronavirus','quarantine','lockdown','wfh']

    def mentions_keyword(raw):
        lowered = raw.lower()
        return any(keyword in lowered for keyword in keywords)

    return mentions_keyword(title) or mentions_keyword(content)

def sg_related(title, content):
    """Tell whether an article mentions Singapore.

    Performs a case-insensitive substring search for ``'singapore'`` in the
    title and, only if that fails, in the content.

    Returns True on a match in either string, otherwise False.
    """
    needle = 'singapore'
    return needle in title.lower() or needle in content.lower()

In [4]:
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}

def get_url_data(url, timeout=30):
    """Download one article page and extract its fields.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        Keys ``'title'``, ``'date'``, ``'text'``, ``'covid'``, ``'tags'`` and
        ``'sg'`` (in that order) for a successfully fetched page. An empty
        dict is returned when the URL contains ``'cna-lifestyle'`` or
        ``'cna-luxury'``, when the request fails, or when the server answers
        with an error status.
    """
    # NOTE(review): these sections are skipped without fetching — presumably
    # their pages use a different layout; confirm before removing the guard.
    if 'cna-lifestyle' in url or 'cna-luxury' in url:
        return {}

    try:
        # Without a timeout a single stalled connection hangs the whole run.
        result = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Skipping {url}: request failed ({exc})", file=sys.stderr)
        return {}

    # Any 4xx/5xx body is an error page, not an article; the previous check
    # only excluded 404 and tried to parse everything else.
    if not result.ok:
        print(f"Skipping {url}: HTTP {result.status_code}", file=sys.stderr)
        return {}

    soup = BeautifulSoup(result.text, 'html.parser')

    url_data = {}
    title = get_title(soup)
    url_data['title'] = title
    url_data['date'] = get_date(soup)
    text = get_text(soup)
    url_data['text'] = text
    url_data['covid'] = covid_related(title, text)
    url_data['tags'] = get_tags(soup)
    url_data['sg'] = sg_related(title, text)
    return url_data


In [5]:
# Scrape every URL once. Only the URL column is needed, so map over that
# Series instead of applying a lambda row by row over the whole frame.
scraped = df['URL'].map(get_url_data)

# Snapshot of the input frame plus the raw per-URL dicts in a 'data' column.
df_backup = df.assign(data=scraped)

# Expand the dicts into one column per key. Passing index=df.index makes the
# join align on df's own labels rather than on a fresh 0..n-1 range, so rows
# stay matched even if df does not carry the default index. URLs that yielded
# an empty dict get missing values in every new column.
df = df.join(pd.DataFrame(scraped.tolist(), index=df.index))

df

Unnamed: 0,Page,URL,title,date,text,covid,tags,sg
0,0,https://www.channelnewsasia.com/news/singapore...,‘Mental health is everyone’s business’: How so...,09/07/2021,"SINGAPORE: Amid a rapidly ageing society, the ...",True,"[COVID-19, elderly, mental health]",True
1,0,https://www.channelnewsasia.com/news/sport/ten...,Tennis-'Athletes are humans': Osaka stands by ...,08/07/2021,REUTERS: World number two Naomi Osaka said...,False,"[sports, mental health, Japan, French Open, Os...",False
2,0,https://www.channelnewsasia.com/news/singapore...,Suicide cases in Singapore highest in 8 years ...,08/07/2021,SINGAPORE: Singapore reported 452 suicides las...,True,"[suicide, mental health, Samaritans of Singapo...",True
3,0,https://www.channelnewsasia.com/news/lifestyle...,The number of young children who need mental h...,03/07/2021,"When Marie, 11, called a suicide prevention ho...",True,"[Wellness, wellbeing, mental health, children]",False
4,0,https://www.channelnewsasia.com/news/singapore...,AI in mental health screening: System enables ...,30/06/2021,Mental health screening is getting a boost fro...,False,"[mental health, AI]",False
...,...,...,...,...,...,...,...,...
230,23,https://www.channelnewsasia.com/news/cnainside...,"In a mental ward for nearly 35 years, he paint...",15/07/2018,"SINGAPORE: For nearly 35 years, Mr Sim Kah Lim...",False,"[art, mental health, CNA Insider]",True
231,23,https://www.channelnewsasia.com/news/commentar...,Commentary: We still fail to understand that s...,08/07/2018,SINGAPORE: Man is an amazing creature. We have...,False,"[mental health, bipolar disorder, schizophreni...",True
232,23,https://www.channelnewsasia.com/news/commentar...,"Commentary: Schizophrenia, a life increasingly...",09/06/2018,SINGAPORE: It was a cold night in January. A s...,False,"[mental health, schizophrenia, IMH, depression...",True
233,23,https://www.channelnewsasia.com/news/cnainside...,"He’s 48, and already grappling with dementia -...",07/06/2018,SINGAPORE: He was once an accomplished teacher...,False,"[mental health, CNA Insider]",True


In [6]:
# Keep only the rows flagged as Singapore-related, then drop the helper flag.
# The explicit comparison with True also filters out rows whose 'sg' value is
# missing (URLs for which nothing was scraped).
df = df.loc[df['sg'].eq(True)].drop(columns='sg')

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 0 to 234
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Page    106 non-null    int64 
 1   URL     106 non-null    object
 2   title   106 non-null    object
 3   date    106 non-null    object
 4   text    106 non-null    object
 5   covid   106 non-null    object
 6   tags    106 non-null    object
dtypes: int64(1), object(6)
memory usage: 6.6+ KB


In [7]:
# Render the frame left after the Singapore filter above.
df

Unnamed: 0,Page,URL,title,date,text,covid,tags
0,0,https://www.channelnewsasia.com/news/singapore...,‘Mental health is everyone’s business’: How so...,09/07/2021,"SINGAPORE: Amid a rapidly ageing society, the ...",True,"[COVID-19, elderly, mental health]"
2,0,https://www.channelnewsasia.com/news/singapore...,Suicide cases in Singapore highest in 8 years ...,08/07/2021,SINGAPORE: Singapore reported 452 suicides las...,True,"[suicide, mental health, Samaritans of Singapo..."
7,0,https://www.channelnewsasia.com/news/commentar...,Commentary: Why some night owls are flourishin...,25/06/2021,"SINGAPORE: To paraphrase Linda Evangelista, th...",True,"[sleep, rest, stress, mental health]"
9,0,https://www.channelnewsasia.com/news/commentar...,Commentary: Workers appreciate mental health d...,23/06/2021,SINGAPORE: You may have read about Naomi Osaka...,True,"[Mental Health, Work, COVID-19]"
10,1,https://www.channelnewsasia.com/news/commentar...,Commentary: Revenge bedtime procrastination in...,15/06/2021,SINGAPORE: The benefits of good sleep are well...,False,"[sleep, Netflix, e-devices, mobile phones, str..."
...,...,...,...,...,...,...,...
230,23,https://www.channelnewsasia.com/news/cnainside...,"In a mental ward for nearly 35 years, he paint...",15/07/2018,"SINGAPORE: For nearly 35 years, Mr Sim Kah Lim...",False,"[art, mental health, CNA Insider]"
231,23,https://www.channelnewsasia.com/news/commentar...,Commentary: We still fail to understand that s...,08/07/2018,SINGAPORE: Man is an amazing creature. We have...,False,"[mental health, bipolar disorder, schizophreni..."
232,23,https://www.channelnewsasia.com/news/commentar...,"Commentary: Schizophrenia, a life increasingly...",09/06/2018,SINGAPORE: It was a cold night in January. A s...,False,"[mental health, schizophrenia, IMH, depression..."
233,23,https://www.channelnewsasia.com/news/cnainside...,"He’s 48, and already grappling with dementia -...",07/06/2018,SINGAPORE: He was once an accomplished teacher...,False,"[mental health, CNA Insider]"


In [None]:
# Uncomment to save the filtered articles to CSV:
# df.to_csv('cna_articles.csv', index=False)