In [9]:
# https://www.kaggle.com/ryanxjhan/cbc-news-coronavirus-articles-march-26

import re
import json
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 

# Folder holding the CBC news export referenced by the Kaggle link above.
# The trailing slash matters: file names are appended to it as plain text.
root_path = 'c:/py/covid19/data/'

# Load the raw articles, keeping the column names stored in the file.
df = pd.read_csv(f'{root_path}news.csv')

# Peek at the last rows as a quick check of what was loaded.
df.tail()

Unnamed: 0.1,Unnamed: 0,authors,title,publish_date,description,text,url
3561,4604,['The Associated Press'],South Korea declares end to MERS virus outbreak,2015-12-23 09:07:00,South Korea on Wednesday declared a formal end...,South Korea on Wednesday declared a formal end...,https://www.cbc.ca/news/health/korea-mers-outb...
3562,4605,['The Canadian Press'],MERS virus spread in hospitals should be focus...,2014-01-28 16:07:00,The spread of MERS coronavirus infections in h...,The spread of MERS coronavirus infections in h...,https://www.cbc.ca/news/health/mers-virus-spre...
3563,4606,['Cbc News'],Ellershausen Manor is for sale,2012-11-02 01:24:00,A clasic old home with a lot of history in now...,"It's a relic of a bygone era, when wealthy Nov...",https://www.cbc.ca/news/canada/nova-scotia/ell...
3564,4607,['Cbc News'],MERS virus from camels and humans called indis...,2014-04-29 16:57:00,Camels have been implicated in a Middle Easter...,Camels have been implicated in a Middle Easter...,https://www.cbc.ca/news/health/mers-virus-from...
3565,4608,['The Canadian Press'],"WHO team, including Canadian, investigates how...",2013-06-04 21:35:00,A World Health Organization-led group of exper...,A World Health Organization-led group of exper...,https://www.cbc.ca/news/health/who-team-includ...


In [10]:
def clean_dialog(s):
    """Normalize raw text before word tokenization.

    Punctuation and underscores are deleted outright (they are NOT replaced
    by a space, so "don't" becomes "dont"), every run of whitespace is
    collapsed to a single space, and the result is lower-cased.
    Leading/trailing whitespace is collapsed to one space, not stripped.

    Args:
        s: Text to clean; must be a str.

    Returns:
        The cleaned, lower-cased string.
    """
    # Raw-string patterns: '\w' and '\s' inside a plain string literal are
    # invalid escape sequences and raise a warning on current Python versions.
    # '_' is listed explicitly because \w matches it.
    clean = re.sub(r'[^\w\s]|_', '', s)
    # Collapse any run of whitespace (spaces, tabs, newlines) to one space.
    clean = re.sub(r'\s+', ' ', clean)
    return clean.lower()

# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Extra filler tokens dropped on top of the NLTK list. Tokens reach the filter
# after clean_dialog() has deleted punctuation, so the hyphenated 'a-wimowheh'
# can only match in its collapsed form 'awimowheh'; the original spelling is
# kept for callers that pass tokens which were not cleaned first.
stop_words_ext = ['na', 'da', 'dot', 'doo', 'a-wimowheh', 'awimowheh', 'parum', 'pum']

def remove_stopwords(words):
    """Drop stop words from a sequence of tokens and join what is left.

    Tokens are compared exactly (case-sensitive) against the module-level
    ``stop_words`` and ``stop_words_ext`` collections.

    Args:
        words: Iterable of word tokens.

    Returns:
        One string with the surviving tokens separated by a single space,
        in their original order; '' when no token survives.
    """
    return ' '.join(
        word for word in words
        if word not in stop_words and word not in stop_words_ext
    )

In [11]:
# Per-article text features: each list receives one entry per row of df,
# in row order, so they can later be attached to df as columns.
sent_token, sent_qty, word_token, word_qty, clean_txt = [], [], [], [], []

# Walk the text column directly instead of df.iterrows(): only `text` is
# needed, and iterrows() constructs a whole Series for every row.
for raw_text in df['text']:
    # str(NaN) is the literal 'nan', which would be counted as a one-sentence,
    # one-word article; treat missing text as empty instead.
    text = '' if pd.isna(raw_text) else str(raw_text)
    # build tokens: sentences from the raw text, words from the cleaned text
    tokenized_text = sent_tokenize(text)
    tokenized_word = word_tokenize(clean_dialog(text))
    # append to lists
    sent_token.append(tokenized_text)
    sent_qty.append(len(tokenized_text))
    word_token.append(tokenized_word)
    word_qty.append(len(tokenized_word))
    clean_txt.append(remove_stopwords(tokenized_word))

In [None]:
analyser = SentimentIntensityAnalyzer()

# Polarity scores, one entry per row of df, in row order.
neg, neu, pos, com = [], [], [], []

# Walk the text column directly instead of df.iterrows(): only `text` is
# needed, and iterrows() constructs a whole Series for every row.
for raw_text in df['text']:
    # Score missing text (NaN/None) as an empty string rather than as the
    # literal word 'nan' that str() would produce.
    text = '' if pd.isna(raw_text) else str(raw_text)
    # generate polarity score
    score = analyser.polarity_scores(text)
    # append to lists
    neg.append(score.get('neg'))
    neu.append(score.get('neu'))
    pos.append(score.get('pos'))
    com.append(score.get('compound'))

In [None]:
# Attach the lists built in the earlier cells as new columns of df.
# The mapping order below is the order in which the columns are added.
new_columns = {
    # text features
    'clean_txt': clean_txt,
    'sentences': sent_token,
    'words': word_token,
    'sentences_qty': sent_qty,
    'words_qty': word_qty,
    # sentiment analysis fields
    'negative': neg,
    'neutral': neu,
    'positive': pos,
    'compound': com,
}
for column_name, values in new_columns.items():
    df[column_name] = values

df.head()

In [None]:
# Write the enriched frame into the same folder the input was read from.
# The path is built from root_path instead of repeating the folder literal,
# so the input and output locations cannot drift apart.
# Note the ';' separator and UTF-16 encoding: readers must pass matching options.
df.to_csv(root_path + 'cbc_new_features.csv', sep=';', encoding='utf-16', index=False)
# Summarize the columns and dtypes of the frame that was written.
df.info()