In [3]:
import nltk
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [4]:
# Fetch the CBC front page and collect the headline of every story card.
url_cbc = "https://www.cbc.ca/"
# A timeout keeps a stalled connection from hanging the kernel forever.
response = requests.get(url_cbc, timeout=10)
# Fail loudly on 4xx/5xx instead of silently scoring an error page.
response.raise_for_status()

soup = bs(response.content, 'html.parser')

corpus_CBC = []
for article in soup.find_all('div', class_='contentWrapper'):
    headline = article.find('h3', class_='headline')
    # find() returns None when a wrapper has no matching <h3>; skip those
    # wrappers instead of crashing on `.text`.
    if headline is None:
        continue
    corpus_CBC.append(headline.text.strip())


# Score each headline with the rule-based analyzer from NLTK's VADER module.
# The analyzer needs the `vader_lexicon` resource; fetch it only when it is
# missing so the cell also runs on a fresh environment.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# 'compound' is the single aggregate value returned by polarity_scores().
scores = [sia.polarity_scores(title)['compound'] for title in corpus_CBC]
df = pd.DataFrame({'Titles': corpus_CBC, 'Scores': scores})
print(df)


                                               Titles  Scores
0   Fallen Edmonton police officers were ambushed,... -0.5719
1   Foreign interference is the 'greatest strategi...  0.4939
2   'We were shocked': Int'l med students who fled... -0.2732
3   Fairleigh Dickinson shocks Purdue to become se...  0.0000
4   Actor Lance Reddick, known for The Wire and Jo... -0.6486
5   2 Edmonton police officers were ambushed, shot... -0.2960
6   The International Criminal Court wants Vladimi... -0.7579
7                                        The National  0.0000
8   Teen signs one-day contract with Pittsburgh Pe...  0.0000
9   Parents speak out after son, 5, overdoses due ... -0.4019
10                Canadian 'super pigs' are a problem  0.2960
11  MP's questioning of UWindsor law prof during f... -0.6597
12              Watch CBC News Explore free streaming  0.5106
13  Marketplace: Retail Tricks — putting jeans siz... -0.1280
14  Push: Welcome to the world of the 'Wheelie Peeps'  0.4588
15  A ma

In [5]:
# Locate the movie review corpus: one sub-directory per sentiment label
# ("neg" holds the negative reviews, "pos" the positive ones).
file_path = "./mix20_rand700_tokens_cleaned/tokens/"
neg_files, pos_files = (
    os.listdir(file_path + label) for label in ("neg", "pos")
)


In [6]:
def tokenize_files(file_set, path):
    """Read each file in ``file_set`` from ``path`` and split it into sentences.

    Args:
        file_set: Iterable of file names located inside ``path``.
        path: Directory containing the files. A trailing separator is
            optional.

    Returns:
        A flat list with the sentences of every file, in the order the
        files appear in ``file_set``.
    """
    tokenized_data = []
    for file in file_set:
        # os.path.join builds a valid path whether or not `path` ends with a
        # separator; plain string concatenation silently built a wrong one.
        full_path = os.path.join(path, file)
        # errors='ignore' drops bytes that are not valid UTF-8 rather than
        # raising, so a stray byte does not abort the whole run.
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
            content = f.read()
        # Split the raw text into sentences once the file handle is closed.
        tokenized_data.extend(nltk.sent_tokenize(content))
    return tokenized_data


In [7]:
# Sentence-tokenize every review, keeping the two sentiment classes apart.
neg_data = tokenize_files(file_set=neg_files, path=file_path + "neg/")
pos_data = tokenize_files(file_set=pos_files, path=file_path + "pos/")


In [11]:
# Compound VADER score for every sentence of each review class.
scores_pos = [sia.polarity_scores(text)['compound'] for text in pos_data]
scores_neg = [sia.polarity_scores(text)['compound'] for text in neg_data]

# Every sentence is assumed to carry the label of the review it came from.
# A prediction counts as wrong only when its sign contradicts that label, so
# a compound score of exactly 0 is never counted as an error for either class.
false_result = (
    sum(1 for value in scores_neg if value > 0)
    + sum(1 for value in scores_pos if value < 0)
)

accuracy = 1 - (false_result / (len(scores_neg) + len(scores_pos)))
print("Accuracy is {}%".format(accuracy*100))

Accuracy is 69.88380736277924%


In [12]:
# Mean compound score per class: total of the scores divided by their count.
pos_total, neg_total = sum(scores_pos), sum(scores_neg)
average_score_pos = pos_total / len(scores_pos)
average_score_neg = neg_total / len(scores_neg)

print('Average score for positive data is:', average_score_pos)
print('Average score for negative data is:', average_score_neg)

Average score for positive data is: 0.10574544504643899
Average score for negative data is: 0.01536359860770514
