In [15]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.corpus import stopwords

# Shared lemmatizer instance used when normalising article tokens.
lemmatizer = WordNetLemmatizer()

# Stop words = NLTK's English list plus every line of remove_words.txt
# (trailing newline stripped). The file is read inside a `with` block so the
# handle is closed as soon as the list is built instead of being leaked.
with open("remove_words.txt", "r") as remove_words_file:
    extra_stop_words = [line.strip("\n") for line in remove_words_file]

stop_words = set(stopwords.words('english') + extra_stop_words)

In [16]:
# Load the article corpus, keeping only the title, the body text and the
# `relevant` label column (compared against 0/1 in the evaluation cell).
articles_df = pd.read_csv(
    'all_articles.csv',
    usecols=['article_title', 'text', 'relevant'],
)
articles_df

Unnamed: 0,article_title,text,relevant
0,Intervention was the right thing to do,News Intervention was the right thing to do FI...,0
1,SIX STATES FILE A CHALLENGE TO THE BIDEN ADMIN...,SIX STATES FILE A CHALLENGE TO THE BIDEN ADMIN...,0
2,Senator Reverend Warnock Introduces Bill to En...,Senator Reverend Warnock Introduces Bill to En...,1
3,10 Markets Where Sellers Are Cutting Home Pric...,Daily 10 Markets Where Sellers Are Cutting Hom...,1
4,"Biden touts Inflation Reduction Act, critics s...","Biden touts Inflation Reduction Act, critics s...",0
5,ULTIMUTT DEAL Aldi’s new £40 Specialbuy item i...,"Money,News Money,Aldi,Deals and sales,Dogs,Mon...",0
6,Students react to Biden's student debt relief ...,Students react to Biden's student debt relief ...,1
7,A GOP attorney general says people are 'celebr...,A GOP attorney general says people are 'celebr...,0
8,Energy deal ban: Ofgem green lights major chan...,Energy deal ban: Ofgem green lights major chan...,0
9,Couple who bought B&Q plant for just £12 disco...,Money Couple who bought B&Q plant for just £12...,0


In [17]:
# Word table whose `weighted_score` is ADDED to an article's score whenever
# the word occurs in the article text.
relevant_df = pd.read_csv(
    "relevant_test.csv",
    usecols=[
        'word',
        'frequency',
        'frequency_scaled',
        'relevancy_score',
        'weighted_score',
    ],
)
relevant_df

Unnamed: 0,word,frequency,frequency_scaled,relevancy_score,weighted_score
0,retirement,697,0.593697,42.553239,25.26372
1,tax,627,0.534072,41.630844,22.23385
2,pension,555,0.472743,45.639735,21.575855
3,plan,484,0.412266,49.539084,20.423268
4,income,483,0.411414,48.356872,19.894693
5,rate,405,0.344974,52.045179,17.954257
6,saving,408,0.34753,50.174883,17.437268
7,money,421,0.358603,45.327607,16.254619
8,financial,423,0.360307,44.644221,16.085609
9,pay,362,0.308348,51.447258,15.863635


In [18]:
# Word table whose `weighted_score` is SUBTRACTED from an article's score
# whenever the word occurs in the article text.
irrelevant_df = pd.read_csv(
    "irrelevant_test.csv",
    usecols=[
        'word',
        'frequency',
        'frequency_scaled',
        'relevancy_score',
        'weighted_score',
    ],
)
irrelevant_df

Unnamed: 0,word,frequency,frequency_scaled,relevancy_score,weighted_score
0,fund,1174,1.0,41.370871,41.370871
1,investment,774,0.659284,38.845673,25.61035
2,company,673,0.573254,40.621893,23.286656
3,service,636,0.541738,40.166313,21.759604
4,market,747,0.636286,33.760128,21.481103
5,rate,603,0.513629,40.546618,20.825904
6,time,607,0.517036,38.713024,20.016018
7,plan,556,0.473595,39.202262,18.565978
8,price,522,0.444634,38.949426,17.318228
9,tax,597,0.508518,33.795242,17.185485


In [19]:
# Characters deleted from every token before it is lemmatised.
_TOKEN_NOISE = str.maketrans("", "", " ,.:/-()")
# Lemmas that are rejected; presumably the truncated results WordNet gives
# for "has" / "was" (TODO confirm). The cleaned token is kept instead.
_REJECTED_LEMMAS = ("ha", "wa")


def _normalise_token(raw_token):
    """Lower-case, trim and de-punctuate a token, then lemmatise it.

    Returns the lemma, unless the lemma is one of ``_REJECTED_LEMMAS``, in
    which case the cleaned (un-lemmatised) token is returned.
    """
    cleaned = raw_token.lower().strip().translate(_TOKEN_NOISE)
    lemma = lemmatizer.lemmatize(cleaned)  # lemmatise once, not three times
    return cleaned if lemma in _REJECTED_LEMMAS else lemma


def _score_text(text, relevant_scores, irrelevant_scores):
    """Return the relevancy score of one article text.

    Every token that is neither a stop word nor punctuation adds its weight
    from ``relevant_scores`` and subtracts its weight from
    ``irrelevant_scores``; a word present in both tables contributes the
    difference.
    """
    score = 0.0
    for sentence in sent_tokenize(text):
        for raw_token in sentence.split(" "):
            token = _normalise_token(raw_token)
            # string.punctuation is a str, so this is a substring test; it
            # also rejects the empty string left after cleaning.
            if token in stop_words or token in string.punctuation:
                continue
            if token in relevant_scores:
                score += relevant_scores[token]
            if token in irrelevant_scores:
                score -= irrelevant_scores[token]
    return score


# Build word -> weight lookups once instead of filtering the DataFrames (and
# calling float() on a Series) for every token. If a word is listed more than
# once in a table, the later row wins.
relevant_scores = {
    word: float(weight)
    for word, weight in zip(relevant_df['word'], relevant_df['weighted_score'])
}
irrelevant_scores = {
    word: float(weight)
    for word, weight in zip(irrelevant_df['word'], irrelevant_df['weighted_score'])
}

# Score every row independently. Predictions are written back by row position
# so that articles sharing a title no longer overwrite each other's result.
all_articles_relevancy = {}
article_scores = []
for title, text in zip(articles_df['article_title'], articles_df['text']):
    article_score = _score_text(text, relevant_scores, irrelevant_scores)
    article_scores.append(article_score)
    # Kept for inspection by title; with duplicate titles the last row wins.
    all_articles_relevancy[title] = article_score

# An article is predicted relevant (1.0) when its score is non-negative.
# NOTE(review): a score of exactly 0 (no listed word found) counts as relevant.
articles_df['predicted'] = [1.0 if score >= 0 else 0.0 for score in article_scores]

pd.set_option('display.max_rows', 1000)
articles_df[['relevant', 'predicted']]

Unnamed: 0,relevant,predicted
0,0,0.0
1,0,1.0
2,1,0.0
3,1,0.0
4,0,1.0
5,0,0.0
6,1,1.0
7,0,1.0
8,0,0.0
9,0,0.0


In [20]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Keeps the metrics defined when there are no predicted positives, no actual
    positives, or no rows at all, instead of raising ZeroDivisionError.
    """
    return numerator / denominator if denominator else 0.0


# Confusion-matrix counts from vectorised comparisons of label vs prediction.
actual = articles_df['relevant']
predicted = articles_df['predicted']

true_positive = int(((actual == 1) & (predicted == 1)).sum())
true_negative = int(((actual == 0) & (predicted == 0)).sum())
false_positive = int(((actual == 0) & (predicted == 1)).sum())
false_negative = int(((actual == 1) & (predicted == 0)).sum())
# Any row whose prediction differs from its label counts as an error.
num_ers = int((actual != predicted).sum())

print(f"True positive = {true_positive}")
print(f"True Negative = {true_negative}")
print(f"False_negative = {false_negative}")
print(f"False_positive = {false_positive}")
print(f"Total number of errors = {num_ers}/{len(articles_df)}")

precision = _safe_ratio(true_positive, true_positive + false_positive)
recall = _safe_ratio(true_positive, true_positive + false_negative)

eval_dict = {
    'true_positive': true_positive,
    'true_negative': true_negative,
    'false_positive': false_positive,
    'false_negative': false_negative,
    'accuracy': _safe_ratio(
        true_positive + true_negative,
        true_positive + true_negative + false_negative + false_positive,
    ),
    'precision': precision,
    'recall': recall,
    'f1_score': _safe_ratio(2 * precision * recall, precision + recall),
}

print(f"Accuracy = {eval_dict['accuracy']}")
print(f"Precision = {eval_dict['precision']}")
print(f"Recall = {eval_dict['recall']}")
print(f"F1-Score = {eval_dict['f1_score']}")

# One-row frame holding all metrics. The export below is disabled because
# `model_name` is not defined in the visible cells.
eval_df = pd.DataFrame (pd.Series(eval_dict)).T
#eval_df.to_csv('results/'+model_name+'/'+model_name+'_metrics.csv')


True positive = 173
True Negative = 338
False_negative = 79
False_positive = 132
Total number of errors = 211/722
Accuracy = 0.7077562326869806
Precision = 0.5672131147540984
Recall = 0.6865079365079365
F1-Score = 0.6211849192100539


In [21]:
# Percentage of articles labelled relevant (relevant == 1) among all rows.
relevant_rows = articles_df[articles_df['relevant'] == 1]
len(relevant_rows) / len(articles_df) * 100

34.903047091412745