In [1]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.corpus import stopwords

# Lemmatizer shared by the token-normalisation step of the scoring cell.
lemmatizer = WordNetLemmatizer()

# Stop words = NLTK's English stop-word list plus every line of
# remove_words.txt (trailing newline removed). The file is read inside a
# `with` block so its handle is closed as soon as the words are loaded.
with open("remove_words.txt", "r") as remove_words_file:
    stop_words = set(
        stopwords.words('english')
        + [word.strip("\n") for word in remove_words_file]
    )

In [2]:
# Load the labelled articles, keeping only the title, the body text and the
# `relevant` label column; the frame is displayed as the cell output.
articles_df = pd.read_csv(
    'all_articles.csv',
    usecols=['article_title', 'text', 'relevant'],
)
articles_df

Unnamed: 0,article_title,text,relevant
0,Intervention was the right thing to do,News Intervention was the right thing to do FI...,0
1,SIX STATES FILE A CHALLENGE TO THE BIDEN ADMIN...,SIX STATES FILE A CHALLENGE TO THE BIDEN ADMIN...,0
2,Senator Reverend Warnock Introduces Bill to En...,Senator Reverend Warnock Introduces Bill to En...,1
3,10 Markets Where Sellers Are Cutting Home Pric...,Daily 10 Markets Where Sellers Are Cutting Hom...,1
4,"Biden touts Inflation Reduction Act, critics s...","Biden touts Inflation Reduction Act, critics s...",0
...,...,...,...
717,TESCO is set to increase its online delivery c...,TESCO is set to increase its online delivery c...,0
718,"STEVE INSKEEP: In California, homeowners' asso...","STEVE INSKEEP: In California, homeowners' asso...",0
719,Everyone knows to turn off the lights when you...,Everyone knows to turn off the lights when you...,0
720,PORTUGAL has been added to the green list of c...,PORTUGAL has been added to the green list of c...,0


In [3]:
# Load the keyword table whose `weighted_score` values are ADDED to an
# article's score in the scoring cell; the frame is displayed as the output.
relevant_df = pd.read_csv(
    "relevant_test.csv",
    usecols=[
        'word',
        'frequency',
        'frequency_scaled',
        'relevancy_score',
        'weighted_score',
    ],
)
relevant_df

Unnamed: 0,word,frequency,frequency_scaled,relevancy_score,weighted_score
0,saving,408,0.147359,48.639330,7.167465
1,pension,555,0.127768,45.188023,5.773597
2,retirement,697,0.116695,42.104593,4.913398
3,income,483,0.098807,46.642453,4.608624
4,woman,174,0.104770,40.012908,4.192153
...,...,...,...,...,...
65,husband,43,0.013629,36.832596,0.501977
66,overdraft,18,0.013629,36.532501,0.497888
67,firefighter,20,0.016184,29.802206,0.482318
68,confident,24,0.012777,37.605147,0.480475


In [4]:
# Load the keyword table whose `weighted_score` values are SUBTRACTED from an
# article's score in the scoring cell; the frame is displayed as the output.
irrelevant_df = pd.read_csv(
    "irrelevant_test.csv",
    usecols=[
        'word',
        'frequency',
        'frequency_scaled',
        'relevancy_score',
        'weighted_score',
    ],
)
irrelevant_df

Unnamed: 0,word,frequency,frequency_scaled,relevancy_score,weighted_score
0,fund,1174,0.799830,53.342988,42.665303
1,company,673,0.423339,52.792186,22.348992
2,market,747,0.522147,42.478356,22.179925
3,service,636,0.388416,52.470224,20.380257
4,investment,774,0.418228,48.667711,20.354213
...,...,...,...,...,...
65,law,180,0.095400,43.227933,4.123960
66,trust,193,0.096252,42.592488,4.099618
67,mortgage,445,0.094549,40.214602,3.802232
68,county,135,0.088586,42.049779,3.725023


In [5]:
# Keyword-based relevance scoring.
#
# Every article gets a net score: the sum of `weighted_score` for each
# occurrence of a word from `relevant_df`, minus the sum of `weighted_score`
# for each occurrence of a word from `irrelevant_df`. An article is predicted
# relevant (1.0) when its net score is >= 0, otherwise 0.0.

# word -> weighted_score lookups, built once. This replaces rebuilding
# list(df['word']) and filtering the frame for every single token, and avoids
# float(<one-element Series>), which newer pandas versions deprecate.
# If a word is listed more than once in a table, its last row is used.
_relevant_scores = dict(zip(
    relevant_df['word'].tolist(),
    relevant_df['weighted_score'].astype(float).tolist(),
))
_irrelevant_scores = dict(zip(
    irrelevant_df['word'].tolist(),
    irrelevant_df['weighted_score'].astype(float).tolist(),
))

# Characters deleted from every token after lower-casing and stripping.
_DROPPED_CHARS = str.maketrans("", "", " ,.:/-()")


def _normalise_token(raw_token):
    """Return the lower-cased, stripped, de-punctuated and lemmatized token.

    The lemma is used unless it is 'ha' or 'wa', in which case the cleaned but
    unlemmatized token is kept (presumably to stop words such as "has"/"was"
    from being truncated - confirm against the lemmatizer's behaviour).
    """
    token = raw_token.lower().strip().translate(_DROPPED_CHARS)
    lemma = lemmatizer.lemmatize(token)
    return token if lemma in ('ha', 'wa') else lemma


def _score_text(text):
    """Return the net keyword score (relevant minus irrelevant) of one text."""
    score = 0
    for sentence in sent_tokenize(text):
        for raw_token in sentence.split(" "):
            token = _normalise_token(raw_token)

            # `in string.punctuation` is a substring test, so it also rejects
            # the empty token left behind when a token was pure punctuation.
            if token in stop_words or token in string.punctuation:
                continue

            if token in _relevant_scores:
                score += _relevant_scores[token]
            if token in _irrelevant_scores:
                score -= _irrelevant_scores[token]
    return score


# Title -> score mapping kept for inspection. Titles are not guaranteed to be
# unique, so when two articles share a title the later one overwrites the
# earlier entry here; the predictions below do NOT go through this dict.
all_articles_relevancy = {}

# Predictions are collected per row (in frame order) so that articles with
# duplicate or missing titles still get their own prediction.
_predicted_labels = []
for _, row in articles_df.iterrows():
    article_score = _score_text(row['text'])
    all_articles_relevancy[row['article_title']] = article_score
    _predicted_labels.append(1.0 if article_score >= 0 else 0.0)

articles_df['predicted'] = _predicted_labels

pd.set_option('display.max_rows', 1000)
articles_df[['relevant', 'predicted']]

Unnamed: 0,relevant,predicted
0,0,0.0
1,0,0.0
2,1,0.0
3,1,0.0
4,0,0.0
5,0,0.0
6,1,1.0
7,0,0.0
8,0,0.0
9,0,0.0


In [6]:
# Evaluate the predictions against the `relevant` labels: confusion-matrix
# counts, then accuracy / precision / recall / F1, printed and collected in
# `eval_dict` (and a one-row `eval_df`).


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is 0.

    Guards the metrics below against ZeroDivisionError, e.g. precision when
    no article was predicted relevant.
    """
    return numerator / denominator if denominator else 0.0


y_true = articles_df['relevant']
y_pred = articles_df['predicted']

# Confusion-matrix counts via boolean masks; int() keeps them plain Python ints.
true_positive = int(((y_true == 1) & (y_pred == 1)).sum())
true_negative = int(((y_true == 0) & (y_pred == 0)).sum())
false_positive = int(((y_true == 0) & (y_pred == 1)).sum())
false_negative = int(((y_true == 1) & (y_pred == 0)).sum())

# Rows where label and prediction disagree.
num_ers = int((y_true != y_pred).sum())

eval_dict = {}

print(f"True positive = {true_positive}")
print(f"True Negative = {true_negative}")
print(f"False_negative = {false_negative}")
print(f"False_positive = {false_positive}")
print(f"Total number of errors = {num_ers}/{len(articles_df)}")
eval_dict['true_positive'] = true_positive
eval_dict['true_negative'] = true_negative
eval_dict['false_positive'] = false_positive
eval_dict['false_negative'] = false_negative
eval_dict['accuracy'] = _safe_ratio(
    true_positive + true_negative,
    true_positive + true_negative + false_negative + false_positive,
)
eval_dict['precision'] = _safe_ratio(true_positive, true_positive + false_positive)
eval_dict['recall'] = _safe_ratio(true_positive, true_positive + false_negative)
eval_dict['f1_score'] = _safe_ratio(
    2 * eval_dict['precision'] * eval_dict['recall'],
    eval_dict['precision'] + eval_dict['recall'],
)
print(f"Accuracy = {eval_dict['accuracy']}")
print(f"Precision = {eval_dict['precision']}")
print(f"Recall = {eval_dict['recall']}")
print(f"F1-Score = {eval_dict['f1_score']}")

# One-row frame with one column per metric.
eval_df = pd.DataFrame (pd.Series(eval_dict)).T
# Saving is disabled; `model_name` is not defined in this notebook.
#eval_df.to_csv('results/'+model_name+'/'+model_name+'_metrics.csv')


True positive = 23
True Negative = 456
False_negative = 229
False_positive = 14
Total number of errors = 243/722
Accuracy = 0.6634349030470914
Precision = 0.6216216216216216
Recall = 0.09126984126984126
F1-Score = 0.15916955017301038


In [7]:
# Percentage of articles labelled relevant (class balance of the data set).
len(articles_df.loc[articles_df['relevant'] == 1]) / len(articles_df) * 100

34.903047091412745