In [1]:
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk import sent_tokenize
from nltk.corpus import stopwords

# Shared lemmatizer used by the scoring cells below.
lemmatizer = WordNetLemmatizer()

# Stop words = NLTK's English list plus the extra words listed in
# remove_words.txt (read line by line, trailing newline stripped).
# The file is opened in a `with` block so the handle is closed deterministically.
with open("remove_words.txt", "r") as remove_words_file:
    extra_stop_words = [word.strip("\n") for word in remove_words_file]
stop_words = set(stopwords.words('english') + extra_stop_words)

In [2]:
# Load the labelled articles, keeping only the columns listed here.
article_columns = [
    'article_title',
    'text',
    'relevant_campbell',
    'relevant_kristen',
    'mismatch',
]
articles_df = pd.read_csv('all_articles.csv', usecols=article_columns)
articles_df

Unnamed: 0,article_title,text,relevant_campbell,relevant_kristen,mismatch
0,Intervention was the right thing to do,News Intervention was the right thing to do FI...,0,0,0
1,SIX STATES FILE A CHALLENGE TO THE BIDEN ADMIN...,SIX STATES FILE A CHALLENGE TO THE BIDEN ADMIN...,0,0,0
2,Senator Reverend Warnock Introduces Bill to En...,Senator Reverend Warnock Introduces Bill to En...,1,0,1
3,10 Markets Where Sellers Are Cutting Home Pric...,Daily 10 Markets Where Sellers Are Cutting Hom...,1,1,0
4,"Biden touts Inflation Reduction Act, critics s...","Biden touts Inflation Reduction Act, critics s...",0,1,1
...,...,...,...,...,...
122,Jobs & Money - Women stand to get a better deal.,Jobs & Money - Women stand to get a better dea...,1,1,0
123,Pension off AVC rules; The Last Word,Pension off AVC rules; The Last Word THE G...,0,1,1
124,"Gold Stocks Favoured As Telecom, Bank Shares Fall",Business; Money Market And Share Market Gold S...,0,0,0
125,Wall of silence over 'flexible' Abbey fund,Wall of silence over 'flexible' Abbey fund ...,0,1,1


In [3]:
# Keyword table whose `weighted_score` is added to an article's score
# by the scoring cells further down.
relevant_columns = [
    'word',
    'frequency',
    'frequency_scaled',
    'relevancy_score',
    'weighted_score',
]
relevant_df = pd.read_csv("relevant_test.csv", usecols=relevant_columns)
relevant_df

Unnamed: 0,word,frequency,frequency_scaled,relevancy_score,weighted_score
0,saving,341,0.60177,12.44792,7.490784
1,percent,215,0.378761,11.84831,4.487678
2,payment,208,0.366372,11.94429,4.37605
3,contribution,184,0.323894,12.668621,4.103288
4,family,210,0.369912,10.396925,3.845942
5,student,200,0.352212,10.776622,3.79566
6,debt,210,0.369912,9.689887,3.584401
7,age,166,0.292035,11.357697,3.31685
8,employee,152,0.267257,11.324421,3.026527
9,care,139,0.244248,11.927019,2.913148


In [4]:
# Keyword table whose `weighted_score` is subtracted from an article's score
# by the scoring cells further down.
irrelevant_columns = [
    'word',
    'frequency',
    'frequency_scaled',
    'relevancy_score',
    'weighted_score',
]
irrelevant_df = pd.read_csv("irrelevant_test.csv", usecols=irrelevant_columns)
irrelevant_df

Unnamed: 0,word,frequency,frequency_scaled,relevancy_score,weighted_score
0,market,680,0.628122,11.005293,6.912668
1,price,460,0.424607,12.159565,5.163034
2,investor,414,0.382054,10.834648,4.139417
3,stock,350,0.322849,12.165846,3.927734
4,bank,338,0.311748,10.895,3.396499
5,security,318,0.293247,11.216478,3.289199
6,share,279,0.257169,12.335511,3.172315
7,com,279,0.257169,12.039279,3.096133
8,value,265,0.244218,12.389721,3.025797
9,product,262,0.241443,12.353799,2.98274


In [5]:
# Score every article against the two keyword tables and predict relevance.
#
# An article's score is the sum of `weighted_score` over its relevant-keyword
# hits minus the sum over its irrelevant-keyword hits.  A negative score
# predicts 0 (not relevant); anything else predicts 1 (relevant).

# word -> weighted_score lookups, built once instead of rescanning the
# DataFrames for every token.  If a word is listed more than once the last
# row wins (the previous float(Series) lookup raised in that situation).
relevant_scores = dict(zip(relevant_df['word'], relevant_df['weighted_score']))
irrelevant_scores = dict(zip(irrelevant_df['word'], irrelevant_df['weighted_score']))

# Characters deleted from each token before lemmatizing.
strip_chars = str.maketrans("", "", " ,.:/-()")

# Keyed by row index rather than by title, so two articles that share a
# title are scored (and predicted) independently of each other.
all_articles_relevancy = {}
for index, text in articles_df['text'].items():
    score = 0.0
    for sentence in sent_tokenize(text):
        for word_ in sentence.split(" "):
            word_ = word_.lower().strip().translate(strip_chars)

            # Keep the un-lemmatized token when the lemma comes out as
            # 'ha' or 'wa' (presumably mangled "has"/"was" -- TODO confirm).
            lemma = lemmatizer.lemmatize(word_)
            if lemma not in ('ha', 'wa'):
                word_ = lemma

            # `in string.punctuation` is a substring test, so it also
            # discards the empty token.
            if word_ in stop_words or word_ in string.punctuation:
                continue

            score += relevant_scores.get(word_, 0.0)
            score -= irrelevant_scores.get(word_, 0.0)
    all_articles_relevancy[index] = score

# Float 0.0/1.0 to match the dtype the column had before.
articles_df['predicted'] = [
    0.0 if all_articles_relevancy[index] < 0 else 1.0
    for index in articles_df.index
]

pd.set_option('display.max_rows', 1000)
articles_df[['relevant_campbell', 'predicted']]

Unnamed: 0,relevant_campbell,predicted
0,0,0.0
1,0,1.0
2,1,1.0
3,1,0.0
4,0,1.0
5,0,0.0
6,1,1.0
7,0,1.0
8,0,0.0
9,0,0.0


In [6]:
# Evaluate `predicted` against the `relevant_campbell` labels:
# confusion-matrix counts, then accuracy / precision / recall / F1.
actual_labels = articles_df['relevant_campbell']
predicted_labels = articles_df['predicted']

# Vectorized confusion-matrix counts (only rows whose label and prediction
# are exactly 0 or 1 land in one of the four buckets).
true_positive = int(((actual_labels == 1) & (predicted_labels == 1)).sum())
false_negative = int(((actual_labels == 1) & (predicted_labels == 0)).sum())
true_negative = int(((actual_labels == 0) & (predicted_labels == 0)).sum())
false_positive = int(((actual_labels == 0) & (predicted_labels == 1)).sum())

# Every row whose prediction differs from its label.
num_ers = int((actual_labels != predicted_labels).sum())

eval_dict = {}

print(f"True positive = {true_positive}")
print(f"True Negative = {true_negative}")
print(f"False_negative = {false_negative}")
print(f"False_positive = {false_positive}")
# Denominator is the actual row count rather than a hard-coded number.
print(f"Total number of errors = {num_ers}/{len(articles_df)}")
eval_dict['true_positive'] = true_positive
eval_dict['true_negative'] = true_negative
eval_dict['false_positive'] = false_positive
eval_dict['false_negative'] = false_negative

# Each ratio falls back to 0.0 when its denominator is zero (e.g. nothing was
# predicted positive) instead of raising ZeroDivisionError.
classified_total = true_positive + true_negative + false_negative + false_positive
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative
eval_dict['accuracy'] = (
    (true_positive + true_negative) / classified_total if classified_total else 0.0
)
eval_dict['precision'] = true_positive / predicted_positive if predicted_positive else 0.0
eval_dict['recall'] = true_positive / actual_positive if actual_positive else 0.0
precision_plus_recall = eval_dict['precision'] + eval_dict['recall']
eval_dict['f1_score'] = (
    (2 * eval_dict['precision'] * eval_dict['recall']) / precision_plus_recall
    if precision_plus_recall
    else 0.0
)
print(f"Accuracy = {eval_dict['accuracy']}")
print(f"Precision = {eval_dict['precision']}")
print(f"Recall = {eval_dict['recall']}")
print(f"F1-Score = {eval_dict['f1_score']}")

# One-row frame with one column per metric.
eval_df = pd.DataFrame (pd.Series(eval_dict)).T


True positive = 50
True Negative = 33
False_negative = 19
False_positive = 25
Total number of errors = 44/127
Accuracy = 0.6535433070866141
Precision = 0.6666666666666666
Recall = 0.7246376811594203
F1-Score = 0.6944444444444444


In [7]:
# Load the second labelled set and replace missing cells with "" so the
# string concatenation below never sees NaN.
labelled_df = pd.read_csv(
    "labelled_1.csv",
    usecols=['Article', 'snippet', 'relevant'],
).fillna("")

# Text to score: the snippet followed by the article body.
labelled_df = labelled_df.assign(
    text=labelled_df['snippet'] + ' ' + labelled_df['Article']
)

# NOTE(review): this writes back over the file that was just read, so the
# raw input is replaced by the filled/augmented frame (plus an index column).
# Consider writing to a separate output file -- confirm nothing else relies
# on the current behavior first.
labelled_df.to_csv('labelled_1.csv')
labelled_df

Unnamed: 0,Article,snippet,relevant,text
0,After years of shift work and increasing work ...,"In British Columbia, Julia, as we'll call her,...",1,"In British Columbia, Julia, as we'll call her,..."
1,CalPERS the largest public pension fund in the...,"""The news comes a few months after the US pens...",0,"""The news comes a few months after the US pens..."
2,YOGHURT SAUCE cup plain non dairy yoghurt tbs ...,These dishes from plant-based food expert Jack...,0,These dishes from plant-based food expert Jack...
3,Although their trip was a holiday not an emigr...,Jan. 19 -- UK-based MediCare International iss...,0,Jan. 19 -- UK-based MediCare International iss...
4,And going to cash can be wise with your indivi...,"Sure, you're jittery. The market has been sell...",0,"Sure, you're jittery. The market has been sell..."
5,IT IS a top up pension paid by the state in ad...,MILLIONS of workers have been told to opt back...,0,MILLIONS of workers have been told to opt back...
6,Advisors are particularly well positioned to p...,Financial advisors are good at reminding clien...,0,Financial advisors are good at reminding clien...
7,The possibility of a deeper bear phase for equ...,Small potatoes. That phlegmatic verdict on the...,0,Small potatoes. That phlegmatic verdict on the...
8,Contact your school s financial aid officer fo...,Here's some help for consolidating those stude...,0,Here's some help for consolidating those stude...
9,Back then Atlanta still was pretty much a back...,"One hundred and fifteen years ago, Henry W. Gr...",0,"One hundred and fifteen years ago, Henry W. Gr..."


In [8]:
# Score every labelled row against the two keyword tables and predict
# relevance.
#
# A row's score is the sum of `weighted_score` over its relevant-keyword hits
# minus the sum over its irrelevant-keyword hits.  A score >= 0 predicts 1
# (relevant); a negative score predicts 0.

# word -> weighted_score lookups, built once instead of rescanning the
# DataFrames for every token.  If a word is listed more than once the last
# row wins (the previous float(Series) lookup raised in that situation).
relevant_scores = dict(zip(relevant_df['word'], relevant_df['weighted_score']))
irrelevant_scores = dict(zip(irrelevant_df['word'], irrelevant_df['weighted_score']))

# Characters deleted from each token before lemmatizing.
strip_chars = str.maketrans("", "", " ,.:/-()")

# Keyed by row index rather than by snippet text: rows with identical
# snippets (including every row whose snippet is the empty string) used to
# share one dictionary entry and therefore one prediction.
all_articles_relevancy = {}
for index, text in labelled_df['text'].items():
    score = 0.0
    for sentence in sent_tokenize(text):
        for word_ in sentence.split(" "):
            word_ = word_.lower().strip().translate(strip_chars)

            # Keep the un-lemmatized token when the lemma comes out as
            # 'ha' or 'wa' (presumably mangled "has"/"was" -- TODO confirm).
            lemma = lemmatizer.lemmatize(word_)
            if lemma not in ('ha', 'wa'):
                word_ = lemma

            # `in string.punctuation` is a substring test, so it also
            # discards the empty token.
            if word_ in stop_words or word_ in string.punctuation:
                continue

            score += relevant_scores.get(word_, 0.0)
            score -= irrelevant_scores.get(word_, 0.0)
    all_articles_relevancy[index] = score

# Float 0.0/1.0 to match the dtype the column had before.
labelled_df['predicted'] = [
    1.0 if all_articles_relevancy[index] >= 0 else 0.0
    for index in labelled_df.index
]

pd.set_option('display.max_rows', 1000)
labelled_df[['relevant', 'predicted']]

Unnamed: 0,relevant,predicted
0,1,1.0
1,0,0.0
2,0,1.0
3,0,1.0
4,0,0.0
5,0,1.0
6,0,1.0
7,0,0.0
8,0,1.0
9,0,1.0


In [9]:
# Evaluate `predicted` against the `relevant` labels:
# confusion-matrix counts, then accuracy / precision / recall / F1.
actual_labels = labelled_df['relevant']
predicted_labels = labelled_df['predicted']

# Vectorized confusion-matrix counts (only rows whose label and prediction
# are exactly 0 or 1 land in one of the four buckets).
true_positive = int(((actual_labels == 1) & (predicted_labels == 1)).sum())
false_negative = int(((actual_labels == 1) & (predicted_labels == 0)).sum())
true_negative = int(((actual_labels == 0) & (predicted_labels == 0)).sum())
false_positive = int(((actual_labels == 0) & (predicted_labels == 1)).sum())

# Every row whose prediction differs from its label.
num_ers = int((actual_labels != predicted_labels).sum())

eval_dict = {}

print(f"True positive = {true_positive}")
print(f"True Negative = {true_negative}")
print(f"False_negative = {false_negative}")
print(f"False_positive = {false_positive}")
print(f"Total number of errors = {num_ers}")
eval_dict['true_positive'] = true_positive
eval_dict['true_negative'] = true_negative
eval_dict['false_positive'] = false_positive
eval_dict['false_negative'] = false_negative

# Each ratio falls back to 0.0 when its denominator is zero (e.g. nothing was
# predicted positive) instead of raising ZeroDivisionError.
classified_total = true_positive + true_negative + false_negative + false_positive
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative
eval_dict['accuracy'] = (
    (true_positive + true_negative) / classified_total if classified_total else 0.0
)
eval_dict['precision'] = true_positive / predicted_positive if predicted_positive else 0.0
eval_dict['recall'] = true_positive / actual_positive if actual_positive else 0.0
precision_plus_recall = eval_dict['precision'] + eval_dict['recall']
eval_dict['f1_score'] = (
    (2 * eval_dict['precision'] * eval_dict['recall']) / precision_plus_recall
    if precision_plus_recall
    else 0.0
)
print(f"Accuracy = {eval_dict['accuracy']}")
print(f"Precision = {eval_dict['precision']}")
print(f"Recall = {eval_dict['recall']}")
print(f"F1-Score = {eval_dict['f1_score']}")

# One-row frame with one column per metric.
eval_df = pd.DataFrame (pd.Series(eval_dict)).T


True positive = 145
True Negative = 241
False_negative = 38
False_positive = 177
Total number of errors = 215
Accuracy = 0.6422628951747088
Precision = 0.4503105590062112
Recall = 0.7923497267759563
F1-Score = 0.5742574257425742
