# Analysis on predicted values

In [1]:
import pandas as pd
from scipy.stats import ttest_ind
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def scores(y, y_pred):
    accu = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    print('Accuracy: ', round(accu, 2))
    print('Precision: ', round(prec, 2))
    print('Recall: ', round(rec, 2))
    print("F1 Score: ", round(f1, 2))

In [3]:
# pred_finance = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\finance_predicted_LR1.csv')
# pred_sport = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\sport_predicted_LR1.csv')
# pred_election = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\election_predicted_LR1.csv')
# pred_general = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\general_predicted_LR1.csv')

In [3]:
pred_finance = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\finance_predicted_LLM1.csv')
pred_sport = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\sport_predicted_LLM1.csv')
pred_election = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\election_predicted_LLM1.csv')
pred_general = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\general_predicted_LLM1.csv')

pred_finance['1'] = pred_finance['predictions']
pred_sport['1'] = pred_sport['predictions']
pred_election['1'] = pred_election['predictions']
pred_general['1'] = pred_general['predictions']

In [8]:
pred_general

Unnamed: 0,index,date,url,author,title,topic,summary,q,predictions,1
0,4127,2023-06-01 19:30:00,msn.com,More Law&Crime coverage,Woman arrested for allegedly sending her dog t...,news,Braquelle Lynn Rutherford appears in a mugshot...,"""the""",0.004418,0.004418
1,7018,2023-05-31 11:14:47,indianexpress.com,Express Web Desk,Hindu worshippers' plea on right to pray in Gy...,news,Allowing the case to continue before a local V...,"""the""",0.000118,0.000118
2,685,2023-06-04 16:15:57,yahoo.com,India McTaggart,King Charles hails ‘reassuring presence' of tr...,news,The King has hailed the 'enduring and reassuri...,"""the""",0.834189,0.834189
3,2831,2023-06-02 19:30:00,msn.com,,This is how Lady Louise earns her money,news,"Lady Louise, the daughter of Britain's Prince ...","""the""",0.931374,0.931374
4,2003,2023-06-02 14:43:34,msn.com,Craig Minami,Diego Cartaya homers again Tulsa's comeback win,news,Oklahoma City and Tulsa were able overcome lea...,"""the""",0.003672,0.003672
...,...,...,...,...,...,...,...,...,...,...
7340,693,2023-06-03 09:18:00,yahoo.com,Reuters,Transfer of holy icon shows Russian Orthodoxy'...,news,"MOSCOW, June 3 (Reuters) - President Vladimir ...","""the""",0.000574,0.000574
7341,1808,2023-05-30 00:00:00,mdpi.com,Mei Yu,A Shallow Pooled Weighted Feature Enhancement ...,science,"College of Computer and Technology, China Thre...","""the""",0.000224,0.000224
7342,1040,2023-06-04 17:03:08,nytimes.com,Isabel Kershner,"In Israel, Tough Questions Follow Fatal Attack...",news,A breached emergency gate at Israel's southern...,"""the""",0.001623,0.001623
7343,3234,2023-05-30 11:19:48,dailymail.co.uk,Allan Glen,Ex-ferryman's cottage measuring just 232sq ft ...,business,"Published: 07:19 EDT, 30 May 2023 | ...","""the""",0.008083,0.008083


In [4]:
print('Average likelikhood of general news: ' , pred_general['1'].mean().round(4))
print('Average likelikhood of financial news: ' , pred_finance['1'].mean().round(4))
print('Average likelikhood of sports news: ' , pred_sport['1'].mean().round(4))
print('Average likelikhood of election news: ' , pred_election['1'].mean().round(4))

Average likelikhood of general news:  0.1542
Average likelikhood of financial news:  0.2159
Average likelikhood of sports news:  0.186
Average likelikhood of election news:  0.201


In [5]:
t_statistic, p_value = ttest_ind(pred_finance['1'], pred_general['1'])
print(t_statistic)
print(p_value)

8.281824813837419
1.369143882490157e-16


In [6]:
t_statistic, p_value = ttest_ind(pred_sport['1'], pred_general['1'])
print(t_statistic)
print(p_value)

4.550603559929435
5.410310432396939e-06


In [7]:
t_statistic, p_value = ttest_ind(pred_election['1'], pred_general['1'])
print(t_statistic)
print(p_value)

6.673771968071049
2.6201102461941682e-11


## Analysis on older news articles

In [3]:
pred_old = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\old_news_predicted_LLM1.csv')

In [4]:
pred_old

Unnamed: 0.1,Unnamed: 0,class,title,text,predictions
0,14601,1,French Officials to Visit Baghdad To Seek Rele...,French Foreign Minister Michel Barnier says go...,0.034568
1,67655,4,US argues for federal VoIP rules,quot;We cannot avoid this question any longer...,0.001884
2,7826,4,Reciprocal Link Scams,Reciprocal Link Scams\ \Reciprocal link scams ...,0.000172
3,18894,1,S.Korea's Roh Sees Slow Progress in 6-Party Ta...,Reuters - South Korean President Roh Moo-hyun ...,0.000180
4,71083,3,"US Analyst unimpressed, Swanepoel unfazed",JOHANNESBURG (Mineweb.com) -- Harmony represen...,0.037594
...,...,...,...,...,...
119994,66167,3,Grinstein: News Gives Blues As Delta Posts \$6...,"Blame it on Charlie. And Frances, Ivan and Jea...",0.000103
119995,83541,2,"Nowitzki, Mavs Surge Past Memphis 112-88 (AP)",AP - Dirk Nowitzki hit a nifty basket that tur...,0.001876
119996,9141,2,U.S. Takes the Bronze in Women's Beach Volleyball,ATHENS (Reuters) - Americans Holly McPeak and...,0.018713
119997,68419,2,Hamm keeps gold,More than two months after he won it in Athens...,0.000112


In [5]:
print('Average likelikhood of old news: ' , pred_old['predictions'].mean().round(4))

Average likelikhood of old news:  0.1686


## Check generalizability on newer GPT models

### GPT-3.5

In [15]:
pred_GPT3 = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\gpt_3_predicted_LLM1.csv')
pred_GPT4 = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\gpt_4_predicted_LLM1.csv')

In [16]:
pred_GPT3['pred_label'] = pred_GPT3['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_GPT4['pred_label'] = pred_GPT4['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_GPT4

Unnamed: 0,index,text,label,predictions,pred_label
0,766,"After banning firecrackers on Diwali, Supreme ...",0,0.000841,0
1,1014,Please enable Javascript to watch this video\n...,0,0.000187,0
2,264,Researchers have developed an AI model that ca...,1,0.944019,1
3,781,From /tg/station 13 wiki\n\nSpace Station 13 i...,0,0.605472,1
4,478,Researchers have used AI to decode the 'langua...,1,0.274631,0
...,...,...,...,...,...
1147,722,From Minecraft Wiki\n\nLoot tables are technic...,0,0.017534,0
1148,55,Google has announced a significant update to i...,1,0.046588,0
1149,373,Netflix has announced a new feature that lets ...,1,0.996706,1
1150,259,Google has unveiled a new AI tool that can det...,1,0.363688,0


In [14]:
scores(pred_GPT3['label'], pred_GPT3['pred_label'])

Accuracy:  0.6
Precision:  0.75
Recall:  0.31
F1 Score:  0.44


In [17]:
scores(pred_GPT4['label'], pred_GPT4['pred_label'])

Accuracy:  0.73
Precision:  0.85
Recall:  0.57
F1 Score:  0.68


## Feature importance on distilBERT model

In [3]:
pred_test = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\test_predicted_LLM1.csv')

In [4]:
pred_test['pred_label'] = pred_test['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_test

Unnamed: 0,text,label,predictions,pred_label
0,The University of Illinois Police Department (...,1,0.996156,1
1,PURPOSE:\n\nThis study evaluated the effect of...,1,0.991755,1
2,BELLEVILLE – A young man from Belville has bee...,1,0.997448,1
3,Synopsis\n\nA library for running tasks(jobs) ...,0,0.000713,0
4,The geolocation of the depot can be found here...,0,0.000125,0
...,...,...,...,...
9995,"Originally published June 10, 2014 at 6:43 PM ...",0,0.000198,0
9996,"""You guys have this thing for them,"" Mr. Trump...",1,0.998290,1
9997,"By Dan Merica, CNN\n\nWashington (CNN) – The l...",1,0.999627,1
9998,My ex-husband had treatment resistant depressi...,0,0.429786,0


In [5]:
count_vectorizer = CountVectorizer(stop_words='english')
X = count_vectorizer.fit_transform(pred_test['text'])

In [6]:
freq_table = pd.DataFrame(X.toarray(), columns= count_vectorizer.get_feature_names_out())
freq_table['label'] = pred_test['label']

In [7]:
freq_0 = freq_table[freq_table['label'] == 0].sum().nlargest(1000)
freq_1 = freq_table[freq_table['label'] == 1].sum().nlargest(1000)

In [8]:
freq_0

said      6548
new       4206
like      3710
time      3653
people    3564
          ... 
35         218
apple      218
james      218
32         217
avoid      217
Length: 1000, dtype: int64

In [9]:
freq_1

said      9315
new       7314
people    6435
label     5000
just      4681
          ... 
rise       245
track      245
2008       244
ahead      244
boy        244
Length: 1000, dtype: int64

In [31]:
pred_test['sentence_length'] = pred_test['text'].apply(lambda x: len(x.split()))
average_length_per_label = pred_test.groupby('label')['sentence_length'].mean()
average_length_per_label
#do statistical test

label
0    432.1934
1    491.5080
Name: sentence_length, dtype: float64

## plots of likelihoods

In [16]:
import statsmodels