# Analysis on predicted values

In [82]:
import pandas as pd
from scipy.stats import ttest_ind, shapiro, mannwhitneyu
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import textstat
import numpy as np

In [16]:
def scores(y, y_pred):
    accu = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)

    print('Accuracy: ', round(accu, 2))
    print('Precision: ', round(prec, 2))
    print('Recall: ', round(rec, 2))
    print("F1 Score: ", round(f1, 2))

In [3]:
# pred_finance = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\finance_predicted_LR1.csv')
# pred_sport = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\sport_predicted_LR1.csv')
# pred_election = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\election_predicted_LR1.csv')
# pred_general = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\general_predicted_LR1.csv')

In [17]:
pred_finance = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\finance_predicted_LLM1.csv')
pred_sport = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\sport_predicted_LLM1.csv')
pred_election = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\election_predicted_LLM1.csv')
pred_general = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\general_predicted_LLM1.csv')

pred_finance['1'] = pred_finance['predictions']
pred_sport['1'] = pred_sport['predictions']
pred_election['1'] = pred_election['predictions']
pred_general['1'] = pred_general['predictions']

In [18]:
pred_general

Unnamed: 0,index,date,url,author,title,topic,summary,q,predictions,1
0,4127,2023-06-01 19:30:00,msn.com,More Law&Crime coverage,Woman arrested for allegedly sending her dog t...,news,Braquelle Lynn Rutherford appears in a mugshot...,"""the""",0.004418,0.004418
1,7018,2023-05-31 11:14:47,indianexpress.com,Express Web Desk,Hindu worshippers' plea on right to pray in Gy...,news,Allowing the case to continue before a local V...,"""the""",0.000118,0.000118
2,685,2023-06-04 16:15:57,yahoo.com,India McTaggart,King Charles hails ‘reassuring presence' of tr...,news,The King has hailed the 'enduring and reassuri...,"""the""",0.834189,0.834189
3,2831,2023-06-02 19:30:00,msn.com,,This is how Lady Louise earns her money,news,"Lady Louise, the daughter of Britain's Prince ...","""the""",0.931374,0.931374
4,2003,2023-06-02 14:43:34,msn.com,Craig Minami,Diego Cartaya homers again Tulsa's comeback win,news,Oklahoma City and Tulsa were able overcome lea...,"""the""",0.003672,0.003672
...,...,...,...,...,...,...,...,...,...,...
7340,693,2023-06-03 09:18:00,yahoo.com,Reuters,Transfer of holy icon shows Russian Orthodoxy'...,news,"MOSCOW, June 3 (Reuters) - President Vladimir ...","""the""",0.000574,0.000574
7341,1808,2023-05-30 00:00:00,mdpi.com,Mei Yu,A Shallow Pooled Weighted Feature Enhancement ...,science,"College of Computer and Technology, China Thre...","""the""",0.000224,0.000224
7342,1040,2023-06-04 17:03:08,nytimes.com,Isabel Kershner,"In Israel, Tough Questions Follow Fatal Attack...",news,A breached emergency gate at Israel's southern...,"""the""",0.001623,0.001623
7343,3234,2023-05-30 11:19:48,dailymail.co.uk,Allan Glen,Ex-ferryman's cottage measuring just 232sq ft ...,business,"Published: 07:19 EDT, 30 May 2023 | ...","""the""",0.008083,0.008083


In [4]:
print('Average likelikhood of general news: ' , pred_general['1'].mean().round(4))
print('Average likelikhood of financial news: ' , pred_finance['1'].mean().round(4))
print('Average likelikhood of sports news: ' , pred_sport['1'].mean().round(4))
print('Average likelikhood of election news: ' , pred_election['1'].mean().round(4))

Average likelikhood of general news:  0.1542
Average likelikhood of financial news:  0.2159
Average likelikhood of sports news:  0.186
Average likelikhood of election news:  0.201


In [5]:
t_statistic, p_value = ttest_ind(pred_finance['1'], pred_general['1'])
print(t_statistic)
print(p_value)

8.281824813837419
1.369143882490157e-16


In [6]:
t_statistic, p_value = ttest_ind(pred_sport['1'], pred_general['1'])
print(t_statistic)
print(p_value)

4.550603559929435
5.410310432396939e-06


In [7]:
t_statistic, p_value = ttest_ind(pred_election['1'], pred_general['1'])
print(t_statistic)
print(p_value)

6.673771968071049
2.6201102461941682e-11


In [19]:
pred_election['pred_label'] = pred_election['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_sport['pred_label'] = pred_sport['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_finance['pred_label'] = pred_finance['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_general['pred_label'] = pred_general['predictions'].apply(lambda x: 1 if x>=0.5 else 0)

In [31]:
print('percentage of election articles flagged: ' ,round(pred_election.loc[pred_election['pred_label']==1, 'pred_label'].count()/pred_election['pred_label'].count(), 4))
print('percentage of sport articles flagged: ' ,round(pred_sport.loc[pred_sport['pred_label']==1, 'pred_label'].count()/pred_sport['pred_label'].count(), 4))
print('percentage of finance articles flagged: ' ,round(pred_finance.loc[pred_finance['pred_label']==1, 'pred_label'].count()/pred_finance['pred_label'].count(), 4))
print('percentage of general articles flagged: ' ,round(pred_general.loc[pred_general['pred_label']==1, 'pred_label'].count()/pred_general['pred_label'].count(), 4))

percentage of election articles flagged:  0.1913
percentage of sport articles flagged:  0.1744
percentage of finance articles flagged:  0.2102
percentage of general articles flagged:  0.1457


## hypothesis testing

In [51]:
stat, p = shapiro(pred_election['predictions'])
print('election: p=', p, 'W=', round(stat,2))
stat, p = shapiro(pred_sport['predictions'])
print('sport: p=', p, 'W=', round(stat,2))
stat, p = shapiro(pred_finance['predictions'])
print('finance: p=', p, 'W=', round(stat,2))
stat, p = shapiro(pred_general['predictions'])
print('general: p=', p, 'W=', round(stat,2))

election: p= 0.0 W= 0.6
sport: p= 0.0 W= 0.59
finance: p= 0.0 W= 0.63
general: p= 0.0 W= 0.53




In [52]:
stat, p = mannwhitneyu(pred_finance['predictions'], pred_general['predictions'], alternative='greater')
print('finance: p=', p, 'stat=', round(stat,2))
stat, p = mannwhitneyu(pred_election['predictions'], pred_general['predictions'], alternative='greater')
print('election: p=', p, 'stat=', round(stat,2))
stat, p = mannwhitneyu(pred_sport['predictions'], pred_general['predictions'], alternative='greater')
print('sport: p=', p, 'stat=', round(stat,2))

finance: p= 2.1018581793459123e-40 stat= 11244693.5
election: p= 2.2854173770013735e-29 stat= 12772017.5
sport: p= 5.172408028441515e-22 stat= 12261856.5


In [54]:
stat, p = mannwhitneyu(pred_general['predictions'], pred_old['predictions'])
print('general: p=', p, 'stat=', round(stat,2))

general: p= 2.7419546059580106e-14 stat= 417422542.0


In [55]:
stat, p = ttest_ind(pred_general['predictions'], pred_old['predictions'])
print('general: p=', p, 'stat=', round(stat,2))

general: p= 0.00018285632490277213 stat= -3.74


In [93]:
stat, p = mannwhitneyu(pred_finance['predictions'], pred_old['predictions'], alternative='greater')
print('finance: p=', p, 'stat=', round(stat,2))
stat, p = mannwhitneyu(pred_election['predictions'], pred_old['predictions'], alternative='greater')
print('election: p=', p, 'stat=', round(stat,2))
stat, p = mannwhitneyu(pred_sport['predictions'], pred_old['predictions'], alternative='greater')
print('sport: p=', p, 'stat=', round(stat,2))

finance: p= 8.324127695345964e-25 stat= 174684887.0
election: p= 1.2101375090476384e-15 stat= 198524799.0
sport: p= 4.195698222821588e-09 stat= 189887886.5


In [94]:
stat, p = mannwhitneyu(pred_general['predictions'], pred_old['predictions'], alternative='greater')
print('finance: p=', p, 'stat=', round(stat,2))

finance: p= 0.9999999999999863 stat= 417422542.0


## Analysis on older news articles

In [53]:
pred_old = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\old_news_predicted_LLM1.csv')

In [100]:
pred_old

Unnamed: 0.1,Unnamed: 0,class,title,text,predictions,pred_label,readability,sentence_length
0,14601,1,French Officials to Visit Baghdad To Seek Rele...,French Foreign Minister Michel Barnier says go...,0.034568,0,45.09,26
1,67655,4,US argues for federal VoIP rules,quot;We cannot avoid this question any longer...,0.001884,0,74.69,27
2,7826,4,Reciprocal Link Scams,Reciprocal Link Scams\ \Reciprocal link scams ...,0.000172,0,71.44,51
3,18894,1,S.Korea's Roh Sees Slow Progress in 6-Party Ta...,Reuters - South Korean President Roh Moo-hyun ...,0.000180,0,27.15,28
4,71083,3,"US Analyst unimpressed, Swanepoel unfazed",JOHANNESBURG (Mineweb.com) -- Harmony represen...,0.037594,0,30.20,25
...,...,...,...,...,...,...,...,...
119994,66167,3,Grinstein: News Gives Blues As Delta Posts \$6...,"Blame it on Charlie. And Frances, Ivan and Jea...",0.000103,0,78.25,33
119995,83541,2,"Nowitzki, Mavs Surge Past Memphis 112-88 (AP)",AP - Dirk Nowitzki hit a nifty basket that tur...,0.001876,0,37.98,34
119996,9141,2,U.S. Takes the Bronze in Women's Beach Volleyball,ATHENS (Reuters) - Americans Holly McPeak and...,0.018713,0,47.46,33
119997,68419,2,Hamm keeps gold,More than two months after he won it in Athens...,0.000112,0,82.14,29


In [96]:
print('Average likelikhood of old news: ' , pred_old['predictions'].mean().round(4))

Average likelikhood of old news:  0.1686


In [97]:
pred_old['pred_label'] = pred_old['predictions'].apply(lambda x: 1 if x>=0.5 else 0)

In [99]:
pred_old['readability'] = pred_old['text'].apply(textstat.flesch_reading_ease)
pred_old['sentence_length'] = pred_old['text'].str.split().apply(len)

In [101]:
print(pred_old.loc[pred_old['pred_label']==0, 'sentence_length'].mean().round(2))
print(pred_old.loc[pred_old['pred_label']==1, 'sentence_length'].mean().round(2))
stat, p = ttest_ind(pred_old.loc[pred_old['pred_label']==0, 'sentence_length'], pred_old.loc[pred_old['pred_label']==1, 'sentence_length'])
print(p)

31.13
30.7
3.12561861050279e-08


In [102]:
print(pred_old.loc[pred_old['pred_label']==0, 'readability'].mean().round(2))
print(pred_old.loc[pred_old['pred_label']==1, 'readability'].mean().round(2))
stat, p = ttest_ind(pred_old.loc[pred_old['pred_label']==0, 'sentence_length'], pred_old.loc[pred_old['pred_label']==1, 'sentence_length'])
print(p)

51.76
52.2
3.12561861050279e-08


## Check generalizability on newer GPT models

### GPT-3.5

In [4]:
pred_GPT3_LR = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\gpt_3_predicted_LR1.csv')
pred_GPT4_LR = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\gpt_4_predicted_LR1.csv')
pred_GPT3_LLM = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\gpt_3_predicted_LLM1.csv')
pred_GPT4_LLM = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\gpt_4_predicted_LLM1.csv')

In [5]:
pred_GPT3_LR['pred_label'] = pred_GPT3_LR['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_GPT4_LR['pred_label'] = pred_GPT4_LR['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_GPT3_LLM['pred_label'] = pred_GPT3_LLM['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_GPT4_LLM['pred_label'] = pred_GPT4_LLM['predictions'].apply(lambda x: 1 if x>=0.5 else 0)

In [11]:
scores(pred_GPT3_LR['label'], pred_GPT3_LR['pred_label'])

Accuracy:  0.47
Precision:  0.3
Recall:  0.05
F1 Score:  0.08


In [12]:
scores(pred_GPT3_LLM['label'], pred_GPT3_LLM['pred_label'])

Accuracy:  0.6
Precision:  0.75
Recall:  0.31
F1 Score:  0.44


In [13]:
scores(pred_GPT4_LR['label'], pred_GPT4_LR['pred_label'])

Accuracy:  0.61
Precision:  0.74
Recall:  0.32
F1 Score:  0.45


In [14]:
scores(pred_GPT4_LLM['label'], pred_GPT4_LLM['pred_label'])

Accuracy:  0.73
Precision:  0.85
Recall:  0.57
F1 Score:  0.68


## Feature importance on distilBERT model

In [56]:
pred_test = pd.read_csv(r'C:\Users\Sten\Documents\EUR BAM\Thesis\data\predictions\test_predicted_LLM1.csv')

In [57]:
pred_test['pred_label'] = pred_test['predictions'].apply(lambda x: 1 if x>=0.5 else 0)
pred_test

Unnamed: 0,text,label,predictions,pred_label
0,The University of Illinois Police Department (...,1,0.996156,1
1,PURPOSE:\n\nThis study evaluated the effect of...,1,0.991755,1
2,BELLEVILLE – A young man from Belville has bee...,1,0.997448,1
3,Synopsis\n\nA library for running tasks(jobs) ...,0,0.000713,0
4,The geolocation of the depot can be found here...,0,0.000125,0
...,...,...,...,...
9995,"Originally published June 10, 2014 at 6:43 PM ...",0,0.000198,0
9996,"""You guys have this thing for them,"" Mr. Trump...",1,0.998290,1
9997,"By Dan Merica, CNN\n\nWashington (CNN) – The l...",1,0.999627,1
9998,My ex-husband had treatment resistant depressi...,0,0.429786,0


In [60]:
#df_finance['flesch_kincaid'] = df_finance['summary'].apply(textstat.flesch_reading_ease)
pred_test['readability'] = pred_test['text'].apply(textstat.flesch_reading_ease)
pred_test

Unnamed: 0,text,label,predictions,pred_label,readability
0,The University of Illinois Police Department (...,1,0.996156,1,34.36
1,PURPOSE:\n\nThis study evaluated the effect of...,1,0.991755,1,23.87
2,BELLEVILLE – A young man from Belville has bee...,1,0.997448,1,56.15
3,Synopsis\n\nA library for running tasks(jobs) ...,0,0.000713,0,51.65
4,The geolocation of the depot can be found here...,0,0.000125,0,73.58
...,...,...,...,...,...
9995,"Originally published June 10, 2014 at 6:43 PM ...",0,0.000198,0,70.02
9996,"""You guys have this thing for them,"" Mr. Trump...",1,0.998290,1,60.65
9997,"By Dan Merica, CNN\n\nWashington (CNN) – The l...",1,0.999627,1,55.58
9998,My ex-husband had treatment resistant depressi...,0,0.429786,0,77.27


In [76]:
print(pred_test.loc[pred_test['pred_label']==0, 'readability'].mean().round(2))
print(pred_test.loc[pred_test['pred_label']==1, 'readability'].mean().round(2))
stat, p = ttest_ind(pred_test.loc[pred_test['pred_label']==0, 'readability'], pred_test.loc[pred_test['pred_label']==1, 'readability'])
print(p)

58.92
63.6
3.1432111894513054e-18


In [68]:
pred_test['sentence_length'] = pred_test['text'].str.split().apply(len)

In [75]:
print(pred_test.loc[pred_test['pred_label']==0, 'sentence_length'].mean().round(2))
print(pred_test.loc[pred_test['pred_label']==1, 'sentence_length'].mean().round(2))
stat, p = ttest_ind(pred_test.loc[pred_test['pred_label']==0, 'sentence_length'], pred_test.loc[pred_test['pred_label']==1, 'sentence_length'])
print(p)

433.38
485.91
6.736953874211485e-22


In [78]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(pred_test['text'])

In [85]:
freq_table = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())
freq_table['label'] = pred_test['label']

In [89]:
freq_table

Unnamed: 0,00,000,0000,00000,000000,0000000000000000000000101111000111111001011111010001000100100011,00000270,00000278,00000280,00000288,...,키울게요,태어날,테라폼,포켓몬카,하루종일,할수없는,합니다,행복을,행복하자,형님된거
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.032044,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [86]:
freq_0 = freq_table[freq_table['label'] == 0].sum().nlargest(1000)
freq_1 = freq_table[freq_table['label'] == 1].sum().nlargest(1000)

In [87]:
freq_0

the        803.223874
to         405.716489
and        364.063401
of         363.782112
in         279.032995
              ...    
capital      7.443476
normal       7.435478
worked       7.428515
claims       7.420607
sold         7.419083
Length: 1000, dtype: float64

In [88]:
freq_1

label          5000.000000
the            1047.827333
to              524.192941
of              430.808823
and             389.669377
                  ...     
individuals       8.389328
association       8.370701
wall              8.354256
couldn            8.338752
message           8.333328
Length: 1000, dtype: float64

In [31]:
pred_test['sentence_length'] = pred_test['text'].apply(lambda x: len(x.split()))
average_length_per_label = pred_test.groupby('label')['sentence_length'].mean()
average_length_per_label
#do statistical test

label
0    432.1934
1    491.5080
Name: sentence_length, dtype: float64

## plots of likelihoods

In [16]:
import statsmodels