### Toy problem to count all words in the dataset in order to test reading the dataset

In [35]:
from datasets import load_dataset
import nltk
import itertools
from collections import Counter
from tqdm import tqdm
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import re
import os
import pandas as pd
import numpy as np

In [4]:
path_of_the_directory= 'labeled/'
print("Files and directories in a specified path:")
for filename in os.listdir(path_of_the_directory):
    f = os.path.join(path_of_the_directory,filename)
    if os.path.isfile(f):
        print(f)

Files and directories in a specified path:
labeled/7848-10000.csv
labeled/0-2691.csv


In [8]:
dfs = []

for filename in os.listdir(path_of_the_directory):
    f = os.path.join(path_of_the_directory,filename)
    if os.path.isfile(f) and f.endswith('.csv'):
        dfs.append(pd.read_csv(f))

concat_df = pd.concat(dfs, axis=0)

In [12]:
# methoden om de data te pre-processen
en_stemmer = EnglishStemmer() # stemmer voor engelse woorden
nltk.download('stopwords') # stopwoorden die niet veel waarde toevoegen
stop_words = set(stopwords.words('english'))
alph_string_pattern = re.compile("[a-zA-Z]") # filtert 'woorden' die niet beginnen met een letter, zoals interpunctietokens


def word_counter_text(text: str, stem=False, remove_stopwords=False):
    """
    Neemt als input een string tekst
    Returnt een Counter object die alle woorden uit de tekst telt
    """
    
    # splits de tekst op in een lijst van woorden
    sents = nltk.tokenize.sent_tokenize(text)
    words = [nltk.word_tokenize(sent) for sent in sents]
    flatten_words = list(itertools.chain(*words))
    
    # woorden stemmen of alleen maar hoofdletters weghalen
    if stem:
        flatten_lower_words = [en_stemmer.stem(str) for str in flatten_words]
    else:
        flatten_lower_words = [str.lower() for str in flatten_words]
    
    # stopwoorden weghalen
    if remove_stopwords:
        flatten_lower_words = [str for str in flatten_lower_words if str not in stop_words]
        
    # tokens die niet beginnen met een letter weghalen
    flatten_lower_words = [str for str in flatten_lower_words if alph_string_pattern.match(str)]
    
    return Counter(flatten_lower_words)

[nltk_data] Downloading package stopwords to /home/erik/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
texts = concat_df[concat_df['label'] == 0]['article'].tolist()

words_0 = Counter() # Counter object dat alle woorden telt
for i in tqdm(range(len(texts))): # kijkt alleen naar de eerste 2000 artikelen, anders duurt het een half uur
    words_0 += word_counter_text(texts[i], stem=True, remove_stopwords=True) # tel de Counters bij elkaar op voor elk artikel

texts = concat_df[concat_df['label'] == 1]['article'].tolist()

words_1 = Counter() # Counter object dat alle woorden telt
for i in tqdm(range(len(texts))): # kijkt alleen naar de eerste 2000 artikelen, anders duurt het een half uur
    words_1 += word_counter_text(texts[i], stem=True, remove_stopwords=True) # tel de Counters bij elkaar op voor elk artikel

100%|██████████| 4605/4605 [00:53<00:00, 85.44it/s] 
100%|██████████| 238/238 [00:02<00:00, 112.15it/s]


In [19]:
print(words_0.most_common(50))
print(len(words_0.items()), 'verschillende woorden')

[('said', 30332), ('cnn', 9219), ("n't", 7540), ('say', 7076), ('peopl', 6867), ('one', 6805), ('year', 6688), ('would', 5891), ('report', 5419), ('state', 5205), ('time', 5110), ('also', 4933), ('new', 4878), ('two', 4509), ('like', 4285), ('go', 4173), ('presid', 4148), ('told', 4126), ('u.s.', 4102), ('obama', 4018), ('get', 3752), ('day', 3656), ('polic', 3655), ('first', 3628), ('govern', 3621), ('could', 3531), ('work', 3482), ('make', 3425), ('becaus', 3361), ('last', 3349), ('nation', 3325), ('call', 3305), ('countri', 3212), ('accord', 3175), ('offici', 3117), ('want', 3081), ('use', 3063), ('unit', 3040), ('famili', 2995), ('world', 2960), ('think', 2921), ('take', 2905), ('befor', 2878), ('home', 2714), ('includ', 2680), ('watch', 2669), ('help', 2665), ('know', 2660), ('offic', 2643), ('week', 2609)]
46352 verschillende woorden


In [20]:
print(words_1.most_common(50))
print(len(words_1.items()), 'verschillende woorden')

[('said', 2324), ('peopl', 621), ('cnn', 568), ('report', 457), ('one', 405), ('fire', 350), ('offici', 329), ('two', 324), ('home', 299), ('told', 265), ('crash', 249), ("n't", 245), ('hospit', 241), ('water', 239), ('state', 236), ('area', 234), ('also', 226), ('rescu', 221), ('kill', 220), ('accord', 213), ('citi', 209), ('haiti', 203), ('nation', 200), ('plane', 197), ('sunday', 194), ('watch', 191), ('say', 191), ('die', 186), ('investig', 185), ('day', 185), ('damag', 184), ('help', 183), ('famili', 183), ('could', 181), ('time', 181), ('near', 180), ('monday', 170), ('u.s.', 169), ('earthquak', 168), ('flood', 167), ('thursday', 165), ('emerg', 165), ('build', 165), ('hous', 163), ('author', 162), ('counti', 161), ('work', 161), ('mani', 160), ('year', 155), ('friday', 154)]
8246 verschillende woorden


In [26]:
all_words = list(dict((words_0 + words_1).items()).keys())

In [32]:
# bereken de IDF voor elk woord
N = len(concat_df)

doc_freq = dict()
for word in all_words:
    doc_freq[word] = 0

texts = concat_df[concat_df['label'] == 0]['article'].tolist()

for i in tqdm(range(len(texts))):
    for word in list(dict(word_counter_text(texts[i], stem=True, remove_stopwords=True).items()).keys()):
        doc_freq[word] += 1
    

100%|██████████| 4605/4605 [00:42<00:00, 107.44it/s]


In [36]:
idf = dict()

for word in all_words:
    idf[word] = np.log(N / (1 + doc_freq[word]))

idf

{'washington': 1.7156476655507262,
 'cnn': 0.11775722554139796,
 'two': 0.67859326988205,
 'men': 1.9765205054315473,
 'work': 1.013926554216132,
 'secur': 1.623578301922499,
 'contractor': 4.09084048773079,
 'compani': 1.9228455487095093,
 'former': 1.5626457509273415,
 'known': 1.8813458178027567,
 'blackwat': 5.3497954264740795,
 'charg': 1.7087826500310461,
 'murder': 2.5637112227594137,
 'kill': 1.5656057925558182,
 'afghan': 3.723115707605473,
 'feder': 2.0588011849455383,
 'prosecutor': 2.822329162267283,
 'announc': 1.8705640421994683,
 'thursday': 1.7122092667476936,
 'christoph': 4.18122454919906,
 'drotleff': 7.792142461843284,
 'justin': 4.701100008484968,
 'cannon': 5.7772394413010195,
 'count': 2.794930188079169,
 'second-degre': 5.3497954264740795,
 'one': 0.4706232719382873,
 'attempt': 2.2257081783383064,
 'connect': 2.6273564879197697,
 'may': 1.2627236235810586,
 'shoot': 2.7422864545937466,
 'kabul': 4.310902372507592,
 'indict': 3.8031584152790097,
 'return': 1.844

In [41]:
col_idf = [idf[t] for t in all_words]
total_words_0 = np.sum(list(dict(words_0.items()).values()))
col_tf_0 = [words_0[t] / total_words_0 for t in all_words]
total_words_1 = np.sum(list(dict(words_1.items()).values()))
col_tf_1 = [words_1[t] / total_words_1 for t in all_words]

In [45]:
df_words = pd.DataFrame({'idf': col_idf, 'tf-0': col_tf_0, 'tf-1': col_tf_1}, index =all_words)

In [61]:
df_words['tf-idf-0'] = df_words['tf-0'] * df_words['idf']
df_words['tf-idf-1'] = df_words['tf-1'] * df_words['idf']
df_words['class-1-predictiveness'] = df_words['tf-idf-1'] - df_words['tf-idf-0']
df_words

Unnamed: 0,idf,tf-0,tf-1,tf-idf-0,tf-idf-1,class-1-predictiveness
washington,1.715648,0.000893,0.000599,0.001532,0.001028,-0.000503
cnn,0.117757,0.005833,0.007399,0.000687,0.000871,0.000184
two,0.678593,0.002853,0.004221,0.001936,0.002864,0.000928
men,1.976521,0.000745,0.000482,0.001473,0.000953,-0.000520
work,1.013927,0.002203,0.002097,0.002234,0.002127,-0.000107
...,...,...,...,...,...,...
phonelin,8.485290,0.000000,0.000013,0.000000,0.000111,0.000111
copytak,8.485290,0.000000,0.000013,0.000000,0.000111,0.000111
heysel,8.485290,0.000000,0.000013,0.000000,0.000111,0.000111
hhour,8.485290,0.000000,0.000013,0.000000,0.000111,0.000111


In [59]:
print('hoogste gemiddelde tf-idf woorden voor class 0:')
df_words.sort_values('tf-idf-0', ascending=False).index.tolist()[:20]

hoogste gemiddelde tf-idf woorden voor class 0:


['obama',
 'said',
 'polic',
 "n't",
 'presid',
 'u.s.',
 'state',
 'peopl',
 'say',
 'govern',
 'iraq',
 'new',
 'would',
 'famili',
 'attack',
 'children',
 'like',
 'report',
 'kill',
 'go']

In [60]:
print('hoogste gemiddelde tf-idf woorden voor class 1:')
df_words.sort_values('tf-idf-1', ascending=False).index.tolist()[:20]

hoogste gemiddelde tf-idf woorden voor class 1:


['crash',
 'haiti',
 'fire',
 'rescu',
 'quak',
 'flood',
 'plane',
 'earthquak',
 'water',
 'hospit',
 'pilot',
 'damag',
 'said',
 'port-au-princ',
 'accid',
 'airport',
 'storm',
 'tornado',
 'injur',
 'peopl']

In [62]:
print('hoogste class 1 predictiveness:')
df_words.sort_values('class-1-predictiveness', ascending=False).index.tolist()[:20]

hoogste class 1 predictiveness:


['crash',
 'rescu',
 'haiti',
 'fire',
 'quak',
 'flood',
 'earthquak',
 'plane',
 'water',
 'hospit',
 'pilot',
 'damag',
 'tornado',
 'port-au-princ',
 'accid',
 'evacu',
 'injur',
 'helicopt',
 'storm',
 'airport']

In [63]:
df_words['class-1-predictiveness'].to_dict()

{'washington': -0.0005034874438340274,
 'cnn': 0.00018449546609650375,
 'two': 0.0009283484857230211,
 'men': -0.0005204120715430513,
 'work': -0.00010709852837313205,
 'secur': -0.0011640484922758238,
 'contractor': -0.00035047065395270717,
 'compani': -0.0007431864360526975,
 'former': -0.0012267213407595283,
 'known': -0.0002320230395484932,
 'blackwat': -0.000453556647176204,
 'charg': -0.0015747187602754566,
 'murder': -0.0011613729046929884,
 'kill': 0.0020453475050055112,
 'afghan': -0.0009158911552588653,
 'feder': 0.0012733739020386321,
 'prosecutor': -0.0008825216202695116,
 'announc': -0.00038976489827694286,
 'thursday': 0.0019319588352329866,
 'christoph': -1.2110122322403797e-06,
 'drotleff': -9.859989271944824e-06,
 'justin': 3.501115975433893e-05,
 'cannon': -2.342841318124343e-05,
 'count': -0.00017178233639372135,
 'second-degre': -0.0001049272840482263,
 'one': 0.00045679002967975594,
 'attempt': -0.00043706113394795167,
 'connect': -0.0001737999051829546,
 'may': -0

In [99]:
def class_likeness(text: str, weights):
    word_count = dict(word_counter_text(text, stem=True, remove_stopwords=True).items())
    likeness = 0
    for word, count in word_count.items():
        likeness += count * weights[word]
    return likeness
    

In [104]:
class_1_weights = df_words['class-1-predictiveness'].to_dict()

concat_df_0 = concat_df[concat_df['label'] == 0].reset_index()
concat_df_1 = concat_df[concat_df['label'] == 1].reset_index()

print('class 1 likeness for acticles with true class 0:')
for i in range(20):
    print(class_likeness(concat_df_0['article'][i], class_1_weights))

print('class 1 likeness for acticles with true class 1:')
for i in range(20):
    print(class_likeness(concat_df_1['article'][i], class_1_weights))



class 1 likeness for acticles with true class 0:
-0.021942137284073415
0.1442379347950424
0.007753874849290942
0.006457449059418504
0.027976440120449215
-0.061958371592799696
0.059523569504197966
-0.012799943259246792
0.027632066643324207
-0.03251375523930874
0.026666162012599914
0.010301469642284851
-0.0041774350805371275
0.00926869979634895
-0.11149620577548572
-0.058266126591349804
-0.00286657030454745
0.04200645330534746
-0.020129594449716702
-0.00868991178529353
class 1 likeness for acticles with true class 1:
0.12206136768829987
0.3024180393931243
0.3019205706150696
0.5131009565149698
0.12207319091424984
0.1691975954038938
0.1938306404126277
0.15089999520370712
0.0997342445896203
0.23054898478575686
0.12546141103891803
0.05352791037237364
0.04413929151926867
0.60413847456071
0.04393609393914465
0.3849430050512924
0.3952393102545223
0.08119396929103882
0.4983250545516289
0.18660015828096801
