In [29]:
import pandas as pd
import pickle

articles = 'articles_preprocessed_1mio.pkl'

# The articles file holds several pickled chunks written back-to-back into one
# stream, so keep unpickling until the stream runs out (signalled by EOFError).
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# were produced by our own preprocessing step.
dfs = []
with open(articles, "rb") as file:
    try:
        while True:
            chunk = pickle.load(file)
            dfs.append(chunk)
    except EOFError:
        pass  # end of stream reached: every chunk has been collected
df_articles = pd.concat(dfs, ignore_index=True)

# The preprocessed LIAR dataset is read back as a single pickled object
df_LIAR = pd.read_pickle('LIAR_preprocessed.pkl')

In [30]:
# Debug: (rows, columns) of each loaded frame, printed one per line
print(df_articles.shape, df_LIAR.shape, sep="\n")

(581918, 2)
(1267, 2)


In [31]:
# Debug: Python type stored in the first "content" cell of each frame
print(type(df_articles.loc[0, "content"]), type(df_LIAR.loc[0, "content"]), sep="\n")

<class 'list'>
<class 'list'>


In [32]:
from collections import Counter
def get_vocab(article_series, n):
    """Return the ``n`` most frequent words of a corpus with their counts.

    Parameters
    ----------
    article_series : iterable (typically a pandas Series)
        One entry per article. A ``list`` entry is treated as already
        tokenised; a ``str`` entry is split on whitespace. Entries of any
        other type (e.g. ``None`` or NaN) are skipped.
    n : int or None
        Number of words to keep. ``None`` keeps every word.

    Returns
    -------
    dict
        Maps each kept word to its number of occurrences, ordered from most
        to least frequent.
    """
    # Count article by article instead of first building one list holding
    # every token of the whole corpus; counts and tie order are unchanged,
    # but peak memory no longer grows with the total number of tokens.
    word_counts = Counter()
    for article in article_series:
        if isinstance(article, list):
            word_counts.update(article)
        elif isinstance(article, str):
            word_counts.update(article.split())
    return dict(word_counts.most_common(n))

Build the vocabulary (the 10,000 most frequent words) from **the FakeNews dataset**. The same vocabulary is reused below to encode the LIAR dataset.

In [33]:
# Keep only the 10000 most frequent words of the article corpus
vocabulary = get_vocab(article_series=df_articles['content'], n=10000)
print(len(vocabulary))

10000


In [34]:
import numpy as np
def one_hot_encoding_2(article_series, vocabulary, dtype = np.int8):
    """Encode articles as a binary word-presence matrix over ``vocabulary``.

    Each row corresponds to one article and each column to one vocabulary
    word; a cell is 1 if the word occurs at least once in the article and 0
    otherwise (occurrence counts are not recorded).

    Parameters
    ----------
    article_series : pd.Series or other sized iterable
        One entry per article. A ``list`` entry is treated as already
        tokenised; a ``str`` entry is split on whitespace, matching how
        ``get_vocab`` counts string articles. Entries of any other type
        (e.g. ``None`` or NaN) produce an all-zero row.
    vocabulary : dict
        Words as keys; the key order defines the column order.
    dtype : numpy dtype, default ``np.int8``
        Element type of the resulting matrix.

    Returns
    -------
    pd.DataFrame
        Shape ``(len(article_series), len(vocabulary))`` with the vocabulary
        words as column names. The frame gets a fresh default index; the
        index of ``article_series`` is not carried over.
    """
    vocab_list = list(vocabulary.keys())  # fixes the column order
    vocab_index = {word: i for i, word in enumerate(vocab_list)}  # word -> column

    # Start from all zeros and switch on the cells of the words that occur
    encoded_matrix = np.zeros((len(article_series), len(vocab_list)), dtype=dtype)

    for i, article in enumerate(article_series):
        if isinstance(article, list):
            words = article
        elif isinstance(article, str):
            # Previously string articles were silently left as all-zero rows
            words = article.split()
        else:
            continue  # missing / unsupported entry: keep the zero row

        columns = [vocab_index[word] for word in words if word in vocab_index]
        if columns:
            encoded_matrix[i, columns] = 1

    return pd.DataFrame(encoded_matrix, columns=vocab_list, dtype=dtype)

In [35]:
# Encode LIAR with the vocabulary built from the article corpus, then put the
# labels in the first column. `.values` drops the index, so the labels are
# placed by row position.
LIAR_encoded = one_hot_encoding_2(article_series=df_LIAR['content'], vocabulary=vocabulary)
LIAR_encoded.insert(loc=0, column='LABELS', value=df_LIAR['newlabels'].values)

In [36]:
# Display the encoded LIAR frame (the rendered view is truncated by pandas)
LIAR_encoded

Unnamed: 0,LABELS,num,-,said,one,new,time,year,would,?,...,keynesian,shackl,elliot,insomniac,contriv,uzbekistan,backlog,baucu,gaga,minum
0,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1263,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1264,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1265,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Sanity check: number of missing values in the inserted LABELS column
print(LIAR_encoded['LABELS'].isna().sum())

0


In [38]:
# Persist the encoded LIAR frame (labels plus word-presence columns) as a pickle
output_file = 'LIAR_onehot.pkl'
LIAR_encoded.to_pickle(path=output_file)