In [3]:
import pandas as pd
import pickle

input_file = 'articles_preprocessed_1mio.pkl'
dfs = []

# The file holds several pickled objects written back to back (presumably
# DataFrame chunks -- confirm against the preprocessing notebook). Keep
# unpickling until the stream is exhausted, which pickle signals with EOFError.
# NOTE: pickle.load can execute arbitrary code; only read files you produced.
with open(input_file, "rb") as file:
    exhausted = False
    while not exhausted:
        try:
            chunk = pickle.load(file)
        except EOFError:
            exhausted = True
        else:
            dfs.append(chunk)

# Stitch the chunks together under one fresh 0..N-1 index.
df = pd.concat(dfs, ignore_index=True)
df.shape

(686801, 2)

In [4]:
# Debug: check which Python type the first "content" cell holds.
first_content = df.loc[0, "content"]
print(type(first_content))

<class 'list'>


In [5]:
from collections import Counter
def get_vocab(article_series, n):
    """Return the n most common words across all articles with their counts.

    Parameters
    ----------
    article_series : iterable (e.g. pd.Series)
        One entry per article. A ``list`` entry is treated as a list of
        words; a ``str`` entry is split on whitespace. Entries of any other
        type (e.g. NaN/None) are skipped.
    n : int or None
        Number of most frequent words to keep. Passed straight to
        ``Counter.most_common``, so ``None`` keeps every word.

    Returns
    -------
    dict
        Maps word -> frequency, ordered from most to least frequent.
    """
    word_counts = Counter()
    # Feed the counter one article at a time instead of first materialising a
    # single list with every token of the corpus; the resulting counts (and
    # the first-seen order used to break ties) are the same.
    for article in article_series:
        if isinstance(article, list):
            word_counts.update(article)
        elif isinstance(article, str):
            word_counts.update(article.split())
    return dict(word_counts.most_common(n))

In [6]:
# Number of most frequent words kept as one-hot features.
VOCAB_SIZE = 10000
vocabulary = get_vocab(df['content'], n=VOCAB_SIZE)

In [7]:
import numpy as np
def one_hot_encoding_2(article_series, vocabulary, dtype = np.int8):
    """Build a word-presence (one-hot) matrix for a collection of articles.

    Parameters
    ----------
    article_series : pd.Series (or any sized iterable)
        One entry per article. A ``list`` entry is treated as a list of
        words; a ``str`` entry is split on whitespace, mirroring how
        ``get_vocab`` counts words. Entries of any other type (e.g. NaN/None)
        produce an all-zero row.
    vocabulary : dict
        Words as keys; the key order defines the column order.
    dtype : numpy dtype, default np.int8
        Element type of the returned matrix.

    Returns
    -------
    pd.DataFrame
        Shape (len(article_series), len(vocabulary)). Cell [i, w] is 1 when
        word ``w`` occurs in article ``i`` and 0 otherwise. Rows follow the
        iteration order of ``article_series``; the input's index is not
        carried over.
    """
    vocab_list = list(vocabulary.keys())  # Freeze the column order once
    vocab_index = {word: i for i, word in enumerate(vocab_list)}  # word -> column

    # Dense zero matrix of shape (num_articles, vocab_size); this is large for
    # big corpora, which is why a small integer dtype is the default.
    encoded_matrix = np.zeros((len(article_series), len(vocab_list)), dtype=dtype)

    for row, article in enumerate(article_series):
        if isinstance(article, str):
            # Strings are tokenised exactly as in get_vocab, so an article
            # that contributed to the vocabulary is not encoded as all zeros.
            words = article.split()
        elif isinstance(article, list):
            words = article
        else:
            continue  # Missing / non-text entry: leave the row at zero
        for word in words:
            col = vocab_index.get(word)
            if col is not None:
                encoded_matrix[row, col] = 1  # Presence flag, not a count

    return pd.DataFrame(encoded_matrix, columns=vocab_list, dtype=dtype)

In [8]:
# One-hot encode every article against the vocabulary built above.
articles_encoded = one_hot_encoding_2(
    article_series=df['content'],
    vocabulary=vocabulary,
)
# Put the label in front; .values copies it in positionally instead of
# aligning on the index.
articles_encoded.insert(loc=0, column='LABEL', value=df['LABEL'].values)

In [9]:
# Display the encoded frame: LABEL first, then one 0/1 column per vocabulary word.
articles_encoded

Unnamed: 0,LABEL,num,-,said,one,new,time,year,?,would,...,append,poni,playboy,focal,tempera,whitelist,subconsci,arson,vanc,cowardli
0,1.0,0,1,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,1,1,1,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,1,1,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
686796,1.0,1,1,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
686797,1.0,1,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
686798,1.0,0,1,0,1,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
686799,0.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Persist the labelled one-hot matrix as a pickle for later notebooks.
output_file = 'articles_one-hot-encoded_1mio.pkl'
pd.to_pickle(articles_encoded, output_file)