In [3]:
import pandas as pd
from collections import Counter
import numpy as np

**Vocabulary**

In [None]:
def get_vocab(article_series, n):
    """Count words across all articles and keep the n most frequent ones.

    An article that is a list contributes its elements as words; an article
    that is a string is split on whitespace first. Entries of any other
    type are skipped.

    Input: article_series: pd.Series of lists of strings (or strings); n: int
    Output: dict(string, int) mapping word -> frequency, most common first
    """
    tally = Counter()
    for entry in article_series:
        if isinstance(entry, list):
            tally.update(entry)
        elif isinstance(entry, str):
            tally.update(entry.split())
    return dict(tally.most_common(n))

In [9]:
def get_vocab_2(article_series, n):
    """Build a frequency table of the n most common words in the articles.

    List articles contribute their elements directly, string articles are
    split on whitespace first, and entries of any other type are ignored.
    Words are streamed into the counter rather than collected in a list.
    """
    def iter_words():
        # Yield every word of every usable article, in encounter order.
        for doc in article_series:
            if isinstance(doc, list):
                yield from doc
            elif isinstance(doc, str):
                yield from doc.split()

    return dict(Counter(iter_words()).most_common(n))

**One-hot encoding**

In [None]:
def one_hot_encoding(article_series, vocabulary):
    """Encode each article as a row of per-word occurrence counts.

    Despite the name, each cell holds how many times the vocabulary word
    occurs in the article, not just a 0/1 presence flag.

    input:
        article_series: pd.Series whose entries are lists of words or
            whitespace-separated strings; any other entry (e.g. None)
            yields an all-zero row
        vocabulary: dict(keys: strings, values: ints); only the keys are
            used, and their order fixes the column order
    output: pd.DataFrame with one row per article and one column per
        vocabulary word
    """
    columns = list(vocabulary)
    features = []
    for article in article_series:
        # Tokenise string articles by whitespace, matching how get_vocab
        # builds the vocabulary, so they are not silently encoded as zeros.
        if isinstance(article, str):
            words = article.split()
        elif isinstance(article, list):
            words = article
        else:
            words = ()
        article_features = dict.fromkeys(columns, 0)
        for word in words:
            if word in article_features:
                article_features[word] += 1
        features.append(list(article_features.values()))
    return pd.DataFrame(features, columns=columns)

In [None]:
def one_hot_encoding_2(article_series, vocabulary, dtype = np.int8):
    """Encode each article as a 0/1 row marking which vocabulary words occur.

    input:
        article_series: pd.Series whose entries are lists of words or
            whitespace-separated strings; any other entry (e.g. None)
            yields an all-zero row
        vocabulary: dict (words as keys); key order fixes the column order
        dtype: specifies the data type of values in the outputted dataframe

    output:
        pd.DataFrame (one-hot encoded word presence matrix) with one row per
        article and one column per vocabulary word
    """
    vocab_list = list(vocabulary)  # Freeze a consistent column ordering
    vocab_index = {word: i for i, word in enumerate(vocab_list)}  # Word-to-column mapping

    # Start from an all-zero matrix with shape (num_articles, vocab_size)
    encoded_matrix = np.zeros((len(article_series), len(vocab_list)), dtype=dtype)

    for i, article in enumerate(article_series):
        # Tokenise string articles by whitespace, matching how get_vocab
        # builds the vocabulary, so they are not silently encoded as zeros.
        if isinstance(article, str):
            words = article.split()
        elif isinstance(article, list):
            words = article
        else:
            continue
        for word in words:
            if word in vocab_index:
                encoded_matrix[i, vocab_index[word]] = 1  # Mark presence only

    return pd.DataFrame(encoded_matrix, columns=vocab_list, dtype=dtype)