In [9]:
import pandas as pd
import pickle

input_file = 'articles_preprocessed_1mio.pkl'
dfs = []

with open(input_file, "rb") as file:
    while True:
        try:
            chunk = pickle.load(file)
            dfs.append(chunk)
        except EOFError:
            break
df = pd.concat(dfs, ignore_index=True)
df.shape

(581918, 2)

In [2]:
print(type(df.loc[0, "content"]))#Debug

<class 'list'>


In [3]:
df.columns #debug

Index(['content', 'LABEL'], dtype='object')

In [10]:
from collections import Counter
def get_vocab(article_series,n):
    """Takes a Pandas series of lists of words and returns a dictionary
        of the n most common words and their frequencies.
    """
    all_words = [word 
                for article in article_series 
                if isinstance(article, (list,str))
                for word in (article if isinstance(article, list) else article.split())
    ]
    word_counts = Counter(all_words)
    return set(word for word, _ in word_counts.most_common(n))

In [11]:
vocabulary = get_vocab(df['content'], 10000)

In [5]:
import numpy as np
import scipy.sparse as sp
from sklearn.preprocessing import normalize

def compute_tfidf(docs,vocabulary):
    """Efficient TF-IDF computation using sparse matrices."""
    
    # Step 1: Build Vocabulary (Unique words)
    vocab = {word: idx for idx, word in enumerate(set(vocabulary))}
    num_words = len(vocab)
    num_docs = len(docs)

    # Step 2: Compute Term Frequency (TF) as a Sparse Matrix
    row, col, data = [], [], []
    for doc_idx, doc in enumerate(docs):
        word_counts = Counter(doc)
        for word, count in word_counts.items():
            if word in vocab:
                row.append(doc_idx)
                col.append(vocab[word])
                data.append(count / len(doc))  # TF normalization
    
    tf_matrix = sp.csr_matrix((data, (row, col)), shape=(num_docs, num_words))

    # Step 3: Compute Inverse Document Frequency (IDF)
    df = np.bincount(col)  # Document frequency of each term
    idf = np.log((num_docs + 1) / (df + 1)) + 1  # Smoothed IDF
    
    # Step 4: Compute TF-IDF
    tfidf_matrix = tf_matrix.multiply(idf)  # Element-wise multiplication
    tfidf_matrix = normalize(tfidf_matrix, norm="l2", axis=1)  # Normalize per document

    return tfidf_matrix, vocab

In [12]:
tfidf_matrix, vocab = compute_tfidf(df['content'], vocabulary)

In [23]:
df_tfidf = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=vocab.keys())
df_tfidf['LABEL'] = df['LABEL'].values

In [None]:
df_tfidf

Unnamed: 0,saint,titl,spice,ancestri,valeri,confer,entertain,confirm,td,endeavor,...,hilton,obedi,bar,drank,laptop,cheney,burglari,ampl,annual,LABEL
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
2,0,0.002015,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,0,0.011571,0,0,0.011237,0,0,0,0,0,...,0,0,0.012453,0,0,0,0,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581913,0,0,0.100774,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
581914,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
581915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
581916,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [13]:
import pickle
output_file = 'articles_tf_idf_weighted_1mio.pkl'

with open(output_file, "wb") as f:
    pickle.dump(tfidf_matrix,f)