In [6]:
# Install BERTopic via the %pip magic. No version is pinned, so a rerun may pick up a different release.
%pip install bertopic

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [8]:
from bertopic import BERTopic

import pandas as pd
import numpy as np
import json
import os

In [4]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Directory containing the Wikipedia parquet files. Named `wiki_dir` so the
# builtin `dir()` is not shadowed for the rest of the notebook.
wiki_dir = '/kaggle/input/wikipedia-20230701'

# The index parquet is excluded by name; filtering (rather than list.remove)
# means a missing index file does not raise ValueError.
index_file = 'wiki_2023_index.parquet'
file_paths = [
    os.path.join(wiki_dir, name)
    for name in sorted(os.listdir(wiki_dir))
    if name != index_file
]
if not file_paths:
    raise FileNotFoundError(f"No parquet files to sample found in {wiki_dir}")

# Fraction of rows sampled from every file (same seed for each file).
fraction = 0.01
samples = [
    dd.read_parquet(path).sample(frac=fraction, random_state=42)
    for path in file_paths
]

# Concatenate once instead of re-concatenating inside a loop, which would
# nest one concat per file in the lazy task graph.
big_data = dd.concat(samples, ignore_index=True)

In [None]:
# Attempted topic modelling by first pre-fitting topics on a subset of Wikipedia.
# However, it was not possible to sample a particularly large portion of Wikipedia
# due to memory constraints, and this approach ended up with more topic outliers.
# Materialise the sampled Dask collection in memory.
big_data_pd = big_data.compute()
fit_docs = big_data_pd['text'].tolist()
model = BERTopic()
# The return value of fit_transform is discarded; only the fitted `model` is kept.
_ = model.fit_transform(fit_docs)

In [9]:
# Topic modelling fitted directly on the news articles themselves

file_path = '/kaggle/input/wikinews-data-converter-2-final-stage-3/enwikinews-processed.parquet'

# Read the parquet file, discard the namespace column, then keep only rows
# without any missing value.
w_data = (
    pd.read_parquet(file_path)
    .drop(columns=['page_namespace'])
    .dropna()
)

# Plain Python list of the extracted article texts, used as the model input.
docs = w_data['page_text_extract_result'].tolist()

In [10]:
# Fit a BERTopic model with default settings on `docs`; `topics` and `probs`
# are the two values returned by fit_transform.
# NOTE(review): no random seed is configured, so topic ids/counts may differ between runs — confirm.
model2 = BERTopic()
topics, probs = model2.fit_transform(docs)

In [17]:
# The top-level pd.value_counts() is deprecated (pandas 2.1+); wrap in a Series instead.
pd.Series(topics).value_counts()  # the -1 row is the number of articles not assigned a topic

-1      6943
 0       558
 1       326
 2       197
 3       158
        ... 
 366      11
 367      10
 368      10
 369      10
 370      10
Name: count, Length: 372, dtype: int64

In [13]:
# Attach the per-article model output to the article table as two new columns.

w_data = w_data.assign(
    assigned_topic_num=topics,
    topic_probability=probs,
)

In [14]:
# Keep only articles that were assigned a topic (-1 marks unassigned ones) and
# drop the columns not needed downstream. A non-inplace drop returns a new
# frame, so later column assignments on `topical` do not act on a slice of
# `w_data` (the source of SettingWithCopyWarning). `axis=1` was redundant
# alongside `columns=`.
topical = (
    w_data.loc[w_data['assigned_topic_num'] != -1]
    .drop(columns=['revision_id', 'page_id', 'page_text'])
)

In [15]:
import re

def clean_dates(text):
    """Extract the first "Month D, YYYY" date from the first entry of ``text``.

    Parameters
    ----------
    text : sequence
        Only ``text[0]`` is inspected; it is matched case-insensitively
        against a full English month name followed by ``D, YYYY``.

    Returns
    -------
    dict or float
        ``{"Year", "Month", "Day", "Hour", "Minute", "Second"}`` with integer
        values (month as 1-12, time fields always 0), or ``np.nan`` when
        ``text`` is empty / not indexable, its first entry is not a string,
        or no date is found.
    """
    month_names = (
        'january', 'february', 'march', 'april', 'may', 'june',
        'july', 'august', 'september', 'october', 'november', 'december',
    )

    # Missing or empty input yields NaN instead of raising.
    try:
        first = text[0]
    except (TypeError, IndexError, KeyError):
        return np.nan
    if not isinstance(first, str):
        return np.nan

    # The text is lowercased before matching, so lowercase names suffice.
    pattern = r'\b(' + '|'.join(month_names) + r') (\d{1,2}), (\d{4})\b'
    match = re.search(pattern, first.lower())
    if match is None:
        return np.nan

    # Capture groups are ordered (month name, day, year).
    month_name, day, year = match.groups()
    return {
        "Year": int(year),
        "Month": month_names.index(month_name) + 1,
        "Day": int(day),
        "Hour": 0,
        "Minute": 0,
        "Second": 0
        }
# Derive the article date from `page_dates`, then drop rows with any missing
# value (including rows where no date was found). `assign` builds a new frame
# rather than writing a column into what may be a filtered slice of another
# DataFrame, which avoids SettingWithCopyWarning.
topical = topical.assign(
    article_date=topical['page_dates'].apply(clean_dates)
).dropna()

In [16]:
from dateutil.parser import parse

def parsedatetodict(timestamp):
    """Parse ``timestamp`` with ``dateutil.parser.parse`` into a dict of date parts.

    Parameters
    ----------
    timestamp : str
        Date/time text to parse.

    Returns
    -------
    dict or float
        ``{"Year", "Month", "Day", "Hour", "Minute", "Second"}`` taken from
        the parsed datetime, or ``np.nan`` when the value cannot be parsed.
    """
    try:
        dt = parse(timestamp)
    except (ValueError, OverflowError, TypeError):
        # Unparseable text (dateutil's ParserError is a ValueError), dates out
        # of range, or non-string input. Other exceptions are left to
        # propagate so genuine programming errors are not silently hidden.
        # np.nan replaces np.NaN, which was removed in NumPy 2.0.
        return np.nan
    return {
        "Year": dt.year,
        "Month": dt.month,
        "Day": dt.day,
        "Hour": dt.hour,
        "Minute": dt.minute,
        "Second": dt.second
        }


# Replace each timestamp column with the dict form of its first entry.
# NOTE(review): `[0]` is applied to every cell; if `last_update_timestamp`
# holds plain strings rather than sequences, this passes only the first
# character to the parser — confirm the column schema.
for _column in ('page_dates_parsed', 'last_update_timestamp'):
    topical[_column] = topical[_column].apply(lambda cell: parsedatetodict(cell[0]))

# Rows where any value is missing (e.g. a timestamp failed to parse) are removed.
topical.dropna(inplace=True)

In [137]:
# Remove the raw date and text columns, then renumber rows from 0.
topical = (
    topical
    .drop(columns=['page_dates', 'page_text_extract_result'])
    .reset_index(drop=True)
)

In [18]:
# Write the final table to the working directory in two formats:
# JSON as a list of row records, and parquet without the index.
topical.to_json('topical_output.json', orient='records')
topical.to_parquet('topical_output.parquet', index=False)

In [19]:
import pickle

# Mapping of topic id -> list of (word, score) pairs from the fitted model.
topic_terms = model2.get_topics()

# For each topic keep the single (word, score) pair with the highest score.
# A new dict is built instead of overwriting entries of the returned mapping,
# because that mapping may be the model's own internal state (verify against
# the BERTopic API); mutating it would corrupt later calls on `model2`.
# Topics with no terms are skipped so max() cannot fail on an empty list.
# NOTE: `topics` is rebound here from the per-document assignments to this
# per-topic dict, as in the original cell.
topics = {
    topic_id: max(terms, key=lambda pair: pair[1])
    for topic_id, terms in topic_terms.items()
    if terms
}

# Persist the reduced mapping as JSON and as a pickle.
with open("topics.json", "w") as file:
    json.dump(topics, file)
with open("topics.pkl", "wb") as file:
    pickle.dump(topics, file)