In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import random
import numpy as np
import torch
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from feature_lists import *

In [2]:
# Authenticate this session with the Hugging Face Hub. login() prompts
# interactively for an access token (the widget shown below is its output).
# NOTE(review): presumably needed so later cells can download model weights —
# confirm whether authentication is actually required for this notebook.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Load the two feature tables for the Reddit data: one CSV with chat-level
# output and one with conversation-level output. Paths are relative to the
# notebook's working directory.
reddit_chat_level_data, reddit_conv_level_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'output/chat/combined_reddit_output_chat_level.csv',
        'output/conv/combined_reddit_output_conv_level.csv',
    )
)

In [4]:
# The return value of compute_weights is discarded, so it presumably adds its
# result to reddit_chat_level_data in place — TODO confirm against its definition.
compute_weights(reddit_chat_level_data)
# Build the single frame used by every later cell from the chat-level and
# conversation-level tables (later cells read its 'message' column).
# NOTE(review): presumably this must run after compute_weights — confirm.
reddit_combined = aggregate_chat_level_features(reddit_chat_level_data,reddit_conv_level_data)

In [5]:
# Single seed shared by every random number generator touched in this notebook
# (Python's `random`, NumPy, PyTorch and, further down, UMAP).
SEED = 42

np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
# Seed the GPU generators as well, but only when a CUDA device is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Raw message texts, one list entry per row of reddit_combined.
documents = reddit_combined['message'].tolist()

def preprocess_text(text, stop_words=None):
    """Remove stop words from a whitespace-tokenised text.

    Tokens are produced by ``str.split()`` (so runs of whitespace collapse),
    compared case-insensitively against ``stop_words`` and the survivors are
    re-joined with single spaces. Kept tokens retain their original casing
    and punctuation.

    Parameters
    ----------
    text : str or None
        The message to clean. Missing values (``None`` or a float NaN, which
        is how pandas represents an empty cell) yield an empty string instead
        of raising ``AttributeError``. Any other non-string value is converted
        with ``str()`` first.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to ``ENGLISH_STOP_WORDS``.

    Returns
    -------
    str
        The text with stop words removed; ``''`` when nothing remains.
    """
    if stop_words is None:
        stop_words = ENGLISH_STOP_WORDS

    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    kept_tokens = [
        token for token in str(text).split()
        if token.lower() not in stop_words
    ]
    return ' '.join(kept_tokens)

# Stop-word-stripped copy of every message; `documents` keeps the untouched
# originals for reporting.
processed_documents = list(map(preprocess_text, documents))

# Components handed to BERTopic. UMAP receives the shared seed so its
# projection is repeatable; HDBSCAN is created with prediction_data=True,
# which it needs in order to assign topics in the transform() call below.
umap_model = UMAP(random_state=SEED)
hdbscan_model = HDBSCAN(
    metric='euclidean',
    min_cluster_size=10,
    prediction_data=True,
    cluster_selection_method='eom',
)

# Fit on the cleaned messages, then merge the discovered topics down to 31.
topic_model = BERTopic(hdbscan_model=hdbscan_model, umap_model=umap_model)
initial_topics, probabilities = topic_model.fit_transform(processed_documents)
topic_model = topic_model.reduce_topics(processed_documents, nr_topics=31)

# Topic assignment for every message under the reduced model.
# NOTE(review): these labels come from transform(), whereas topic_info below is
# built from the model's fitted state — confirm the two are meant to be mixed.
reduced_topics, reduced_probabilities = topic_model.transform(processed_documents)
topic_info = topic_model.get_topic_info()

# Original (un-cleaned) message next to its topic id, plus the same ids
# attached to the main frame.
document_topic_mapping = pd.DataFrame(
    {'Document': documents, 'Assigned Topic': reduced_topics}
)
reddit_combined['Assigned Topic'] = reduced_topics

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [6]:
# One 0/1 indicator column per topic id, named after the id as a string
# ('-1' through '29'; -1 is BERTopic's outlier topic).
topic_columns = [str(topic_id) for topic_id in range(-1, 30)]
assigned_topic_str = reddit_combined['Assigned Topic'].astype(str)

# Vectorised one-hot encoding: a single column-wise comparison per topic
# replaces the row-by-row iterrows()/.at loop. This is much faster and, unlike
# .at, cannot write to several rows at once when the index has duplicate labels.
# Rows whose topic id is outside topic_columns get 0 everywhere, as before.
for column in topic_columns:
    reddit_combined[column] = (assigned_topic_str == column).astype('int64')

# Added last so the column order matches the previous implementation.
reddit_combined['Assigned Topic String'] = assigned_topic_str

In [7]:
# Pair every topic id (the current one-hot column name) with BERTopic's
# descriptive topic name. Both lists are read from the same rows of topic_info,
# so ids and names stay correctly paired even when the fitted topic ids are not
# exactly -1..29 (e.g. no outlier topic); zipping a hard-coded range against
# the names by position would silently mislabel columns in that case.
old_headers = topic_info['Topic'].astype(str).tolist()
new_headers = topic_info['Name'].tolist()

### NOTE: these are from an old run that we couldn't reproduce (no seeds), so we are not using them.
# new_headers = [
#     'Residual Topic', 'Trump', 'Gender', 'Government, War and Taxes', 'Abortion',
#     'Sexual Violence', 'Veganism', 'Education', 'Gun Violence', 'Drugs and Alcohol',
#     'Art and Movies', 'Depression, Suffering, Suicide', 'The Universe and Space',
#     'Gaming', 'Copyright and Piracy', 'Gender, Sports and Military', 'Food',
#     'Tipping Culture', 'Restrooms', 'Marriage, Divorce and Relationships',
#     'Intelligence', 'Israel-Palestine', 'Sports', 'Bullying', 'Superheros',
#     'Aircrafts', 'Metrics versus imperial systems', 'Cutlery and Dining',
#     'Coding and Code Editors', 'Tattoos', 'Clocks and Batteries'
# ]

# Create a mapping dictionary from old to new headers
header_mapping = dict(zip(old_headers, new_headers))

# Rename the columns; rename() returns a new DataFrame and ignores mapping keys
# that have no matching column.
reddit_combined = reddit_combined.rename(columns=header_mapping)

In [9]:
# Display the final frame (pandas truncates the rendering) to check that the
# one-hot topic columns now carry the descriptive topic names.
reddit_combined

Unnamed: 0,conversation_num,positive_bert,negative_bert,neutral_bert,num_words,num_chars,num_messages,info_exchange_zscore_chats,discrepancies_lexical_wordcount,hear_lexical_wordcount,...,21_coffee_starbucks_mcdonalds_taste,22_marriage_married_divorce_spouse,23_batman_superman_joker_hes,24_flight_plane_seat_airline,25_patriotism_nationalism_country_nation,26_celsius_fahrenheit_metric_scale,27_vim_editor_tools_text,28_santa_christmas_holiday_holidays,29_tattoo_tattoos_meaning_people,Assigned Topic String
0,1000_A,0.031993,0.453143,0.514863,26.200000,150.600000,1.0,-0.767499,0.600000,0.000000,...,0,0,0,0,0,0,0,0,0,-1
1,1000_B,0.140667,0.266719,0.592614,68.200000,392.800000,1.0,0.286746,1.800000,0.800000,...,0,0,0,0,0,0,0,0,0,-1
2,1001_B,0.122814,0.364444,0.512742,51.500000,292.500000,1.0,-0.069496,1.000000,0.500000,...,0,0,0,0,0,0,0,0,0,0
3,1002_A,0.121726,0.079516,0.798758,103.000000,529.000000,1.0,1.248646,0.000000,0.000000,...,0,0,0,0,0,0,0,0,0,0
4,1002_B,0.065243,0.539279,0.395478,58.307692,308.769231,1.0,0.005015,1.692308,0.076923,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12625,e8qm4aj,0.022125,0.599268,0.378607,73.000000,414.000000,1.0,0.470093,0.500000,0.000000,...,0,0,0,0,0,0,0,0,0,-1
12626,e8qphy5,0.023139,0.321441,0.188753,16.266667,102.133333,1.0,-1.011284,0.400000,0.133333,...,0,0,0,0,0,0,0,0,0,1
12627,e8qq561,0.113703,0.453994,0.432303,67.615385,376.538462,1.0,0.245378,1.000000,0.538462,...,0,0,0,0,0,0,0,0,0,1
12628,e8qzjei,0.054249,0.543698,0.402052,44.612903,249.064516,1.0,-0.314611,1.516129,0.322581,...,0,0,0,0,0,0,0,0,0,-1
