In [1]:
import logging
import os
import operator

import joblib
import matplotlib.pyplot as plt
import pandas as pd
# Plotly graphs have more features than seaborn, like interactive hover text & zoom, but they don't show up in pdfs
import plotly.express as px
import pyspark.sql.functions as fn
from pyspark.ml.linalg import Vectors, VectorUDT
import seaborn as sns
import numpy as np

from pyspark.ml.stat import Summarizer

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import ihop

import ihop.community2vec as ic2v
import ihop.import_data as iid
import ihop.text_processing as itp
import ihop.clustering as ic


In [2]:
C2V_MODEL_PATH = "../data/community2vec/RC_2021-05/best_model/keyedVectors"

# This data was produced by the bagOfWords_preprocessing_databricks.ipynb notbook, it removes deleted comments/submissions and comments from top most commenting users, joins comments and submissions, but has no text preprocessing
# These are essentially the same steps as ihop.import_data bow 
REDDIT_THREADS_PATH = "../data/bagOfWords/2021-05_to_2021-06_joined_submissions_comments_5percentTopUsersExcludedFromComments_02102022.parquet"

In [3]:
spark = ihop.utils.get_spark_session("Cluster Labels Notebook",config={"spark.driver.memory":"36G", "spark.master":"local[*]"})#, "spark.driver.extraJavaOptions":"-XX:+UseG1GC"})

22/06/27 11:39:37 WARN Utils: Your hostname, virginia-beastbox resolves to a loopback address: 127.0.1.1; using 10.3.40.174 instead (on interface wlp147s0)
22/06/27 11:39:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/27 11:39:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/06/27 11:39:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
2022-06-27 11:39:38,811 : INFO : Spark configuration: [('spark.app.name', 'Cluster Labels Notebook'), ('spark.driver.port', '44927'), ('spark.app.id', 'local-1656344377966'), ('spark.driver.memory', '36G'), ('spark.app.startTime', '1656344377333'), ('spark.executor.id', 'driver'), ('spark.driver.host', 

In [4]:
dataframe = spark.read.parquet(REDDIT_THREADS_PATH)#.limit(10000)

In [5]:
dataframe.columns

['subreddit',
 'author',
 'created_utc',
 'id',
 'score',
 'selftext',
 'title',
 'url',
 'fullname_id',
 'comments_subreddit',
 'comments_id',
 'parent_id',
 'comments_score',
 'link_id',
 'comments_author',
 'body',
 'comments_created_utc',
 'time_to_comment_in_seconds']

In [6]:
print(dataframe.head(1))

[Row(subreddit='superleague', author='SL_Thread', created_utc='1619960426', id='n135uh', score=7, selftext="||SUNDAY||\n|:--|:--:|--:|\n||15:00\n|[](#game-hudds)|[](#vs)|[](#game-leeds)\n||JOHN SMITH'S STADIUM\n||[OurLeague Stream](/r/OL)\n___\n\n**Remember to comment your thoughts below and upvote the thread!**", title='Sunday Match Thread | Round Five', url='https://www.reddit.com/r/superleague/comments/n135uh/sunday_match_thread_round_five/', fullname_id='t3_n135uh', comments_subreddit='superleague', comments_id='gwoa7gy', parent_id='t3_n135uh', comments_score=1, link_id='t3_n135uh', comments_author='longaltifrog', body='Cracking game , nerves were going at the end', comments_created_utc=1619970474, time_to_comment_in_seconds=10048.0)]


                                                                                

In [7]:
# reproduce some of the work from ihop.text_processing.py to concat submission and comments within a time range of 3s-3d, but don't use TF-IDF 
corpus = itp.SparkCorpus.init_from_joined_dataframe(dataframe, max_time_delta=60*60*72, min_time_delta=3)
corpus.document_dataframe.columns

['id', 'subreddit', 'document_text']

In [8]:
text_pipeline = itp.SparkTextPreprocessingPipeline(input_col = "document_text", maxDF=0.80, minDF=0.02)
subreddit_term_freq_df = text_pipeline.fit_transform(corpus.document_dataframe).drop("tokenized", "document_text", "tokensNoStopWords").groupBy("subreddit").agg(
    Summarizer.sum(fn.col("vectorized")).alias("sum_vectorized")
)

2022-06-27 11:39:42,081 : INFO : Parameters for SparkTextPreprocessingPipeline: {'self': <ihop.text_processing.SparkTextPreprocessingPipeline object at 0x7fcb0cf3d8b0>, 'input_col': 'document_text', 'output_col': 'vectorized', 'tokens_col': 'tokenized', 'filtered_tokens_col': 'tokensNoStopWords', 'tokenization_pattern': '([\\p{L}\\p{N}#@][\\p{L}\\p{N}\\p{Pd}\\p{Pc}\\p{S}\\p{P}]*[\\p{L}\\p{N}])|[\\p{L}\\p{N}]|[^\\p{P}\\s]', 'match_gaps': False, 'toLowercase': True, 'stopLanguage': 'english', 'stopCaseSensitive': False, 'is_skip_vectorization': False, 'maxDF': 0.8, 'minDF': 0.02, 'minTF': 0.0, 'binary': False, 'useIDF': False}
2022-06-27 11:39:42,097 : INFO : Using RegexTokenizer with following parameters: {inputCol: document_text, outputCol: tokenized, pattern: ([\p{L}\p{N}#@][\p{L}\p{N}\p{Pd}\p{Pc}\p{S}\p{P}]*[\p{L}\p{N}])|[\p{L}\p{N}]|[^\p{P}\s], toLowercase: True, gaps: False}
2022-06-27 11:39:42,137 : INFO : Using StopWordsRemover with the following parameters: {inputCol: tokenized,

In [9]:
# Save vocabulary
id_to_term = text_pipeline.get_id_to_word()
print(len(id_to_term))
print(list(id_to_term.items())[0:20])
with open("id_to_term.joblib", 'wb') as f:
    joblib.dump(id_to_term, f)

699
[(0, 'like'), (1, 'get'), (2, 'one'), (3, 'people'), (4, 'think'), (5, 'time'), (6, 'know'), (7, 'good'), (8, 'really'), (9, 'also'), (10, 'even'), (11, 'much'), (12, 'want'), (13, 'see'), (14, 'it’s'), (15, 'i’m'), (16, 'make'), (17, 'still'), (18, 'got'), (19, 'go')]


In [10]:
pandas_df = subreddit_term_freq_df.toPandas()

                                                                                

In [11]:
pandas_df["numpy_vectorized"] = pandas_df["sum_vectorized"].apply(lambda x: x.toArray())
pandas_df = pandas_df.drop("sum_vectorized", axis=1)

pandas_df.dtypes


subreddit           object
numpy_vectorized    object
dtype: object

In [12]:
total_corpus_counts = np.sum(pandas_df["numpy_vectorized"])
print("Array transform to numpy:", total_corpus_counts)
print("Vocab size check:", len(total_corpus_counts))
total_tokens = np.sum(total_corpus_counts)
print("Total tokens in corpus:", total_tokens)

Array transform to numpy: [17171465. 11455174. 10660641.  8591402.  8150215.  7856228.  7642621.
  7435428.  7007795.  6909148.  6223672.  5817422.  5642260.  5513149.
  5508071.  5328943.  5257921.  5239334.  5174020.  5099136.  5021278.
  4522704.  4478883.  4441071.  4436014.  4301082.  4231464.  4169873.
  4090440.  3912745.  3906303.  3829548.  3822345.  3771797.  3743472.
  3738377.  3619695.  3588361.  3574734.  3536373.  3518774.  3506830.
  3484572.  3410769.  3347400.  3240375.  3108731.  3102848.  3054888.
  3038292.  3008993.  3000398.  2998319.  2959405.  2899682.  2895671.
  2890123.  2866146.  2852069.  2851592.  2817379.  2783793.  2776669.
  2759524.  2724603.  2712428.  2707624.  2704421.  2672268.  2668705.
  2615804.  2558879.  2556918.  2542517.  2535886.  2520314.  2489178.
  2482391.  2440037.  2424375.  2421008.  2407372.  2398398.  2360801.
  2339463.  2289051.  2283343.  2281059.  2280545.  2277136.  2218185.
  2137211.  2126091.  2123373.  2114481.  2086930. 

In [13]:
def compute_token_probabilities(token_count_pdf, vectorized_col):
    token_count_array = np.sum(token_count_pdf[vectorized_col])
    total_tokens = np.sum(token_count_array)
    return token_count_array/total_tokens

def compute_pmi(token_count_pdf, vectorized_col, total_term_probabilities):
    """Returns numpy array storing pointwise mutual information between given dataframe values and the overall corpus.

    :param token_count_pdf: _description_
    :param vectorized_col: 
    :param total_term_probabilities: _description_
    """
    # Compute P(t|class), the conditional proability of the 
    conditional_probs = compute_token_probabilities(token_count_pdf, vectorized_col)
    pmis = np.log2(conditional_probs / total_term_probabilities)
    return pmis

def compute_differential_cluster_label_scheme(token_count_pdf, vectorized_col, total_term_probabilities):
    """Returns Popescul and Ungars method for cluster labeling given dataframe values and the overall corpus.

    :param token_count_pdf: _description_
    :param vectorized_col: 
    :param total_term_probabilities: _description_
    """
    # Compute P(t|class), the conditional proability of the 
    conditional_t_given_class = compute_token_probabilities(token_count_pdf, vectorized_col)
    score_t_given_class = np.square(conditional_t_given_class) / total_term_probabilities
    return score_t_given_class

In [14]:
# Computes P(t), the probability of the term in the corpus overall
corpus_term_probabilities = compute_token_probabilities(pandas_df, "numpy_vectorized")

In [15]:
selected_subreddits = ["4chan", "Utah", "MensRights", "conservatives", "libertarianmemes"]
k = 10
for s in selected_subreddits:
    for (score_name, score_func) in [("PMI", compute_pmi), ("Popescul Ungars", compute_differential_cluster_label_scheme)]:
 
        selected_pdf = pandas_df[pandas_df["subreddit"]==s]
        display(selected_pdf)

        pmi_values = score_func(selected_pdf, "numpy_vectorized", corpus_term_probabilities)

        top_pmi_indices = np.argpartition(pmi_values, -k)[-k:]
        print("Top", score_name, "values for subreddit:", s)
        print(f"\tTerm\t{score_name}")
        for i in top_pmi_indices:
            print(f"\t{id_to_term[i]}\t{pmi_values[i]}")
        print()
        bottom_pmi_indices = np.argpartition(pmi_values,k)[:k]
        print("Bottom", score_name, "values for subreddit:", s)
        print(f"\tTerm\t{score_name}")
        for i in bottom_pmi_indices:
            print(f"\t{id_to_term[i]}\t{pmi_values[i]}")



Unnamed: 0,subreddit,numpy_vectorized
631,4chan,"[3900.000000000002, 2175.0000000000005, 2094.0..."


Top PMI values for subreddit: 4chan
	Term	PMI
	funny	1.6326218445828053
	country	1.7575841800248684
	gt	2.0065807731614864
	lmao	1.8857887970134803
	word	1.6571239294465787
	white	2.074934736050337
	fucking	2.4702749726810627
	fuck	2.117571886355602
	based	2.669335834035875
	shit	2.1022675090851637

Bottom PMI values for subreddit: 4chan
	Term	PMI
	players	-3.930038806298334
	player	-2.871965964359553
	😍	-4.081181041706523
	update	-2.4540154769062417
	wondering	-2.016417336036202
	hi	-2.837538985167364
	team	-2.240384651770118
	hoping	-2.0931878082689606
	season	-2.3715517150795935
	damage	-2.288096331473424


Unnamed: 0,subreddit,numpy_vectorized
631,4chan,"[3900.000000000002, 2175.0000000000005, 2094.0..."


Top Popescul Ungars values for subreddit: 4chan
	Term	Popescul Ungars
	one	0.011962367209359764
	get	0.012010578497592081
	white	0.015488217530574892
	gt	0.019373224443915812
	shit	0.03830556296804033
	fuck	0.029843137516142204
	based	0.03992174586372757
	people	0.04258978327868785
	fucking	0.039187056121834554
	like	0.025761370211717018

Bottom Popescul Ungars values for subreddit: 4chan
	Term	Popescul Ungars
	hoping	2.549705971380445e-05
	update	1.88100111808186e-05
	player	1.5643391374097758e-05
	damage	3.3998611912131183e-05
	players	4.5078608141616054e-06
	wondering	3.396705987300427e-05
	😍	2.0297481585038696e-06
	hi	1.361800188460335e-05
	season	3.6513591651925655e-05
	added	3.967909996850566e-05


Unnamed: 0,subreddit,numpy_vectorized
109,Utah,"[673.0000000000001, 540.0000000000003, 454.999..."


Top PMI values for subreddit: Utah
	Term	PMI
	lots	1.414961403592092
	local	1.5446260921305002
	moving	1.598783063882577
	area	2.399649213948196
	beautiful	1.7456348435486668
	state	3.1372640660810633
	city	2.841698681735163
	drive	2.4925070759879273
	near	1.8385111752859693
	water	2.7434189391465176

Bottom PMI values for subreddit: Utah
	Term	PMI
	videos	-inf
	players	-inf
	player	-inf
	update	-4.463517109682767
	😍	-3.5057201737618926
	game	-3.8414669655290274
	girl	-3.45728316126113
	series	-3.2755249579549517
	games	-3.201061851007929
	style	-3.198356953227137


  pmis = np.log2(conditional_probs / total_term_probabilities)


Unnamed: 0,subreddit,numpy_vectorized
109,Utah,"[673.0000000000001, 540.0000000000003, 454.999..."


Top Popescul Ungars values for subreddit: Utah
	Term	Popescul Ungars
	place	0.01031517509504337
	one	0.01128732533916695
	get	0.01479576407431094
	like	0.015331136409848302
	city	0.033213616820400735
	state	0.06306228077298107
	people	0.03391204994481017
	area	0.021341479994225122
	drive	0.01872680985526923
	water	0.047482351064936965

Bottom Popescul Ungars values for subreddit: Utah
	Term	Popescul Ungars
	character	1.7445821117245923e-05
	style	5.577372746221594e-06
	focus	1.2771177735558381e-05
	players	0.0
	player	0.0
	😍	4.507171891584577e-06
	videos	0.0
	series	7.930326080853805e-06
	girl	6.991596784306227e-06
	update	1.160241827694808e-06


Unnamed: 0,subreddit,numpy_vectorized
72,MensRights,"[4036.999999999999, 2821.000000000001, 2578.99..."


Top PMI values for subreddit: MensRights
	Term	PMI
	human	1.5442724450090743
	fact	1.549393711560967
	health	1.5698787628567672
	man	1.9367389193218343
	simply	1.740783431813583
	problems	1.8442367411991258
	sub	1.5943485622675861
	social	1.9574657618460578
	support	2.002505995476581
	issues	2.203955750760912

Bottom PMI values for subreddit: MensRights
	Term	PMI
	update	-4.398647429625132
	players	-4.704745757574912
	😍	-7.025812994425413
	player	-4.231635416357287
	awesome	-3.067739002895289
	season	-3.775615286435782
	played	-2.891951128103457
	team	-2.679488571262257
	game	-3.179356454971799
	series	-2.7367240895649037


Unnamed: 0,subreddit,numpy_vectorized
72,MensRights,"[4036.999999999999, 2821.000000000001, 2578.99..."


Top Popescul Ungars values for subreddit: MensRights
	Term	Popescul Ungars
	saying	0.011955155326173113
	support	0.016843905542828642
	say	0.013064772694893331
	like	0.016765428445961128
	get	0.012271820340296813
	think	0.012245987297997935
	even	0.0174989201468032
	people	0.030018883917453266
	man	0.03493713952005967
	issues	0.020375485604115375

Bottom Popescul Ungars values for subreddit: MensRights
	Term	Popescul Ungars
	players	1.5401053794964539e-06
	player	2.3753539479429676e-06
	😍	3.424499192543898e-08
	season	5.213436074682932e-06
	update	1.2694159682930376e-06
	awesome	1.17089714072208e-05
	store	1.5605222670340212e-05
	played	1.5631002617460586e-05
	series	1.6737149341425625e-05
	dark	1.8233094112020016e-05


Unnamed: 0,subreddit,numpy_vectorized
1206,conservatives,"[1035.9999999999998, 653.0, 654.9999999999999,..."


Top PMI values for subreddit: conservatives
	Term	PMI
	won’t	1.5002975096422868
	social	1.506064032407606
	stupid	1.6343570135500571
	history	1.6526291112332
	says	1.7984304039113945
	white	3.151878193587896
	black	2.5576171157772993
	state	2.5734764443005274
	country	2.5642408476113876
	left	1.8515002940425347

Bottom PMI values for subreddit: conservatives
	Term	PMI
	hi	-4.388907621467504
	😍	-4.13004933747748
	series	-3.899854121670539
	season	-3.557923534600486
	recommend	-3.3415389498994448
	player	-3.3358717594093537
	hello	-3.1911359807718513
	players	-3.171552180011687
	op	-2.6698150032244268
	amp;#x200b	-2.5757192237714968


Unnamed: 0,subreddit,numpy_vectorized
1206,conservatives,"[1035.9999999999998, 653.0, 654.9999999999999,..."


Top Popescul Ungars values for subreddit: conservatives
	Term	Popescul Ungars
	says	0.013111929416861534
	right	0.013291042136870785
	us	0.016008405014019886
	left	0.018457254084803615
	country	0.02641952104283382
	black	0.03509725626006695
	white	0.06892646543408866
	people	0.06303906736373362
	state	0.02886261678190653
	like	0.01528903656470373

Bottom Popescul Ungars values for subreddit: conservatives
	Term	Popescul Ungars
	hello	5.454473713112837e-06
	hi	1.5852450371166933e-06
	series	3.337393641708311e-06
	😍	1.896795498658902e-06
	recommend	6.552632093041161e-06
	season	7.049974513466076e-06
	player	8.22302843213862e-06
	players	1.2901046774814943e-05
	x	1.7137806198255207e-05
	op	1.8266817506049435e-05


Unnamed: 0,subreddit,numpy_vectorized


Top PMI values for subreddit: libertarianmemes
	Term	PMI
	gets	nan
	took	nan
	agree	nan
	pay	nan
	call	nan
	family	nan
	u	nan
	true	nan
	told	nan
	welcome	nan

Bottom PMI values for subreddit: libertarianmemes
	Term	PMI
	knew	nan
	shot	nan
	hi	nan
	anymore	nan
	version	nan
	thats	nan
	happens	nan
	space	nan
	plus	nan
	worked	nan


  return token_count_array/total_tokens


Unnamed: 0,subreddit,numpy_vectorized


Top Popescul Ungars values for subreddit: libertarianmemes
	Term	Popescul Ungars
	gets	nan
	took	nan
	agree	nan
	pay	nan
	call	nan
	family	nan
	u	nan
	true	nan
	told	nan
	welcome	nan

Bottom Popescul Ungars values for subreddit: libertarianmemes
	Term	Popescul Ungars
	knew	nan
	shot	nan
	hi	nan
	anymore	nan
	version	nan
	thats	nan
	happens	nan
	space	nan
	plus	nan
	worked	nan


  return token_count_array/total_tokens
