In [37]:
import pandas as pd
import numpy as np

from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [38]:
# Sentiment and Tokenizer functions
def sentiment_flag(text: str) -> str:
    score = analyzer.polarity_scores(text)
    if(score['compound'] >= 0.05):
        return 'pos'
    elif(score['compound'] <= -0.05):
        return 'neg'
    else:
        return 'neu'

def get_polarity(text: str) -> float:
    score = analyzer.polarity_scores(text)
    return score['compound']

def sentence_tokenizer(text: str) -> list[str]:
    tokenized = tokenize.word_tokenize(text)
    return tokenized

In [39]:

# Data Cleaning
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

import re
import string
from bs4 import BeautifulSoup
from markdown import markdown
import contractions
import unicodedata

STOPWORDS = set(stopwords.words('english')) #stopwords
ps = PorterStemmer()

In [40]:
def clean_tokenize(text: str):
    text = markdown(text)
    text = re.sub(r'https?://\S+', '', text, flags=re.MULTILINE)
    text = BeautifulSoup(text, "html.parser").getText() # HTML decoding
    
    text = text.lower()
    text = re.sub(r'0-9', '', text) # remove numbers
    
    text = text.replace('>', '')
    text = text.replace("\\_", '')
    text = text.replace('&amp', '')
    text = text.replace('&gt', '')
    text = text.replace('(<a).*(>).*(</a>)', '')
    text = text.replace('\xa0', '')
    text = text.replace('<br/>', '')
    
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = contractions.fix(text) #contractions # type: ignore
    
    return text

def clean_sentiment(text: str):
    text = markdown(text)
    text = re.sub(r'https?://\S+', '', text, flags=re.MULTILINE)
    text = BeautifulSoup(text, "html.parser").getText() # HTML decoding
    
    text = text.lower() # lowercase text
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore') #symbols
    text = re.sub(r'0-9', '', text) # remove numbers
    text = contractions.fix(text) #contractions # type: ignore
    
    text = text.translate(str.maketrans('', '', string.punctuation)) # type: ignore
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwords from text
    
    return text

In [41]:
def word_count(comment: str) -> int:
    comment_list = comment.split()
    return len(comment_list)

In [42]:
import pyarrow as pa
import time
timestr = time.strftime("%Y%m%d-%H%M%S")

from pyspark.sql import SparkSession

In [43]:
url = "jdbc:postgresql://localhost:5432/reddit_comments"

properties = {
    "user": "admin",
    "password": "theLegend30$$",
    "driver": "org.postgresql.Driver"
}

spark = SparkSession\
    .builder\
    .appName("Incel Comments Extraction (01/2020 - 12/2020)")\
    .config("spark.jars", "/usr/local/postgres/postgresql-42.6.0.jar")\
    .config("spark.driver.memory", "10g")\
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")\
    .getOrCreate()
    
keysubreddits = spark.read \
    .format("jdbc") \
    .option("url", url)\
    .option("dbtable", "targetted_subreddits") \
    .option("user", properties["user"]) \
    .option("password", properties["password"])\
    .option("driver", properties["driver"])\
    .load()
    
keywords = spark.read\
    .format('jdbc')\
    .option("url", url)\
    .option("dbtable", "ig_array") \
    .option("user", properties["user"]) \
    .option("password", properties["password"])\
    .option("driver", properties["driver"])\
    .load()

In [44]:
keysubredditsResult = keysubreddits.select('*').collect()
keywordsResult = keywords.select('*').collect()

In [45]:
spark.stop()

In [46]:
del keywords, keysubreddits

In [47]:
# Now will need to separate keywords list and remove '%'
keysubredditsList = [word.names for word in keysubredditsResult[:]]
keywordsList = [word.replace('%', '') for word in keywordsResult[0].words]

In [48]:
del keysubredditsResult, keywordsResult

In [49]:
del spark, url, properties

In [50]:
incelDF = pd.read_csv('../data/RAW_incelDF-20230615-120758.csv')

# Drop the Unnamed and controversality columns
incelDF = incelDF.drop('controversiality', axis=1)

In [51]:
incelDF.describe()

Unnamed: 0,score
count,109437.0
mean,0.598043
std,4.414665
min,-64.0
25%,-1.0
50%,0.0
75%,2.0
max,203.0


# Things to Do
## EDA
In this section we will perform some basic data exploration. We will see

- ~~The ratio of subreddit types~~

- ~~Size of each subreddits in the search~~

- ~~How many subreddits are there~~

- ~~How many comments each user has made~~

- ~~How many comments are in each subreddit~~

- ~~What the average comment length per subreddit~~

- ~~What the average word count/length for users~~

- ~~What was the average score for each subreddit~~

- ~~What were the keywords used in the search~~

    

## Feature Engineering
Additionally, we will need to create new features here such as:

- ~~Sentiments for each observation~~

- ~~Polarity value for each comment~~

- ~~Word tokenization for each comment~~

- ~~Two sets of clean text for word tokenization and sentiment flagging~~

- ~~Incel Word Frequency per user comment~~

To begin, we imported the incelDF file and dropped the unnamed and controverislality columns from the dataframe. After we pulled data from our local PostgreSQL machine. We used these terms and subreddits to help narrow our search for incel-related langauge. The keywords were supplied by a paper from @gorja and the @incelglossary and were stored in a table. To imprve the search we made use of the PostgreSQL builtin text search engine and added the required symbols to search for these incel terms at the beginning, within or end of words. The subreddits were selected using previously done work on the psyche of incels. From the incel definition and study by [] we made assertions on the type of subreddits we may see incel-related language being used. We tried to avoid using directly incel subreddits or subreddits that act as a counter to incels (i.e IncelTears). in total we have selected 33 subreddits and 152 search words. 

The search was performed by creating a view of 12 tables, each representing a month of the year where each table containing millions of comments. We then created a Apache Spark session and connected to the PostgreSQL database and then queried the view. The size of this view was of concern so we enabled Apache Arrow to improve performance.

In [52]:
subredditsDF = pd.DataFrame(keysubredditsList, columns=["Subreddits"])
print(subredditsDF)

subredditsDF.to_csv('../data/subreddits.csv', index=False)

          Subreddits
0             dating
1            anxiety
2   datingoverthirty
3    datingoverforty
4             advice
5              nudes
6         truerateme
7      dating_advice
8         depression
9            amiugly
10      mentalhealth
11            amihot
12              self
13               sex
14     relationships
15            tinder
16            bumble
17         aspergers
18              adhd
19        ratemycock
20      suicidewatch
21            lonely
22      socialskills
23           college
24      2meirl4meirl
25              rant
26             mgtow
27   pussypassdenied
28      deadbedrooms
29              vent
30            rateme
31          askwomen
32      socialskills
33            virgin


In [53]:
keywordsDF = pd.DataFrame(keywordsList, columns=['List of Search Words'])
print(keywordsDF)

# keywordsDF.to_csv('../data/keywords.csv', index=False)

    List of Search Words
0                  alpha
1             alpha male
2                 ascend
3                  awalt
4                agecuck
..                   ...
147                 chad
148             chadlite
149             gigachad
150               tyrone
151            normalfag

[152 rows x 1 columns]


Starting off with how many subreddit types exist we counted for each row for the type. This yielded two results, either 'public' or 'restricted. A majority of the subreddits (108,959) in this sample were from public communities. Subreddits that are marked as restricted are due to the community and/or its members violation Reddit's sitewide communitiy rules.

In [54]:
del keywordsDF, subredditsDF

In [55]:
subreddit_type_size = incelDF['subreddit_type'].value_counts()
print(subreddit_type_size)

public        109378
restricted        59
Name: subreddit_type, dtype: int64


Since most of the subreddits are public we dropped the restricted ones and will use this public only dataframe for the rest of our EDA.

In [56]:
publicIncelDF = incelDF[(incelDF['subreddit_type'] == 'public')]

Next we find the subreddits that have the highest number of comments made. To do this, the grouped the public Incel DF by subreddit and get the size, which acts as a sum of how many uniques subreddits exists in the dataframe. We can see that the top 10 subreedits deal with relationships. The other top 10 subreddits deal with seeking advice, community feedback, mental health and some personal opinions.

In [75]:
# First we find the size of each subreddit
subreddit_size = publicIncelDF.groupby('subreddit').size().sort_values(ascending=False)
print(subreddit_size.head(10))
print('\n')
print(subreddit_size.tail(10))

subreddit_size.to_csv('../data/subreddit_sizes.csv', header=['size'], index=True)

subreddit
relationships       28462
sex                 11094
Tinder              10363
datingoverthirty     7723
pussypassdenied      6919
dating_advice        6400
rant                 5324
dating               4736
DeadBedrooms         4135
Advice               2940
dtype: int64


subreddit
Rateme          485
depression      481
virgin          419
Anxiety         392
ADHD            389
lonely          285
mentalhealth    195
amihot          160
Nudes            35
ratemycock        7
dtype: int64


Next, we find the amount of comments each user has made. The resutls show that the highest number of comments made by a users in this search was 228 comments. The lowest possible number of comments made by a user is one.

In [58]:
most_active_users = publicIncelDF.groupby('author').size().sort_values(ascending=False)
print(most_active_users.head(10))
print('\n')
print(most_active_users.tail(10))

author
myexsparamour       228
permanent_staff     145
alittlemouth        134
Jdamoftruth         120
indigo_tortuga      116
PSMF_Canuck         113
NamelessBard        104
AutoModerator        89
anus_dei             87
facinationstreet     81
dtype: int64


author
WaifuMango              1
ExistentialLiberty      1
Wait_Wut_Did_E_Say      1
Wait__No__What          1
WaitingForAFamilyMan    1
Waitingforaline         1
Wakandalady             1
Waketantrum             1
Walbaraa                1
------why------         1
dtype: int64


From the table we see that one of the author has the name 'AutoModerator'. This author is actually a bot that the moderators of a community use to inform users, manage community posts and engages with users. We will remove all 'AutoModerator' bots from the dataset. Additionally, we can also drop the subreddit type column since all of the observations are now 'public'.

In [59]:
publicIncelDF = publicIncelDF[publicIncelDF["author"] != 'AutoModerator']
publicIncelDF = publicIncelDF.drop('subreddit_type', axis=1)

Next we performed some feature engineering. We will

- find the length of each comment per user comment
- find the word frequency of incel words per comment
- find the incel_to_word ratio per user comment length

The word frequency feature is of note as we can use this feature to find the incel word frequency in each subreddit. THe incel word ratio is also a helpful metric for measuring how users comments are dominated by or sparsely used incel language.

In [60]:
publicIncelDF['clean_tokenize'] = publicIncelDF['body'].apply(lambda x: clean_tokenize(x))

In [61]:
publicIncelDF['tokenized_text'] = publicIncelDF['clean_tokenize'].apply(lambda a: sentence_tokenizer(a))

In [62]:
publicIncelDF['comment_length'] = publicIncelDF['body'].apply(word_count)

In [63]:
publicIncelDF['incel_word_freq'] = publicIncelDF['body'].str.count('|'.join(keywordsList))

In [64]:
publicIncelDF['incel_word_ratio'] = publicIncelDF['incel_word_freq'].div(publicIncelDF['comment_length'])

From the describe function we see that the average incel ratio is 0.009

In [65]:
publicIncelDF.describe()

Unnamed: 0,score,comment_length,incel_word_freq,incel_word_ratio
count,109289.0,109289.0,109289.0,109289.0
mean,0.597691,70.870078,2.561749,0.054644
std,4.416717,76.148606,2.629011,0.067756
min,-64.0,1.0,0.0,0.0
25%,-1.0,25.0,1.0,0.025
50%,0.0,49.0,2.0,0.04
75%,2.0,90.0,3.0,0.063063
max,203.0,1967.0,101.0,3.0


For the final steps of our feature engineering, we created two bodys of clean text. The first body was used for tokenization. The second clean text was used for sentiment analysis. THis required us to create two separate text cleaning functions. The two text clean function both convert the text into markdown and parsed using BeautifulSoup. Next both functions then lowercased the text, removed numbers and contractions, and were encoded and decoded from ASCII to ITF-8. The first clean function only removes some special characters from the text while the second clean function removes all punctuations and stopwords.

The clean_tokenize was then used to create a list of tokens of the sentence and appended to the dataframe.

Next we created sentiment flags per comment by using the 'clean_sentiment' text body and append to our dataframe. Additionally, we created a polarity and sentiment_flag columns. Both use the compund value from the polarity_score return value. We stored the compound value into the polarity column while for the sentiment_flag we created a if-else case to produce either a 'pos', 'neu' or 'neg' flag for each comment.

In [66]:
publicIncelDF['clean_sentiment'] = publicIncelDF['body'].apply(lambda y: clean_sentiment(y))

In [67]:
publicIncelDF['polarity'] = publicIncelDF['clean_sentiment'].apply(lambda z: get_polarity(z))

In [68]:
publicIncelDF['sentiment_flag'] = publicIncelDF['clean_sentiment'].apply(lambda b: sentiment_flag(b))

In [69]:
# most_active_users = publicIncelDF.groupby('author').size().sort_values(ascending=False)
# print(most_active_users.head(10))
# print('\n')
# print(most_active_users.tail(10))

# most_active_users.to_csv('../data/most_active_commenters.csv', header=['num_comments'], index=True)

In [70]:
# mean_wordlen_user = publicIncelDF[['author', 'comment_length', 'incel_word_freq']].groupby(['author']).mean()
# print(mean_wordlen_user.head(10))
# mean_wordlen_user.to_csv('../data/mean_user_comment_len.csv', header=['avg_comment_length', 'avg_incel_word_freq'], index=True)

In [71]:
publicIncelDF.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 109289 entries, 0 to 109436
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                109289 non-null  object 
 1   author            109289 non-null  object 
 2   subreddit         109289 non-null  object 
 3   body              109289 non-null  object 
 4   score             109289 non-null  int64  
 5   created_on        109289 non-null  object 
 6   clean_tokenize    109289 non-null  object 
 7   tokenized_text    109289 non-null  object 
 8   comment_length    109289 non-null  int64  
 9   incel_word_freq   109289 non-null  int64  
 10  incel_word_ratio  109289 non-null  float64
 11  clean_sentiment   109289 non-null  object 
 12  polarity          109289 non-null  float64
 13  sentiment_flag    109289 non-null  object 
dtypes: float64(2), int64(3), object(9)
memory usage: 12.5+ MB


In [72]:
mean_score_subreddit = publicIncelDF[['subreddit', 'score', 'comment_length', 'incel_word_freq', 'incel_word_ratio']].groupby(['subreddit']).mean()
print(mean_score_subreddit.head(10))
mean_score_subreddit.to_csv('../data/mean_values_subreddit.csv', header=['avg_comment_score', 'avg_comment_length', 'avg_incel_word_freq', 'avg_incel_word_ratio'], index=True)

                 score  comment_length  incel_word_freq  incel_word_ratio
subreddit                                                                
2meirl4meirl  0.514432       49.298910         2.187941          0.082903
ADHD          0.714653       85.385604         3.020566          0.051423
Advice        0.323129       76.546939         2.643537          0.049899
Anxiety       0.357143       76.349490         2.790816          0.051585
AskWomen      1.082638       67.220243         2.664898          0.058186
Bumble        0.270254       51.786578         2.086457          0.063494
DeadBedrooms  0.808222       88.696977         3.074728          0.046237
MGTOW         0.448579       61.354005         2.423773          0.060063
Nudes        -0.628571       32.942857         1.742857          0.131943
Rateme        0.410309       41.942268         1.593814          0.071477


In [76]:
publicIncelDF.to_csv('../data/EDA_incelDF'+timestr+'.csv', 
                     columns=['created_on', 'subreddit', 'score', 
                              'clean_tokenize', 'tokenized_text',
                              'comment_length', 'incel_word_freq',
                              'incel_word_ratio', 'clean_sentiment',
                              'sentiment_flag', 'polarity'], 
                     index=False)