In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

In [2]:
# Wiki page to scrape; the glossary paragraphs are extracted from its HTML below.
reddit_url = "https://www.reddit.com/r/IncelTear/wiki/incel-terminology/#wiki_appendix"

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error status instead of parsing an error page as if it
# were the wiki.
response = requests.get(reddit_url, timeout=30)
response.raise_for_status()
req = response.text  # raw HTML of the page

# Parse the HTML with the standard-library backed parser.
red_soup = BeautifulSoup(req, 'html.parser')

In [3]:
# Select the div that contains the definitions, then collect its <p> tags.
wiki_div = red_soup.find('div', class_='md wiki')
if wiki_div is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the chained .select() call.
    raise ValueError(
        "Could not find the 'md wiki' div in the downloaded page; "
        "the request may have been blocked or the page layout may have changed."
    )
raw_definitions = wiki_div.select("p")

In [4]:
# Discard the first element in place ...
del raw_definitions[0]

# ... then keep only the first 106 of the remaining rows
raw_definitions = raw_definitions[:106]

In [5]:
# Define the preprocessing and sentiment analysis functions
def preprocess_text(text):
    """Normalise a piece of text into a cleaned, space-separated string.

    The text is lower-cased and tokenized with ``word_tokenize``, English
    stop words are dropped, each remaining token is lemmatized with
    ``WordNetLemmatizer``, and the tokens are joined back with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        A string of the lemmatized, non-stop-word tokens separated by spaces
        (an empty string when no tokens survive).
    """
    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())

    # Fetch the stop-word list once per call and keep it in a set: looking it
    # up inside the comprehension would re-evaluate stopwords.words() for
    # every token and do a linear search through the result each time.
    english_stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    return ' '.join(lemmatized_tokens)

# One analyzer instance, created once and shared by every get_sentiment() call
analyzer = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Score ``text`` with the module-level sentiment analyzer.

    Args:
        text: The string to analyse.

    Returns:
        Whatever ``analyzer.polarity_scores(text)`` produces for the text.
    """
    return analyzer.polarity_scores(text)

In [6]:
# Column-oriented accumulator: one list per output column, one entry per term.
glossary = { 'Word': [], 'Definition': [], 'Cleaned Text': [], 'Sentiment': [], 'Score': [] }

# Iterate through the definitions
for tag in raw_definitions:
    # Split on the FIRST colon only, so a definition that itself contains a
    # colon is kept whole instead of being cut off at its second colon.
    word, separator, definition = tag.text.partition(':')
    if not separator:
        # No colon at all: this element is not a "term: definition" entry.
        # Skip it rather than failing with an IndexError.
        continue

    # Store separated strings
    word = word.strip()
    definition = definition.strip()

    # Clean the definition
    cleaned_text = preprocess_text(definition)

    # Get the sentiment polarity
    # NOTE(review): the scores are computed on the stop-word-stripped text;
    # NLTK's English stop-word list includes negations, which may shift the
    # polarity -- confirm that scoring the cleaned text is intended.
    sentiment_scores = get_sentiment(cleaned_text)

    # Infer the sentiment flag and remember which score backs it:
    # the larger of pos/neg wins, and a tie is reported as neutral.
    if sentiment_scores['pos'] > sentiment_scores['neg']:
        sentiment_flag, score_key = 'positive', 'pos'
    elif sentiment_scores['pos'] < sentiment_scores['neg']:
        sentiment_flag, score_key = 'negative', 'neg'
    else:
        sentiment_flag, score_key = 'neutral', 'neu'

    # Finally, store values in dictionary
    glossary['Word'].append(word)
    glossary['Definition'].append(definition)
    glossary["Cleaned Text"].append(cleaned_text)
    glossary['Sentiment'].append(sentiment_flag)
    glossary['Score'].append(sentiment_scores[score_key])
    
    

In [7]:
# Build a DataFrame from the column dictionary and write it out as CSV
# (the row index is left out of the file)
incelDF = pd.DataFrame(glossary)
incelDF.to_csv('./incelDF.csv', index=False)