# Data Preparation

## (Optional) Install Google Cloud Language API for Entity-Sentiment

In [7]:
!pip install google-cloud-language

Collecting google-cloud-language
  Downloading google_cloud_language-2.12.0-py2.py3-none-any.whl (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.8/137.8 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0
  Downloading google_api_core-2.15.0-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.0/122.0 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting proto-plus<2.0.0dev,>=1.22.3
  Downloading proto_plus-1.23.0-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.8/48.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting googleapis-common-protos<2.0.dev0,>=1.56.2
  Downloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl (228 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.7/228.7 kB[0m [31m46.

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import torch
import numpy as np
from scipy.special import softmax

## Data cleaning and sampling of equal number of offensive and non-offensive jokes

In [19]:
df = pd.read_csv('reddit_jokes.csv')
# only keep non-offensive and offensive jokes
df = df[(df["joke_type"] == 0) | (df["joke_type"] == 1)]

df = df[df['selftext'].str.split().str.len() < 30]
df = df[df['title'].str.split().str.len() < 30]

df = df[df['selftext'].str.contains('\n') == False]
df = df[df['title'].str.contains('\n') == False]

df = df[df['selftext'].str.split().str.len() > 4]

df = df[df['selftext'].str.contains('\[.*\]\(.*\)') == False]

def unigram_overlap(row):
    setup = row['title'].split()
    punchline = row['selftext'].split()
    overlap = 0
    for word in setup:
        if word in punchline:
            overlap += 1
    return overlap / len(setup)

def bigram_overlap(row):
    setup = row['title'].split()
    punchline = row['selftext'].split()
    overlap = 0
    for i in range(len(setup) - 1):
        if setup[i] + " " + setup[i + 1] in punchline:
            overlap += 1
    return overlap / len(setup)

def trigram_overlap(row):
    setup = row['title'].split()
    punchline = row['selftext'].split()
    overlap = 0
    for i in range(len(setup) - 2):
        if setup[i] + " " + setup[i + 1] + " " + setup[i + 2] in punchline:
            overlap += 1
    return overlap / len(setup)

df["unigram_overlap"] = df.apply(lambda row: unigram_overlap(row), axis=1)
df["bigram_overlap"] = df.apply(lambda row: bigram_overlap(row), axis=1)
df["trigram_overlap"] = df.apply(lambda row: trigram_overlap(row), axis=1)

df = df[df["unigram_overlap"] < 0.5]
df = df[df["bigram_overlap"] < 0.5]
df = df[df["trigram_overlap"] < 0.5]

minimum_class_size = min(len(df[df["joke_type"] == 0]), len(df[df["joke_type"] == 1]))

# We construct new datafram with equal number of offensive and non-offensive jokes
df = pd.concat([df[df["joke_type"] == 0].sample(minimum_class_size), df[df["joke_type"] == 1].sample(minimum_class_size)])
df = df.drop(['unigram_overlap', 'bigram_overlap', 'trigram_overlap'], axis=1)
df.to_csv('cleaned_data.csv')

## Enhance data with Entropy and Surprisal

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
model = GPT2LMHeadModel.from_pretrained('gpt2-large').cuda()

def calculate_surprisal_entropy(setup, punchline):
    joke_encoding = tokenizer.encode(setup + ' ' + punchline, truncation=True, max_length=1024)
    setup_encoding = tokenizer.encode(setup, truncation=True, max_length=1024)

    diff_encoding = joke_encoding[len(setup_encoding):]
    joke_ids = torch.tensor([joke_encoding]).cuda()
	
    with torch.no_grad():
        outputs = model(joke_ids)
        logits = outputs.logits
        
    start_index = len(setup_encoding)
    entropies = []
    surprisals = []

    for i in range(start_index, len(joke_encoding) - 1):
        token_logits = logits[0, i].cpu().numpy()
        token_probs = softmax(token_logits)
        
        token_entropy = -np.sum(token_probs * np.log(token_probs))
        entropies.append(token_entropy)
        
        token_surprisal = -np.log(token_probs[diff_encoding[i - start_index]])
        surprisals.append(token_surprisal)

    avg_surprisal = np.mean(surprisals)
    avg_entropy = np.mean(entropies)

    return avg_surprisal, avg_entropy

In [5]:
df = pd.read_csv('cleaned_data.csv')
df["surprisal"], df["entropy"] = zip(*df.apply(lambda row: calculate_surprisal_entropy(row["title"], row["selftext"]), axis=1))

df["surprisal"] = (df['surprisal'] - df['surprisal'].mean()) / df['surprisal'].std() 
df["entropy"] = (df['entropy'] - df['entropy'].mean()) / df['entropy'].std() 
df.to_csv('cleaned_data_entropy_surprisal.csv')

## Find Entity-sentiment for each joke using Google API

In [8]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="cred.json"

In [9]:
from google.cloud import language_v1

def analyze_entity_sentiment(text_content):
    client = language_v1.LanguageServiceClient()
    type_ = language_v1.types.Document.Type.PLAIN_TEXT
    language = "en"
    document = {"content": text_content, "type_": type_, "language": language}
    encoding_type = language_v1.EncodingType.UTF8

    response = client.analyze_entity_sentiment(
        request={"document": document, "encoding_type": encoding_type}
    )

    minimum_score = 2
    for entity in response.entities:
        if language_v1.Entity.Type(entity.type_).name in ('PERSON', 'LOCATION', 'ORGANIZATION'):
            if entity.sentiment.score <= minimum_score:
                minimum_entity_name = entity.name
                minimum_score = entity.sentiment.score

    return minimum_score

In [11]:
df = pd.read_csv("cleaned_data_entropy_surprisal.csv")
df["minimum_sentiment_score"] = None

for i in range(len(df)):
    print(i/len(df)*100)
    minimum_score = analyze_entity_sentiment(df["title"].iloc[i] + " " + df["selftext"].iloc[i])
    if minimum_score < 2:
        df["minimum_sentiment_score"].iloc[i] = minimum_score
        
df.to_csv('cleaned_data_entropy_surprisal_sentiment.csv')

0.0
0.01721763085399449


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["minimum_sentiment_score"].iloc[i] = minimum_score


0.03443526170798898
0.05165289256198347
0.06887052341597796
0.08608815426997246
0.10330578512396695
0.12052341597796143
0.13774104683195593
0.15495867768595042
0.17217630853994492
0.1893939393939394
0.2066115702479339
0.2238292011019284
0.24104683195592286
0.2582644628099174
0.27548209366391185
0.2926997245179063
0.30991735537190085
0.3271349862258953
0.34435261707988984
0.3615702479338843
0.3787878787878788
0.3960055096418733
0.4132231404958678
0.43044077134986225
0.4476584022038568
0.46487603305785125
0.4820936639118457
0.49931129476584024
0.5165289256198348
0.5337465564738292
0.5509641873278237
0.5681818181818182
0.5853994490358126
0.6026170798898072
0.6198347107438017
0.6370523415977961
0.6542699724517906
0.6714876033057852
0.6887052341597797
0.7059228650137741
0.7231404958677686
0.7403581267217632
0.7575757575757576
0.7747933884297521
0.7920110192837466
0.809228650137741
0.8264462809917356
0.8436639118457301
0.8608815426997245
0.878099173553719
0.8953168044077136
0.912534435261708