# CS6140 Project - Detection Of Sarcasm In Text

## 1. Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
import gensim.downloader as api
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegressionCV

In [2]:
# Run if running notebook with vectors locally for first time. Files added to gitignore
# glove = api.load('glove-wiki-gigaword-300')
# glove.save('glovevectors.bin')

In [3]:
glove = KeyedVectors.load('glovevectors.bin')

In [4]:
# Run these if not up to date
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [5]:
data = pd.read_csv("train-balanced-sarcasm.csv")

In [6]:
data = data[['label', 'comment', 'subreddit', 'score', 'parent_comment']]

In [7]:
data.head(10)

Unnamed: 0,label,comment,subreddit,score,parent_comment
0,0,NC and NH.,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",nfl,3,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...
5,0,"I don't pay attention to her, but as long as s...",AskReddit,0,do you find ariana grande sexy ?
6,0,Trick or treating in general is just weird...,AskReddit,1,What's your weird or unsettling Trick or Treat...
7,0,Blade Mastery+Masamune or GTFO!,FFBraveExvius,2,Probably Sephiroth. I refuse to taint his grea...
8,0,"You don't have to, you have a good build, buy ...",pcmasterrace,1,What to upgrade? I have $500 to spend (mainly ...
9,0,I would love to see him at lolla.,Lollapalooza,2,Probably count Kanye out Since the rest of his...


In [8]:
data.isna().sum()

label              0
comment           53
subreddit          0
score              0
parent_comment     0
dtype: int64

In [9]:
data = data.dropna(axis=0)

In [10]:
data.isna().sum()

label             0
comment           0
subreddit         0
score             0
parent_comment    0
dtype: int64

In [11]:
data['comment_tokens'] = data['comment']
data['comment_tokens'] = data['comment_tokens'].str.lower()
data['comment_tokens'] = data['comment_tokens'].str.replace('can\'t','can not',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('\'d',' would',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('wouldn\'t','would not',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('couldn\'t','could not',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('[^\w\s]','', regex = True)

In [12]:
data['parent_comment_tokens'] = data['parent_comment']
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.lower()
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('can\'t','can not',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('\'d',' would',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('wouldn\'t','would not',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('couldn\'t','could not',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('[^\w\s]','', regex = True)

In [13]:
lemma = WordNetLemmatizer()
def apply_lemmatizer(sentence):
    return [lemma.lemmatize(token) for token in wordpunct_tokenize(sentence)]

In [14]:
data["comment_tokens"] = data.comment_tokens.apply(apply_lemmatizer)
data["parent_comment_tokens"] = data.parent_comment_tokens.apply(apply_lemmatizer)

In [15]:
data[["comment", "comment_tokens"]].head(10)

Unnamed: 0,comment,comment_tokens
0,NC and NH.,"[nc, and, nh]"
1,You do know west teams play against west teams...,"[you, do, know, west, team, play, against, wes..."
2,"They were underdogs earlier today, but since G...","[they, were, underdog, earlier, today, but, si..."
3,"This meme isn't funny none of the ""new york ni...","[this, meme, isnt, funny, none, of, the, new, ..."
4,I could use one of those tools.,"[i, could, use, one, of, those, tool]"
5,"I don't pay attention to her, but as long as s...","[i, dont, pay, attention, to, her, but, a, lon..."
6,Trick or treating in general is just weird...,"[trick, or, treating, in, general, is, just, w..."
7,Blade Mastery+Masamune or GTFO!,"[blade, masterymasamune, or, gtfo]"
8,"You don't have to, you have a good build, buy ...","[you, dont, have, to, you, have, a, good, buil..."
9,I would love to see him at lolla.,"[i, would, love, to, see, him, at, lolla]"


In [16]:
data['comment_tokens'].shape

(1010773,)

In [17]:
data[["parent_comment", "parent_comment_tokens"]].head(10)

Unnamed: 0,parent_comment,parent_comment_tokens
0,"Yeah, I get that argument. At this point, I'd ...","[yeah, i, get, that, argument, at, this, point..."
1,The blazers and Mavericks (The wests 5 and 6 s...,"[the, blazer, and, maverick, the, west, 5, and..."
2,They're favored to win.,"[theyre, favored, to, win]"
3,deadass don't kill my buzz,"[deadass, dont, kill, my, buzz]"
4,Yep can confirm I saw the tool they use for th...,"[yep, can, confirm, i, saw, the, tool, they, u..."
5,do you find ariana grande sexy ?,"[do, you, find, ariana, grande, sexy]"
6,What's your weird or unsettling Trick or Treat...,"[whats, your, weird, or, unsettling, trick, or..."
7,Probably Sephiroth. I refuse to taint his grea...,"[probably, sephiroth, i, refuse, to, taint, hi..."
8,What to upgrade? I have $500 to spend (mainly ...,"[what, to, upgrade, i, have, 500, to, spend, m..."
9,Probably count Kanye out Since the rest of his...,"[probably, count, kanye, out, since, the, rest..."


In [18]:
data['parent_comment_tokens'].shape

(1010773,)

In [19]:
X = data[['comment_tokens', 'parent_comment_tokens']]

In [20]:
X.head(10)

Unnamed: 0,comment_tokens,parent_comment_tokens
0,"[nc, and, nh]","[yeah, i, get, that, argument, at, this, point..."
1,"[you, do, know, west, team, play, against, wes...","[the, blazer, and, maverick, the, west, 5, and..."
2,"[they, were, underdog, earlier, today, but, si...","[theyre, favored, to, win]"
3,"[this, meme, isnt, funny, none, of, the, new, ...","[deadass, dont, kill, my, buzz]"
4,"[i, could, use, one, of, those, tool]","[yep, can, confirm, i, saw, the, tool, they, u..."
5,"[i, dont, pay, attention, to, her, but, a, lon...","[do, you, find, ariana, grande, sexy]"
6,"[trick, or, treating, in, general, is, just, w...","[whats, your, weird, or, unsettling, trick, or..."
7,"[blade, masterymasamune, or, gtfo]","[probably, sephiroth, i, refuse, to, taint, hi..."
8,"[you, dont, have, to, you, have, a, good, buil...","[what, to, upgrade, i, have, 500, to, spend, m..."
9,"[i, would, love, to, see, him, at, lolla]","[probably, count, kanye, out, since, the, rest..."


In [21]:
Y = data[['label']]

In [22]:
Y.tail(10)

Unnamed: 0,label
1010816,1
1010817,1
1010818,1
1010819,1
1010820,1
1010821,1
1010822,1
1010823,1
1010824,1
1010825,1


In [23]:
X_train, X_val, y_train, y_val = train_test_split(X, Y, train_size= 0.6)

In [24]:
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, train_size = 0.5) 

In [25]:
X_train = X_train.reset_index(drop = True)
X_val = X_val.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [26]:
X_train.head(10)

Unnamed: 0,comment_tokens,parent_comment_tokens
0,"[ok, we, may, need, a, riot, forgot, mega, thr...","[riot, forgot, about, bard, chime, spawning, s..."
1,[handtyped],"[yeah, if, you, can, not, figure, out, how, to..."
2,"[so, youre, saying, so, should, abuse, child, ...","[watch, porn, totally, normal, outlet, for, yo..."
3,"[of, course, it, doe]","[funny, it, prompt, more, when, i, walk, than,..."
4,"[havoc, demon, blade, so, fun]","[i, would, take, havoc, over, outlaw, any, day..."
5,"[you, forgot]","[have, you, determined, that, from, your, exte..."
6,"[if, he, a, hetero, male, without, a, vasectom...","[i, think, you, could, have, gone, without, in..."
7,"[just, take, a, boat, im, sure, they, would, l...","[a, plane, ticket, to, australia, omg]"
8,"[but, i, guess, there, no, problem, in, writin...","[that, you, can, be, a, cunt, regardless, of, ..."
9,"[so, what, youre, saying, is, that, in, spite,...","[i, adore, zizek, but, i, think, his, strength..."


In [27]:
len(X_train)

606463

In [28]:
data.shape

(1010773, 7)

In [29]:
# Since comments are already preprocessed and tokenized,
# the vectorizer only needs to take tokens as they are.
tfidfVectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
corpus = pd.concat([X_train['comment_tokens'], X_train['parent_comment_tokens']])
tfidfMatrix = tfidfVectorizer.fit_transform(corpus)
vocabulary = tfidfVectorizer.vocabulary_

In [30]:
tfidfMatrix.shape

(1212926, 284405)

In [31]:
# the tfidf matrix has 2021546 rows,
# where the 0 ~ 606462 rows represent original comments in the same order as those in dataset,
# and the 606463 ~ 1212926 rows represent parent comments in the same order as those in dataset;
# and the tfidf matrix has 395961 columns, each represents a word in the vocabulary.
tfidfMatrix

<1212926x284405 sparse matrix of type '<class 'numpy.float64'>'
	with 17837027 stored elements in Compressed Sparse Row format>

In [32]:
# By using the following dict, we can obtain the index of a given word for querying tfidf matrix.
vocabulary

{'ok': 181271,
 'we': 272360,
 'may': 158055,
 'need': 172153,
 'a': 18980,
 'riot': 214362,
 'forgot': 100715,
 'mega': 159701,
 'thread': 251714,
 'soon': 234710,
 'handtyped': 116402,
 'so': 233427,
 'youre': 282123,
 'saying': 220977,
 'should': 228390,
 'abuse': 20164,
 'child': 57400,
 'can': 52270,
 'have': 117676,
 'more': 166675,
 'porn': 196398,
 'of': 180477,
 'course': 66495,
 'it': 132925,
 'doe': 79844,
 'havoc': 117755,
 'demon': 74673,
 'blade': 43281,
 'fun': 103694,
 'you': 281931,
 'if': 126544,
 'he': 118107,
 'hetero': 119908,
 'male': 154945,
 'without': 276304,
 'vasectomy': 266751,
 'and': 26926,
 'his': 121322,
 'reproductive': 211499,
 'fate': 95368,
 'were': 273545,
 'to': 253605,
 'be': 38799,
 'placed': 193673,
 'in': 128175,
 'the': 250000,
 'hand': 116199,
 'another': 27839,
 'would': 277782,
 'indeed': 128739,
 'woman': 276750,
 'whose': 274891,
 'identity': 126278,
 'at': 32759,
 'this': 251240,
 'point': 195167,
 'is': 132341,
 'arbitrary': 30306,
 'ak

Once we have learned word vectors, to compute the sentence vector of a comment,
first fetch the corresponding row from the tfidf matrix (if it is the i-th original comment, then
it is the i-th row; if it is the i-th parent comment, then it is the (1010773 + i)-th row),
second for each word in the comment find its corresponding tfidf value from the row,
third compute the average of the word vectors weighted by corresponding tfidf values.

In [33]:
tfidfMatrix.shape

(1212926, 284405)

In [34]:
tfidfMatrix[0,249920]

0.0

In [35]:
vocabulary['thats']

249854

In [36]:
tfidfMatrixtest = tfidfVectorizer.transform(pd.concat([X_test['comment_tokens'], X_test['parent_comment_tokens']]))
tfidfMatrixvalid = tfidfVectorizer.transform(pd.concat([X_val['comment_tokens'], X_val['parent_comment_tokens']]))

In [37]:
tfidfMatrixtest.shape

(404310, 284405)

In [38]:
X_test.head(10)

Unnamed: 0,comment_tokens,parent_comment_tokens
0,"[knew, it, fuckin, pc, fag, 247, on, pc, and, ...","[the, only, thing, im, heavily, addicted, to, ..."
1,"[they, say, early, second, or, pile, of, mid, ...","[no, and, the, article, doesnt, say, that, eit..."
2,"[fire, up, the, oven]","[so, what, to, do, eugenics]"
3,[lmao],"[risrael, rpalestine, take, note]"
4,"[rngesus, ha, blessed, you, boi]",[10]
5,"[final, fantasy, ex, sound, so, much, cooler, ...","[i, prefer, the, pronunciation, ex, post, soun..."
6,"[yeah, starbound, is, a, dope, simulator]","[so, the, space, exploration, is, too, realist..."
7,"[yes, because, american, historically, lack, t...","[yeah, but, you, guy, get, vacation, and, trav..."
8,"[it, irrational, and, allows, men, to, be, led...","[nationalism, is, badwhy]"
9,"[so, similar, to, wine, on, linux]","[it, actually, different, an, emulator, is, a,..."


In [39]:
vocabulary['for']

100394

In [40]:
tfidfMatrixtest[0, 100573]

0.0

In [56]:
def generate_dataset(X, tfidfmat):
    comment_featurevec = []
    parent_featurevec = []
    
    for index, sample in X.iterrows():
        
        comment_vec = np.zeros(300)
        parent_comment_vec = np.zeros(300)
        comment_tokens = sample['comment_tokens']
        parent_comment_tokens = sample['parent_comment_tokens']
        counter1 = 0
        
        for token in comment_tokens:
            
            if token in glove and token in vocabulary:
                counter1 += 1
                comment_vec += tfidfmat[index, vocabulary[token]] * glove[token]
        
        counter2 = 0
        for token in parent_comment_tokens:
            
            if token in glove and token in vocabulary:
                counter2 += 1 
                parent_comment_vec += tfidfmat[index+len(X), vocabulary[token]] * glove[token]
        
        if counter1 != 0:
            comment_featurevec.append(comment_vec/counter1)
        else:
            comment_featurevec.append(comment_vec)
        
        if counter2 != 0:
            parent_featurevec.append(parent_comment_vec/counter2)
        else:
            parent_featurevec.append(parent_comment_vec)
            
        
    #print(len(comment_featurevec))
    X['comment_weighted_vec'] = comment_featurevec
    X['parent_weighted_vec'] = parent_featurevec
    new_df = pd.DataFrame()
        
    return X
    
    
    

In [42]:
X_train = generate_dataset(X_train, tfidfMatrix)

ValueError: Length of values (599058) does not match length of index (606463)

In [None]:
X_val = generate_dataset(X_val, tfidfMatrixval)

In [58]:
X_test = generate_dataset(X_test, tfidfMatrixtest)

In [60]:
X_train = X_train[['comment_weighted_vec', 'parent_weighted_vec']]
X_val = X_val[['comment_weighted_vec', 'parent_weighted_vec']]
X_test = X_test[['comment_weighted_vec', 'parent_weighted_vec']]

In [80]:
def featurize(dataset):
    x = pd.DataFrame(dataset['parent_weighted_vec'].tolist(), columns = [i for i in range(300)])
    y = pd.DataFrame(dataset['comment_weighted_vec'].tolist(), columns = [i for i in range(300, 600)])
    return pd.concat([x, y], axis = 1)

In [81]:
X_train = featurize(X_train)
X_val = featurize(X_val)
X_test = featurize(X_test)

In [87]:
clf = LogisticRegressionCV(cv=5, random_state=0).fit(X_train, y_train.values.ravel())

In [88]:
clf.score(X_test, y_test)

0.9

## 2. Models

## 3. Evaluation