# CS6140 Project - Detection Of Sarcasm In Text

## 1. Data Preprocessing

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
from gensim.models import Word2Vec
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Run these if not up to date
# nltk.download('stopwords')
# nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /Users/jackist/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jackist/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [2]:
data = pd.read_csv("train-balanced-sarcasm.csv")

In [3]:
data = data[['label', 'comment', 'subreddit', 'score', 'parent_comment']]

In [4]:
data.head(10)

Unnamed: 0,label,comment,subreddit,score,parent_comment
0,0,NC and NH.,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",nfl,3,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...
5,0,"I don't pay attention to her, but as long as s...",AskReddit,0,do you find ariana grande sexy ?
6,0,Trick or treating in general is just weird...,AskReddit,1,What's your weird or unsettling Trick or Treat...
7,0,Blade Mastery+Masamune or GTFO!,FFBraveExvius,2,Probably Sephiroth. I refuse to taint his grea...
8,0,"You don't have to, you have a good build, buy ...",pcmasterrace,1,What to upgrade? I have $500 to spend (mainly ...
9,0,I would love to see him at lolla.,Lollapalooza,2,Probably count Kanye out Since the rest of his...


In [5]:
data.isna().sum()

label              0
comment           53
subreddit          0
score              0
parent_comment     0
dtype: int64

In [6]:
data = data.dropna(axis=0)

In [7]:
data.isna().sum()

label             0
comment           0
subreddit         0
score             0
parent_comment    0
dtype: int64

In [9]:
data['comment_tokens'] = data['comment']
data['comment_tokens'] = data['comment_tokens'].str.lower()
data['comment_tokens'] = data['comment_tokens'].str.replace('[^\w\s]','', regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('can\'t','can not',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('\'d',' would',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('wouldn\'t','would not',regex = True)
data['comment_tokens'] = data['comment_tokens'].str.replace('couldn\'t','could not',regex = True)

In [10]:
data['parent_comment_tokens'] = data['parent_comment']
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.lower()
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('[^\w\s]','', regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('can\'t','can not',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('\'d',' would',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('wouldn\'t','would not',regex = True)
data['parent_comment_tokens'] = data['parent_comment_tokens'].str.replace('couldn\'t','could not',regex = True)

In [12]:
lemma = WordNetLemmatizer()
def apply_lemmatizer(sentence):
    return [lemma.lemmatize(token) for token in wordpunct_tokenize(sentence)]

In [17]:
data["comment_tokens"] = data.comment_tokens.apply(apply_lemmatizer)
data["parent_comment_tokens"] = data.parent_comment_tokens.apply(apply_lemmatizer)

In [22]:
data[["comment", "comment_tokens"]].head(10)

Unnamed: 0,comment,comment_tokens
0,NC and NH.,"[nc, and, nh]"
1,You do know west teams play against west teams...,"[you, do, know, west, team, play, against, wes..."
2,"They were underdogs earlier today, but since G...","[they, were, underdog, earlier, today, but, si..."
3,"This meme isn't funny none of the ""new york ni...","[this, meme, isnt, funny, none, of, the, new, ..."
4,I could use one of those tools.,"[i, could, use, one, of, those, tool]"
5,"I don't pay attention to her, but as long as s...","[i, dont, pay, attention, to, her, but, a, lon..."
6,Trick or treating in general is just weird...,"[trick, or, treating, in, general, is, just, w..."
7,Blade Mastery+Masamune or GTFO!,"[blade, masterymasamune, or, gtfo]"
8,"You don't have to, you have a good build, buy ...","[you, dont, have, to, you, have, a, good, buil..."
9,I would love to see him at lolla.,"[i, would, love, to, see, him, at, lolla]"


In [47]:
data['comment_tokens'].shape

(1010773,)

In [24]:
data[["parent_comment", "parent_comment_tokens"]].head(10)

Unnamed: 0,parent_comment,parent_comment_tokens
0,"Yeah, I get that argument. At this point, I'd ...","[yeah, i, get, that, argument, at, this, point..."
1,The blazers and Mavericks (The wests 5 and 6 s...,"[the, blazer, and, maverick, the, west, 5, and..."
2,They're favored to win.,"[theyre, favored, to, win]"
3,deadass don't kill my buzz,"[deadass, dont, kill, my, buzz]"
4,Yep can confirm I saw the tool they use for th...,"[yep, can, confirm, i, saw, the, tool, they, u..."
5,do you find ariana grande sexy ?,"[do, you, find, ariana, grande, sexy]"
6,What's your weird or unsettling Trick or Treat...,"[whats, your, weird, or, unsettling, trick, or..."
7,Probably Sephiroth. I refuse to taint his grea...,"[probably, sephiroth, i, refuse, to, taint, hi..."
8,What to upgrade? I have $500 to spend (mainly ...,"[what, to, upgrade, i, have, 500, to, spend, m..."
9,Probably count Kanye out Since the rest of his...,"[probably, count, kanye, out, since, the, rest..."


In [48]:
data['parent_comment_tokens'].shape

(1010773,)

In [26]:
# Since comments are already preprocessed and tokenized,
# the vectorizer only needs to take tokens as they are.
tfidfVectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
corpus = pd.concat([data['comment_tokens'], data['parent_comment_tokens']])
tfidfMatrix = tfidfVectorizer.fit_transform(corpus)
vocabulary = tfidfVectorizer.vocabulary_

In [45]:
# the tfidf matrix has 2021546 rows,
# where the 0 ~ 1010772 rows represent original comments in the same order as those in dataset,
# and the 1010773 ~ 2021545 rows represent parent comments in the same order as those in dataset;
# and the tfidf matrix has 395961 columns, each represents a word in the vocabulary.
tfidfMatrix

<2021546x395961 sparse matrix of type '<class 'numpy.float64'>'
	with 29660642 stored elements in Compressed Sparse Row format>

In [50]:
# By using the following dict, we can obtain the index of a given word for querying tfidf matrix.
vocabulary

{'nc': 239750,
 'and': 38319,
 'nh': 242628,
 'you': 392472,
 'do': 111597,
 'know': 197955,
 'west': 381455,
 'team': 345253,
 'play': 270965,
 'against': 32301,
 'more': 232730,
 'than': 348058,
 'east': 119000,
 'right': 298452,
 'they': 349693,
 'were': 381308,
 'underdog': 365194,
 'earlier': 118727,
 'today': 353770,
 'but': 71136,
 'since': 320629,
 'gronks': 157855,
 'announcement': 39401,
 'this': 350282,
 'afternoon': 32143,
 'the': 348530,
 'vega': 372186,
 'line': 208050,
 'ha': 160542,
 'moved': 233943,
 'to': 353586,
 'patriot': 263570,
 '1': 1240,
 'meme': 223710,
 'isnt': 185230,
 'funny': 145320,
 'none': 245545,
 'of': 252047,
 'new': 241934,
 'york': 392386,
 'nigga': 243154,
 'one': 254363,
 'are': 43407,
 'i': 175470,
 'could': 93126,
 'use': 369466,
 'those': 350687,
 'tool': 354629,
 'dont': 112859,
 'pay': 263907,
 'attention': 47264,
 'her': 166956,
 'a': 27194,
 'long': 210547,
 'shes': 316672,
 'legal': 204702,
 'wouldnt': 387174,
 'kick': 195783,
 'out': 257

Once we have learned word vectors, to compute the sentence vector of a comment,
first fetch the corresponding row from the tfidf matrix (if it is the i-th original comment, then
it is the i-th row; if it is the i-th parent comment, then it is the (1010773 + i)-th row),
second for each word in the comment find its corresponding tfidf value from the row,
third compute the average of the word vectors weighted by corresponding tfidf values.

## 2. Models

## 3. Evaluation