# CS6140 Project - Detection Of Sarcasm In Text

## 1. Data Preprocessing

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
import gensim.downloader as api
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression

In [2]:
# Run if running notebook with vectors locally for first time. Files added to gitignore
# glove = api.load('glove-wiki-gigaword-300')
# glove.save('glovevectors.bin')

In [3]:
glove = KeyedVectors.load('glovevectors.bin')

In [4]:
# Run these if not up to date
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [5]:
data = pd.read_csv("train-balanced-sarcasm.csv")

In [6]:
data = data[['label', 'comment', 'subreddit', 'score', 'parent_comment']]

In [7]:
data.head(10)

Unnamed: 0,label,comment,subreddit,score,parent_comment
0,0,NC and NH.,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",nfl,3,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...
5,0,"I don't pay attention to her, but as long as s...",AskReddit,0,do you find ariana grande sexy ?
6,0,Trick or treating in general is just weird...,AskReddit,1,What's your weird or unsettling Trick or Treat...
7,0,Blade Mastery+Masamune or GTFO!,FFBraveExvius,2,Probably Sephiroth. I refuse to taint his grea...
8,0,"You don't have to, you have a good build, buy ...",pcmasterrace,1,What to upgrade? I have $500 to spend (mainly ...
9,0,I would love to see him at lolla.,Lollapalooza,2,Probably count Kanye out Since the rest of his...


In [8]:
data.isna().sum()

label              0
comment           53
subreddit          0
score              0
parent_comment     0
dtype: int64

In [9]:
data = data.dropna(axis=0)

In [10]:
data.isna().sum()

label             0
comment           0
subreddit         0
score             0
parent_comment    0
dtype: int64

In [11]:
# Normalise the reply text before tokenisation: lower-case it, expand a few
# common contractions, then drop every character that is neither a word
# character nor whitespace.
data['comment_tokens'] = (
    data['comment']
    .str.lower()
    .str.replace("can't", 'can not', regex=True)
    .str.replace("'d", ' would', regex=True)
    .str.replace("wouldn't", 'would not', regex=True)
    .str.replace("couldn't", 'could not', regex=True)
    # Raw string on purpose: '\w' and '\s' are not valid Python string escapes,
    # so the non-raw literal raises an invalid-escape warning on newer Pythons.
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [12]:
# Apply the same normalisation to the parent comment: lower-case, expand a few
# common contractions, then drop every character that is neither a word
# character nor whitespace.
data['parent_comment_tokens'] = (
    data['parent_comment']
    .str.lower()
    .str.replace("can't", 'can not', regex=True)
    .str.replace("'d", ' would', regex=True)
    .str.replace("wouldn't", 'would not', regex=True)
    .str.replace("couldn't", 'could not', regex=True)
    # Raw string on purpose: '\w' and '\s' are not valid Python string escapes,
    # so the non-raw literal raises an invalid-escape warning on newer Pythons.
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [13]:
lemma = WordNetLemmatizer()
def apply_lemmatizer(sentence):
    """Split `sentence` with `wordpunct_tokenize` and lemmatize each token.

    Returns a list with one lemma per token, in token order. The module-level
    `lemma` lemmatizer is called without a part-of-speech argument.
    NOTE(review): the displayed outputs contain lemmas such as 'wa' and 'a'
    (apparently from "was" / "as") -- confirm this is acceptable downstream.
    """
    words = wordpunct_tokenize(sentence)
    return list(map(lemma.lemmatize, words))

In [14]:
data["comment_tokens"] = data.comment_tokens.apply(apply_lemmatizer)
data["parent_comment_tokens"] = data.parent_comment_tokens.apply(apply_lemmatizer)

In [15]:
data[["comment", "comment_tokens"]].head(10)

Unnamed: 0,comment,comment_tokens
0,NC and NH.,"[nc, and, nh]"
1,You do know west teams play against west teams...,"[you, do, know, west, team, play, against, wes..."
2,"They were underdogs earlier today, but since G...","[they, were, underdog, earlier, today, but, si..."
3,"This meme isn't funny none of the ""new york ni...","[this, meme, isnt, funny, none, of, the, new, ..."
4,I could use one of those tools.,"[i, could, use, one, of, those, tool]"
5,"I don't pay attention to her, but as long as s...","[i, dont, pay, attention, to, her, but, a, lon..."
6,Trick or treating in general is just weird...,"[trick, or, treating, in, general, is, just, w..."
7,Blade Mastery+Masamune or GTFO!,"[blade, masterymasamune, or, gtfo]"
8,"You don't have to, you have a good build, buy ...","[you, dont, have, to, you, have, a, good, buil..."
9,I would love to see him at lolla.,"[i, would, love, to, see, him, at, lolla]"


In [16]:
data['comment_tokens'].shape

(1010773,)

In [17]:
data[["parent_comment", "parent_comment_tokens"]].head(10)

Unnamed: 0,parent_comment,parent_comment_tokens
0,"Yeah, I get that argument. At this point, I'd ...","[yeah, i, get, that, argument, at, this, point..."
1,The blazers and Mavericks (The wests 5 and 6 s...,"[the, blazer, and, maverick, the, west, 5, and..."
2,They're favored to win.,"[theyre, favored, to, win]"
3,deadass don't kill my buzz,"[deadass, dont, kill, my, buzz]"
4,Yep can confirm I saw the tool they use for th...,"[yep, can, confirm, i, saw, the, tool, they, u..."
5,do you find ariana grande sexy ?,"[do, you, find, ariana, grande, sexy]"
6,What's your weird or unsettling Trick or Treat...,"[whats, your, weird, or, unsettling, trick, or..."
7,Probably Sephiroth. I refuse to taint his grea...,"[probably, sephiroth, i, refuse, to, taint, hi..."
8,What to upgrade? I have $500 to spend (mainly ...,"[what, to, upgrade, i, have, 500, to, spend, m..."
9,Probably count Kanye out Since the rest of his...,"[probably, count, kanye, out, since, the, rest..."


In [18]:
data['parent_comment_tokens'].shape

(1010773,)

In [19]:
X = data[['comment_tokens', 'parent_comment_tokens']]

In [20]:
X.head(10)

Unnamed: 0,comment_tokens,parent_comment_tokens
0,"[nc, and, nh]","[yeah, i, get, that, argument, at, this, point..."
1,"[you, do, know, west, team, play, against, wes...","[the, blazer, and, maverick, the, west, 5, and..."
2,"[they, were, underdog, earlier, today, but, si...","[theyre, favored, to, win]"
3,"[this, meme, isnt, funny, none, of, the, new, ...","[deadass, dont, kill, my, buzz]"
4,"[i, could, use, one, of, those, tool]","[yep, can, confirm, i, saw, the, tool, they, u..."
5,"[i, dont, pay, attention, to, her, but, a, lon...","[do, you, find, ariana, grande, sexy]"
6,"[trick, or, treating, in, general, is, just, w...","[whats, your, weird, or, unsettling, trick, or..."
7,"[blade, masterymasamune, or, gtfo]","[probably, sephiroth, i, refuse, to, taint, hi..."
8,"[you, dont, have, to, you, have, a, good, buil...","[what, to, upgrade, i, have, 500, to, spend, m..."
9,"[i, would, love, to, see, him, at, lolla]","[probably, count, kanye, out, since, the, rest..."


In [21]:
Y = data[['label']]

In [22]:
Y.tail(10)

Unnamed: 0,label
1010816,1
1010817,1
1010818,1
1010819,1
1010820,1
1010821,1
1010822,1
1010823,1
1010824,1
1010825,1


In [23]:
# 60% of the rows go to training; the remaining 40% are split again in the next cell.
# The seed is fixed so that Restart & Run All reproduces the same split
# (and therefore the same tf-idf vocabulary and scores).
X_train, X_val, y_train, y_val = train_test_split(X, Y, train_size=0.6, random_state=42)

In [24]:
# Halve the held-out rows into validation and test sets.
# The seed is fixed so the split is reproducible across kernel restarts.
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, train_size=0.5, random_state=42)

In [25]:
X_train = X_train.reset_index(drop = True)
X_val = X_val.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)

In [26]:
X_train.head(10)

Unnamed: 0,comment_tokens,parent_comment_tokens
0,"[it, not, even, underclocking, itself, i, have...","[this, intel, like, to, almost, tie, that, fan..."
1,"[just, another, example, of, the, most, transp...","[court, order, new, clinton, email, production..."
2,"[only, online, poll, are, accurate, because, t...","[which, poll, do, you, personally, trust, just..."
3,"[lol, my, good, friend, picked, the, jet, to, ...","[well, the, duck, won, but, i, got, 13, point,..."
4,"[she, look, very, friendly]","[what, wa, your, reaction, when, you, first, s..."
5,"[i, know, where, im, going, next, year]","[drunk, in, britain]"
6,"[nah, man, just, try, it, once, or, twice, you...","[this, may, sound, stupid, but, is, there, a, ..."
7,"[why, think, when, you, can, hate]","[where, is, the, evidence, of, what, youre, sa..."
8,"[dont, forget, the, hilarious, back, to, the, ...","[having, i, didnt, know, girl, played, dota, s..."
9,"[kill, mean, nothing]","[tbh, he, also, had, the, most, kill, in, the,..."


In [27]:
len(X_train)

606463

In [28]:
data.shape

(1010773, 7)

In [29]:
# Since comments are already preprocessed and tokenized,
# the vectorizer only needs to take tokens as they are.
# The analyzer is an identity function: each document is already a token list.
tfidfVectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
# Stack the training comments first and their parent comments second, so row i
# is the i-th comment and row len(X_train) + i is the i-th parent comment.
# generate_dataset depends on exactly this layout.
corpus = pd.concat([X_train['comment_tokens'], X_train['parent_comment_tokens']])
# The vectorizer is fitted on the training split only.
tfidfMatrix = tfidfVectorizer.fit_transform(corpus)
# Mapping from token to its column index in the tf-idf matrix.
vocabulary = tfidfVectorizer.vocabulary_

In [30]:
tfidfMatrix.shape

(1212926, 284337)

In [31]:
# In this run the tfidf matrix has 1212926 rows (2 * len(X_train)),
# where rows 0 ~ 606462 represent the training comments in the same order as in X_train,
# and rows 606463 ~ 1212925 represent their parent comments in the same order;
# the matrix has 284337 columns, each of which represents a word in the vocabulary.
tfidfMatrix

<1212926x284337 sparse matrix of type '<class 'numpy.float64'>'
	with 17817013 stored elements in Compressed Sparse Row format>

In [32]:
# By using the following dict, we can obtain the index of a given word for querying tfidf matrix.
vocabulary

{'it': 133232,
 'not': 177762,
 'even': 91109,
 'underclocking': 261790,
 'itself': 133568,
 'i': 125850,
 'have': 117936,
 'both': 46302,
 'ai': 23356,
 'suite': 242569,
 '3': 9294,
 'and': 26947,
 'hwinfo64': 125325,
 'reporting': 211383,
 'temp': 248371,
 'avg': 34571,
 '42c': 12064,
 'fan': 94635,
 'spinning': 236598,
 'at': 32830,
 'all': 24699,
 'pc': 189216,
 'complete': 62982,
 'idle': 126654,
 'just': 137592,
 'another': 27791,
 'example': 91779,
 'of': 180500,
 'the': 249849,
 'most': 167287,
 'transparent': 256029,
 'politician': 195666,
 'in': 128446,
 'history': 121662,
 'only': 182514,
 'online': 182481,
 'poll': 195742,
 'are': 30569,
 'accurate': 20616,
 'because': 39270,
 'they': 250644,
 'get': 107955,
 'pol': 195465,
 'seal': 223047,
 'approval': 29957,
 'lol': 150695,
 'my': 169968,
 'good': 110793,
 'friend': 102480,
 'picked': 192439,
 'jet': 135578,
 'to': 253454,
 'win': 275632,
 'cup': 69476,
 'his': 121581,
 'bracket': 46904,
 'challenge': 55927,
 'oops': 1828

Once we have learned word vectors, to compute the sentence vector of a comment,
first fetch the corresponding row from the tfidf matrix (if it is the i-th original comment, then
it is the i-th row; if it is the i-th parent comment, then it is the (N + i)-th row, where N is the number of comments in the split the matrix was built from, e.g. 606463 for the training set),
second, for each word in the comment, find its corresponding tfidf value in that row,
third, compute the average of the word vectors weighted by the corresponding tfidf values.

In [33]:
tfidfMatrix.shape

(1212926, 284337)

In [34]:
tfidfMatrix[0,249920]

0.0

In [35]:
vocabulary['thats']

249707

In [36]:
# Only transform (no refit) the held-out splits with the already-fitted vectorizer.
# Each matrix stacks the split's comments first and its parent comments second,
# matching the layout of the training matrix.
tfidfMatrixtest = tfidfVectorizer.transform(pd.concat([X_test['comment_tokens'], X_test['parent_comment_tokens']]))
tfidfMatrixvalid = tfidfVectorizer.transform(pd.concat([X_val['comment_tokens'], X_val['parent_comment_tokens']]))

In [37]:
tfidfMatrixtest.shape

(404310, 284337)

In [38]:
X_test.head(10)

Unnamed: 0,comment_tokens,parent_comment_tokens
0,"[good, luck, friend, u, deserve, it, lt3]","[very, exciting, stuff, we, will, do, our, bes..."
1,"[oh, man, what, a, great, video, that, wa, from]","[staaaap, stahhhpstaaaaahp, why, would, you, d..."
2,"[something, something, private, entity, bullsh...","[the, democrat, arent, even, pretending, to, b..."
3,"[so, accurate, i, can, now, hear, it, in, my, ...","[more, like, ah, ah, ah, ahhh]"
4,"[some, of, you, guy, are, making, me, feel, a,...","[gamers, what, is, the, video, game, you, have..."
5,"[what, if, it, just, a, big, conspiracy, to, b...","[well, the, review, are, starting, to, post, a..."
6,[burr],"[this, tape, is, so, cold, got, me, feeling, l..."
7,[probably],"[well, we, know, it, more, than, 11, million, ..."
8,"[closing, shop, on, the, very, day, when, peop...","[why, arent, any, of, the, shop, open, on, box..."
9,"[thats, how, all, woman, are, before, they, se...","[this, will, get, buried, but, i, have, to, ge..."


In [39]:
vocabulary['for']

100482

In [40]:
tfidfMatrixtest[0, 100573]

0.0

In [41]:
def generate_dataset(X, tfidfmat, vocab=None, vectors=None, vector_size=300):
    """Attach tf-idf weighted sentence vectors for comments and parent comments.

    For every row of `X`, each token that is present in both `vectors` and
    `vocab` contributes ``tfidf * word_vector``; the sum is divided by the
    number of contributing token occurrences. A text with no contributing
    token gets an all-zero vector.

    Parameters
    ----------
    X : pandas.DataFrame
        Must have the columns 'comment_tokens' and 'parent_comment_tokens',
        each holding a list of tokens per row.
    tfidfmat : 2-D matrix indexable as ``tfidfmat[row, col]``
        Rows ``0 .. len(X)-1`` must correspond to the comments of `X` (in row
        order) and rows ``len(X) .. 2*len(X)-1`` to the parent comments.
    vocab : mapping token -> column index, optional
        Defaults to the notebook-level `vocabulary`.
    vectors : mapping token -> 1-D array, optional
        Must support ``token in vectors`` and ``vectors[token]``.
        Defaults to the notebook-level `glove`.
    vector_size : int, optional
        Length of the word vectors (300 by default).

    Returns
    -------
    pandas.DataFrame
        The same `X` object, with the columns 'comment_weighted_vec' and
        'parent_weighted_vec' added in place.
    """
    if vocab is None:
        vocab = vocabulary
    if vectors is None:
        vectors = glove

    n_rows = len(X)

    def _weighted_mean(tokens, row):
        # Sum tfidf-scaled word vectors over usable tokens, then average.
        total = np.zeros(vector_size)
        matched = 0
        for token in tokens:
            if token in vectors and token in vocab:
                matched += 1
                total += tfidfmat[row, vocab[token]] * vectors[token]
        return total / matched if matched else total

    comment_featurevec = []
    parent_featurevec = []

    # Use the row *position*, not the DataFrame index label, to address the
    # tf-idf matrix: the labels only coincide with positions after
    # reset_index(drop=True), and a shuffled index would otherwise silently
    # pick the wrong rows (or run out of bounds).
    token_pairs = zip(X['comment_tokens'], X['parent_comment_tokens'])
    for position, (comment_tokens, parent_comment_tokens) in enumerate(token_pairs):
        comment_featurevec.append(_weighted_mean(comment_tokens, position))
        # Parent comments are stacked below the comments in the matrix.
        parent_featurevec.append(_weighted_mean(parent_comment_tokens, position + n_rows))

    X['comment_weighted_vec'] = comment_featurevec
    X['parent_weighted_vec'] = parent_featurevec

    return X
    
    
    

In [42]:
X_train = generate_dataset(X_train, tfidfMatrix)

In [44]:
X_val = generate_dataset(X_val, tfidfMatrixvalid)

In [45]:
X_test = generate_dataset(X_test, tfidfMatrixtest)

In [46]:
X_train = X_train[['comment_weighted_vec', 'parent_weighted_vec']]
X_val = X_val[['comment_weighted_vec', 'parent_weighted_vec']]
X_test = X_test[['comment_weighted_vec', 'parent_weighted_vec']]

In [47]:
def featurize(dataset, vector_size=300):
    """Expand the two sentence-vector columns into one flat feature table.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must have the columns 'parent_weighted_vec' and 'comment_weighted_vec',
        each holding one length-`vector_size` vector per row.
    vector_size : int, optional
        Length of each sentence vector (300 by default).

    Returns
    -------
    pandas.DataFrame
        ``2 * vector_size`` numeric columns: columns ``0 .. vector_size-1``
        hold the parent-comment vector and columns
        ``vector_size .. 2*vector_size-1`` hold the comment vector. The result
        has a fresh 0..n-1 row index.
    """
    parent_features = pd.DataFrame(
        dataset['parent_weighted_vec'].tolist(),
        columns=list(range(vector_size)),
    )
    comment_features = pd.DataFrame(
        dataset['comment_weighted_vec'].tolist(),
        columns=list(range(vector_size, 2 * vector_size)),
    )
    return pd.concat([parent_features, comment_features], axis=1)

In [48]:
X_train = featurize(X_train)
X_val = featurize(X_val)
X_test = featurize(X_test)

In [53]:
clf = LogisticRegression(max_iter = 500).fit(X_train, y_train.values.ravel())

In [54]:
clf.score(X_test, y_test)

0.5953797828399001

In [57]:
clf.score(X_val, y_val)

0.5942618287947367

## 2. Models

## 3. Evaluation