# CS6140 Project - Detection Of Sarcasm In Text

## 1. Data Preprocessing

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
from gensim.models import Word2Vec

In [65]:
# One-time setup: uncomment and run these lines if the NLTK resources are missing or out of date
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [66]:
# Load the raw sarcasm dataset from the working directory.
DATA_PATH = "train-balanced-sarcasm.csv"
data = pd.read_csv(DATA_PATH)

In [67]:
# Keep only the columns used in the rest of the notebook.
# The explicit .copy() makes `data` an independent frame rather than a
# selection taken from the raw frame, so the columns added in later cells
# are written without pandas raising SettingWithCopyWarning.
data = data[['label', 'comment', 'subreddit', 'score', 'parent_comment']].copy()

In [68]:
# Preview the first ten rows of the selected columns.
data.head(n=10)

Unnamed: 0,label,comment,subreddit,score,parent_comment
0,0,NC and NH.,politics,2,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",nfl,3,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,deadass don't kill my buzz
4,0,I could use one of those tools.,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...
5,0,"I don't pay attention to her, but as long as s...",AskReddit,0,do you find ariana grande sexy ?
6,0,Trick or treating in general is just weird...,AskReddit,1,What's your weird or unsettling Trick or Treat...
7,0,Blade Mastery+Masamune or GTFO!,FFBraveExvius,2,Probably Sephiroth. I refuse to taint his grea...
8,0,"You don't have to, you have a good build, buy ...",pcmasterrace,1,What to upgrade? I have $500 to spend (mainly ...
9,0,I would love to see him at lolla.,Lollapalooza,2,Probably count Kanye out Since the rest of his...


In [69]:
# Count missing values per column.
data.isna().sum(axis=0)

label              0
comment           53
subreddit          0
score              0
parent_comment     0
dtype: int64

In [70]:
# Build one text field per row: the parent comment followed by the reply.
# A row with a missing comment yields a missing `combined` value as well
# (NaN propagates through the concatenation); the dropna in the next cell
# removes those rows.
parent_text = data["parent_comment"]
reply_text = data["comment"]
data["combined"] = parent_text + " " + reply_text

In [71]:
# Drop every row that has a missing value in any column.
data = data.dropna(axis="index", how="any")

In [72]:
# Re-check the per-column missing-value counts after dropping rows.
data.isna().sum(axis=0)

label             0
comment           0
subreddit         0
score             0
parent_comment    0
combined          0
dtype: int64

In [73]:
# Preview the first ten rows, now including the `combined` column.
data.head(n=10)

Unnamed: 0,label,comment,subreddit,score,parent_comment,combined
0,0,NC and NH.,politics,2,"Yeah, I get that argument. At this point, I'd ...","Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,nba,-4,The blazers and Mavericks (The wests 5 and 6 s...,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",nfl,3,They're favored to win.,They're favored to win. They were underdogs ea...
3,0,"This meme isn't funny none of the ""new york ni...",BlackPeopleTwitter,-8,deadass don't kill my buzz,deadass don't kill my buzz This meme isn't fun...
4,0,I could use one of those tools.,MaddenUltimateTeam,6,Yep can confirm I saw the tool they use for th...,Yep can confirm I saw the tool they use for th...
5,0,"I don't pay attention to her, but as long as s...",AskReddit,0,do you find ariana grande sexy ?,do you find ariana grande sexy ? I don't pay a...
6,0,Trick or treating in general is just weird...,AskReddit,1,What's your weird or unsettling Trick or Treat...,What's your weird or unsettling Trick or Treat...
7,0,Blade Mastery+Masamune or GTFO!,FFBraveExvius,2,Probably Sephiroth. I refuse to taint his grea...,Probably Sephiroth. I refuse to taint his grea...
8,0,"You don't have to, you have a good build, buy ...",pcmasterrace,1,What to upgrade? I have $500 to spend (mainly ...,What to upgrade? I have $500 to spend (mainly ...
9,0,I would love to see him at lolla.,Lollapalooza,2,Probably count Kanye out Since the rest of his...,Probably count Kanye out Since the rest of his...


In [74]:
# Normalise the combined text: lowercase it, expand a handful of common
# contractions, then strip everything that is not a word character or
# whitespace.
#
# The contractions are applied in the order listed, and all of them must run
# before the final punctuation strip, because that step removes the
# apostrophes the contraction patterns match on.
CONTRACTION_PATTERNS = {
    r"can't": "can not",
    r"'d": " would",
    r"'re": " are",
    r"wouldn't": "would not",
    r"couldn't": "could not",
}

combined_text = data['combined'].str.lower()
for pattern, expansion in CONTRACTION_PATTERNS.items():
    combined_text = combined_text.str.replace(pattern, expansion, regex=True)

# Raw string literal: '\w' and '\s' are regex classes, not valid Python string
# escapes, so a plain '...' literal triggers an invalid-escape warning.
data['combined'] = combined_text.str.replace(r'[^\w\s]', '', regex=True)


In [75]:
# Single shared lemmatizer instance; apply_lemmatizer below calls its
# lemmatize() method on each token.
lemma = WordNetLemmatizer()

In [76]:
def apply_lemmatizer(sentence):
    """Tokenize ``sentence`` and lemmatize each token.

    The text is split with ``wordpunct_tokenize`` and every resulting token
    is passed to the notebook-level ``lemma`` lemmatizer.

    Args:
        sentence: Text to tokenize and lemmatize.

    Returns:
        list: The lemmatized tokens, in the order they appear in ``sentence``.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument.
    The sample output recorded in this notebook contains tokens such as 'ha'
    and 'a', which look like 'has' / 'as' with a trailing 's' removed --
    consider POS-aware lemmatization if that matters downstream.
    """
    lemmatized = []
    for word in wordpunct_tokenize(sentence):
        lemmatized.append(lemma.lemmatize(word))
    return lemmatized

# Store the lemmatized token list for every normalised text in a new column.
data["Clean_tokens"] = data["combined"].apply(apply_lemmatizer)

In [79]:
# Inspect the token list of the row labelled 0.
data.loc[0, "Clean_tokens"]

['yeah',
 'i',
 'get',
 'that',
 'argument',
 'at',
 'this',
 'point',
 'id',
 'prefer',
 'is',
 'she',
 'lived',
 'in',
 'nc',
 'a',
 'well',
 'nc',
 'and',
 'nh']

In [80]:
# Plain Python list of token lists, one entry per remaining row.
tokenized_sentences = data['Clean_tokens'].to_list()

In [81]:
# Show the first three tokenized sentences.
tokenized_sentences[:3]

[['yeah',
  'i',
  'get',
  'that',
  'argument',
  'at',
  'this',
  'point',
  'id',
  'prefer',
  'is',
  'she',
  'lived',
  'in',
  'nc',
  'a',
  'well',
  'nc',
  'and',
  'nh'],
 ['the',
  'blazer',
  'and',
  'maverick',
  'the',
  'west',
  '5',
  'and',
  '6',
  'seed',
  'did',
  'not',
  'even',
  'carry',
  'a',
  'good',
  'enough',
  'record',
  'to',
  'make',
  'the',
  'playoff',
  'in',
  'the',
  'east',
  'last',
  'year',
  'you',
  'do',
  'know',
  'west',
  'team',
  'play',
  'against',
  'west',
  'team',
  'more',
  'than',
  'east',
  'team',
  'right'],
 ['theyre',
  'favored',
  'to',
  'win',
  'they',
  'were',
  'underdog',
  'earlier',
  'today',
  'but',
  'since',
  'gronks',
  'announcement',
  'this',
  'afternoon',
  'the',
  'vega',
  'line',
  'ha',
  'moved',
  'to',
  'patriot',
  '1']]

## 2. Models

## 3. Evaluation