# SVM & Random Forest
## Sarcasm Detection

# Installation & Download

In [None]:
!wget https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
!dpkg -i cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
!ls /var/cuda-repo-9-0-local | grep .pub
!apt-key add /var/cuda-repo-9-0-local/7fa2af80.pub
!apt-get update
!sudo apt-get install cuda-9.0
!nvcc --version

--2022-05-31 04:12:02--  https://developer.nvidia.com/compute/cuda/9.0/Prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
Resolving developer.nvidia.com (developer.nvidia.com)... 152.195.19.142
Connecting to developer.nvidia.com (developer.nvidia.com)|152.195.19.142|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://developer.nvidia.com/compute/cuda/9.0/prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb [following]
--2022-05-31 04:12:03--  https://developer.nvidia.com/compute/cuda/9.0/prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64-deb
Reusing existing connection to developer.nvidia.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://developer.download.nvidia.com/compute/cuda/9.0/secure/Prod/local_installers/cuda-repo-ubuntu1704-9-0-local_9.0.176-1_amd64.deb?_jlXrqrMOnBCDqzrCRrK_pLs5mavg2FUwzCzTIOZRozOGJD8KdsOUWsnJidkbWep_83NMR-24ZGkWgG-mdCV9UQRfLtPX

In [None]:
# thundersvm supplies the SVC estimator used in the SVM section of this notebook.
!pip install thundersvm
# beautifulsoup4 supplies bs4.BeautifulSoup, used by clean_data to strip HTML markup.
!pip install beautifulsoup4

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting thundersvm
  Downloading thundersvm-0.3.12-py3-none-any.whl (507 kB)
[K     |████████████████████████████████| 507 kB 7.3 MB/s 
Installing collected packages: thundersvm
Successfully installed thundersvm-0.3.12


In [None]:
# Detect whether the notebook is running inside Google Colab.
try:
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  # Only a failed import means "not on Colab"; any other exception
  # (including KeyboardInterrupt) is no longer swallowed.
  IN_COLAB = False

In [None]:
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import tqdm
import torch
import random
from sklearn import model_selection, feature_extraction
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Source https://nlp.cs.princeton.edu/SARC/
!wget -nc https://nlp.cs.princeton.edu/SARC/0.0/key.csv
!wget -nc https://nlp.cs.princeton.edu/SARC/0.0/main/test-balanced.csv.bz2
!wget -nc https://nlp.cs.princeton.edu/SARC/0.0/main/train-balanced.csv.bz2
!bzip2 -dk test-balanced.csv.bz2
!bzip2 -dk train-balanced.csv.bz2

--2022-05-25 14:13:47--  https://nlp.cs.princeton.edu/SARC/0.0/key.csv
Resolving nlp.cs.princeton.edu (nlp.cs.princeton.edu)... 128.112.136.61
Connecting to nlp.cs.princeton.edu (nlp.cs.princeton.edu)|128.112.136.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80 [text/csv]
Saving to: ‘key.csv’


2022-05-25 14:13:47 (7.22 MB/s) - ‘key.csv’ saved [80/80]

--2022-05-25 14:13:47--  https://nlp.cs.princeton.edu/SARC/0.0/main/test-balanced.csv.bz2
Resolving nlp.cs.princeton.edu (nlp.cs.princeton.edu)... 128.112.136.61
Connecting to nlp.cs.princeton.edu (nlp.cs.princeton.edu)|128.112.136.61|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20261578 (19M) [application/x-bzip2]
Saving to: ‘test-balanced.csv.bz2’


2022-05-25 14:13:48 (45.9 MB/s) - ‘test-balanced.csv.bz2’ saved [20261578/20261578]

--2022-05-25 14:13:48--  https://nlp.cs.princeton.edu/SARC/0.0/main/train-balanced.csv.bz2
Resolving nlp.cs.princeton.edu (nlp.cs.princeton.edu)..

# Load Data Set
Citation: https://medium.com/@therpsvishal/sarcasm-detection-on-reddit-data-4b399df855ad

In [None]:
# Load the balanced train and test splits.
# The column names are taken from the header row of key.csv and applied to both
# tab-separated data files.
header_names = list(pd.read_csv('key.csv', sep='\t').columns)
train, test = (
    pd.read_csv(csv_path, sep='\t', names=header_names)
    for csv_path in ('train-balanced.csv', 'test-balanced.csv')
)

In [None]:
# Attach to every row the concatenation of all comments written by that row's author.

# Force author and comment to plain strings so grouping and joining cannot hit non-text values.
train = train.astype({'author': str, 'comment': str})

# One space-separated string per author.
train_group = train.groupby('author')['comment'].agg(lambda parts: ' '.join(parts))

# Single-column frame (indexed by author) holding the joined text.
new_col = train_group.to_frame(name='joined_comment')

# Left merge keeps every original row and adds the author's joined_comment.
train_df = train.merge(new_col, how='left', on='author')

In [None]:
# Number of leading rows that are vectorized and later passed to the classifiers.
limit = 50000

# train_df has the following labels: [label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,joined_comment]
# train_opt_df keeps just two of them: the joined_comment text and its label.
train_opt_df = train_df[['joined_comment', 'label']].copy()

# Fit TF-IDF on the first `limit` joined comments and keep the resulting matrix.
vectorizer = feature_extraction.text.TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_opt_df['joined_comment'].iloc[:limit])

train_opt_df

Unnamed: 0,joined_comment,label
0,NC and NH. Very surprising!,0
1,You do know west teams play against west teams...,0
2,"They were underdogs earlier today, but since G...",0
3,"This meme isn't funny none of the ""new york ni...",0
4,I could use one of those tools. Jimmy Graham p...,0
...,...,...
29995,*speckled But there's futuristic robot surgica...,0
29996,There will be jobs........building a wall. We ...,1
29997,lol.. no. We are talking about Bone7 tho...,0
29998,"Yes, but it can get better too. Because everyo...",0


# Pre-processing

In [None]:
from bs4 import BeautifulSoup
import re
import string
from collections import Counter

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS

In [None]:
# Tokens to discard during cleaning: NLTK's English stopwords plus every
# character in string.punctuation.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english')).union(punctuation)

In [None]:
"""
def preprocess(row):
    new_text = []
    text = row['comment']
    for t in text or []:
      if t is not None:
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return {**row, 'comment':" ".join(new_text)}

def cleanup(d_set):
  return d_set.map(preprocess).remove_columns(['author','subreddit', 'score', 'ups', 'downs', 'date', 'created_utc'])
"""
def clean_data(text):
  """Reduce one comment string to its lowercase, purely alphabetic, non-stopword tokens.

  Processing order: strip HTML markup with BeautifulSoup, delete bracketed
  spans (``[...]``), delete ``http...`` URLs, then filter the
  whitespace-separated tokens against the module-level ``stop`` set.

  Args:
    text: raw comment text.

  Returns:
    The surviving tokens, lowercased and joined with single spaces.
  """
  plain_text = BeautifulSoup(text, "html.parser").get_text()
  # Raw string: '\[' inside a normal string literal is an invalid escape sequence.
  without_brackets = re.sub(r'\[[^]]*\]', '', plain_text)
  without_urls = re.sub(r'http\S+', '', without_brackets)

  final_text = []
  for raw_token in without_urls.split():
    # Normalize once instead of recomputing strip().lower() for each check.
    token = raw_token.strip().lower()
    # isalpha() also rejects tokens that carry digits or attached punctuation.
    if token not in stop and token.isalpha():
      final_text.append(token)

  return " ".join(final_text)

def generate_corpus(text):
    """Flatten an iterable of strings into a single list of whitespace-separated words."""
    return [word.strip() for document in text for word in document.split()]

def word_count(trained_corpus):
  """Return the 20 most frequent words of the corpus as a {word: count} dict, most frequent first."""
  top_twenty = Counter(trained_corpus).most_common(20)
  return dict(top_twenty)

In [None]:
# df is an alias of train_opt_df, so the cleaning below updates that frame as well.
df = train_opt_df

df['joined_comment']=df['joined_comment'].apply(clean_data)

# Rebuild the TF-IDF features from the cleaned text. train_vectors was originally
# fitted on the raw comments, so without this step the classifiers evaluated
# later never see the effect of the preprocessing.
train_vectors = vectorizer.fit_transform(df[0:limit]["joined_comment"])

# Word-frequency summary of the cleaned comments.
train_corpus = generate_corpus(df.joined_comment)
word_counter = word_count(train_corpus)

In [None]:
# Inspect the frame after clean_data has been applied to joined_comment.
df

In [None]:
# Show the most frequent words and their counts, as returned by word_count.
word_counter

# Evaluation

In [None]:
# Cross-validated evaluation helper.
# The features are vectorized by the caller (TF-IDF earlier in this notebook);
# this cell only scores a classifier on them.
# Each key below appears in the result as 'test_<key>'.
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}
def getScores(clf,train_vectors,train,cv=5):
    """Cross-validate ``clf`` and return the per-fold timings and metrics.

    Args:
        clf: unfitted estimator handed to ``cross_validate``.
        train_vectors: feature matrix with one row per sample.
        train: frame whose ``"label"`` column holds the targets; it must have
            the same number of rows as ``train_vectors``.
        cv: cross-validation setting forwarded to ``cross_validate``
            (defaults to 5 folds, the value that used to be hard-coded).

    Returns:
        The dict produced by ``model_selection.cross_validate``.
    """
    return model_selection.cross_validate(clf, train_vectors, train["label"], cv=cv, scoring=scoring)

## Random Forest Algorithm

*Random Forest* on a reduced subset of the data (40% of the data points used in the final run)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline forest with default hyperparameters (max_features = 'sqrt', criterion = 'gini').
# random_state fixes the forest's internal randomness so the fold scores can be reproduced.
model_rf = RandomForestClassifier(random_state=42)

# Labels are sliced to the same first `limit` rows that train_vectors was built from.
s = getScores(model_rf,train_vectors,train_opt_df[0:limit])
s

{'fit_time': array([374.12144113, 425.84100509, 441.339849  , 382.46513867,
        455.09445262]),
 'score_time': array([0.61582208, 0.56675601, 0.59817982, 0.58158517, 0.59956574]),
 'test_accuracy': array([0.54425, 0.579  , 0.56475, 0.57475, 0.5885 ]),
 'test_f1_score': array([0.10681039, 0.08078603, 0.05328983, 0.11544462, 0.11027027]),
 'test_precision': array([0.22199593, 0.26241135, 0.16955017, 0.2983871 , 0.34113712]),
 'test_recall': array([0.07032258, 0.04774194, 0.0316129 , 0.07156673, 0.06576402])}

In [None]:
# Variant: max_features = 'log2'.
# random_state fixes the forest's internal randomness so the fold scores can be reproduced.
model_rf = RandomForestClassifier(max_features='log2', random_state=42)

# Labels are sliced to the same first `limit` rows that train_vectors was built from.
s = getScores(model_rf,train_vectors,train_opt_df[0:limit])
s

{'fit_time': array([399.1799233 , 463.63591337, 482.82389998, 439.93204737,
        487.31466579]),
 'score_time': array([1.07830477, 1.10223842, 1.11953115, 1.05199337, 1.16634083]),
 'test_accuracy': array([0.54525, 0.57125, 0.56175, 0.57325, 0.58775]),
 'test_f1_score': array([0.09995052, 0.0754717 , 0.06206528, 0.10954617, 0.11865313]),
 'test_precision': array([0.21443737, 0.2295082 , 0.18181818, 0.28688525, 0.346875  ]),
 'test_recall': array([0.06516129, 0.04516129, 0.03741935, 0.06769826, 0.07156673])}

In [None]:
# Variant: criterion = 'entropy'.
# random_state fixes the forest's internal randomness so the fold scores can be reproduced.
model_rf = RandomForestClassifier(criterion = 'entropy', random_state=42)

# Labels are sliced to the same first `limit` rows that train_vectors was built from.
s = getScores(model_rf,train_vectors,train_opt_df[0:limit])
s

{'fit_time': array([353.52123308, 419.28753066, 430.28836989, 366.72966218,
        435.80368781]),
 'score_time': array([0.58800983, 0.56543684, 0.61263895, 0.57790208, 0.62720323]),
 'test_accuracy': array([0.54325, 0.56925, 0.5635 , 0.57175, 0.587  ]),
 'test_f1_score': array([0.10834553, 0.06814494, 0.05621622, 0.11381273, 0.11752137]),
 'test_precision': array([0.22244489, 0.21070234, 0.17333333, 0.28795812, 0.34267913]),
 'test_recall': array([0.0716129 , 0.04064516, 0.03354839, 0.07092199, 0.07092199])}

RF on 50,000 data points


In [None]:
# Default-hyperparameter forest evaluated on the first `limit` rows.
# random_state fixes the forest's internal randomness so the fold scores can be reproduced.
model_rf = RandomForestClassifier(random_state=42)

s = getScores(model_rf,train_vectors,train_opt_df[0:limit])
s

{'fit_time': array([2716.09853077, 2538.54384804, 2443.86491704, 2526.4006083 ,
        2491.0191412 ]),
 'score_time': array([1.85987735, 1.41094112, 1.48795342, 1.4731493 , 1.5818429 ]),
 'test_accuracy': array([0.5275, 0.5329, 0.5252, 0.5298, 0.5284]),
 'test_f1_score': array([0.1921696 , 0.18210471, 0.17166783, 0.17247448, 0.18011127]),
 'test_precision': array([0.32750583, 0.32953105, 0.30769231, 0.31633312, 0.31995059]),
 'test_recall': array([0.13597871, 0.1258166 , 0.11904186, 0.11855795, 0.12533269])}

## Support Vector Machines (SVM) Classification

In [None]:
from thundersvm import SVC

ThunderSVM on GPU with all data points, preprocessed, using the best-performing C and gamma values (candidate values tested: [0.01, 0.1, 1, 10, 100])

C = 10
Gamma = 0.1

In [None]:
# ThunderSVM classifier with the C and gamma values chosen above.
# NOTE(review): max_iter=3 appears to cap the solver at three iterations — confirm this is intentional.
clf = SVC(max_mem_size=1024*3,max_iter=3,C=10,gamma=0.1)

# train_vectors only covers the rows it was fitted on, so the label frame is
# sliced to the same number of rows; passing the full frame would give
# cross_validate inconsistent sample counts.
s = getScores(clf,train_vectors,train_opt_df[0:train_vectors.shape[0]])
s

{'fit_time': array([7.59845471, 7.43085265, 7.49992967, 7.24318337, 6.88702154]),
 'score_time': array([111.67381978, 111.04415989, 111.40876818, 111.59996176,
        112.56222725]),
 'test_accuracy': array([0.48023901, 0.50608661, 0.49915663, 0.50402394, 0.52664408]),
 'test_f1_score': array([0.55211628, 0.2981514 , 0.29323691, 0.28728009, 0.5258396 ]),
 'test_precision': array([0.48504048, 0.51493153, 0.49797302, 0.51027726, 0.52673741]),
 'test_recall': array([0.64072099, 0.20981975, 0.20780159, 0.19991492, 0.52494485])}