# Sarcasm Detection
 **Acknowledgement**

Misra, Rishabh, and Prahal Arora. "Sarcasm Detection using Hybrid Neural Network." arXiv preprint arXiv:1908.07414 (2019).

**Required Files given in below link.**

https://drive.google.com/drive/folders/1xUnF35naPGU63xwRDVGc-DkZ3M8V5mMk

## Install `Tensorflow2.0` 

In [None]:
#!!pip uninstall tensorflow
#!pip install tensorflow==2.0.0

## Get Required Files from Drive

In [None]:
from google.colab import drive

# Mount Google Drive at Colab's standard mount point. The previous argument had
# what looks like a pasted authorisation code prefixed to the path, which is not
# a valid mount point and must not be stored in a notebook.
drive.mount('/content/drive/')

In [0]:
#Set your project path 
# Must be a string ending in "/" because later cells append file names to it.
# TODO(review): point this at the Drive folder that holds the project files.
project_path = '/content/drive/MyDrive/'

## Reading and Exploring Data

## Read Data "Sarcasm_Headlines_Dataset.json". Explore the data and get some insights about the data. ( 8 marks)
Hint - As the file is in JSON format, you need to use the pandas.read_json function. Pass the parameter lines=True.

In [None]:
print("***********************************************************")
print("**************  Loading Dataset and Analysis **************")
print("***********************************************************")

import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, precision_score, recall_score, f1_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras import layers
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

plt.style.use('ggplot')

def plot_history(history):
    """Plot training/validation accuracy and loss curves side by side.

    Args:
        history: object whose ``history`` attribute is a dict of per-epoch
            metric lists (as returned by ``model.fit``). Loss is read from
            ``'loss'`` / ``'val_loss'``; accuracy from ``'acc'`` / ``'val_acc'``
            or, when ``'acc'`` is absent, ``'accuracy'`` / ``'val_accuracy'``.

    Raises:
        KeyError: if a required metric key is missing.
    """
    metrics = history.history
    # The accuracy key depends on the Keras version and on the metric name
    # passed to compile(); accept both spellings instead of assuming 'acc'.
    acc_key = 'acc' if 'acc' in metrics else 'accuracy'
    acc = metrics[acc_key]
    val_acc = metrics['val_' + acc_key]
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    x = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    
def plot_roc(model, X_test, y_test):
    """Plot the ROC curve of a binary classifier and show its AUC in the legend.

    Args:
        model: fitted model exposing ``predict_proba`` or, failing that,
            ``predict`` (whose output is then used as the score).
        X_test: features to score.
        y_test: true binary labels for ``X_test``.
    """
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X_test)
    else:
        proba = model.predict(X_test)
    proba = np.asarray(proba)
    # roc_curve needs one score per sample. A two-column result holds one column
    # per class, so keep the last (positive-class) column; a single-column
    # result is simply flattened.
    if proba.ndim == 2 and proba.shape[1] <= 2:
        proba = proba[:, -1]
    fpr, tpr, _ = roc_curve(y_test, proba)
    auc_val = auc(fpr, tpr)

    plt.figure(figsize=(14,8))
    plt.title('Reciever Operating Charactaristics')
    plt.plot(fpr,tpr,'b',label = 'AUC = %0.2f' % auc_val)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')  # chance-level diagonal
    plt.ylabel('True positive rate')
    plt.xlabel('False positive rate')
    
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from a pre-trained vector file.

    Each non-blank line of the file is read as ``word v1 v2 ...`` separated by
    whitespace. Words missing from the file keep an all-zero row.

    Args:
        filepath: path to the UTF-8 text file of pre-trained embeddings.
        word_index: mapping word -> integer row index, as created by a tokenizer.
        embedding_dim: number of columns to keep from each vector; should not
            exceed the dimension of the pre-trained vectors.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, encoding="utf8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line: nothing to unpack, skip instead of raising.
                continue
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [None]:
print("***********************************************************")
print("**************  Loading Dataset and Analysis **************")
print("***********************************************************")

In [2]:
import json
import pandas as pd
dataset = pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
dataset.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [None]:
print("***************************************************************")
print("**************  Buliding functions for Data Cleaning **********")
print("***************************************************************")

Importing required libraries

In [None]:
### Remove Punctuations and change words to lower case
def remove_punctuations(text):
    """Lower-case ``text``, strip punctuation and return the list of tokens.

    The text is split on whitespace; every character that is neither a word
    character nor whitespace is removed from each token, and tokens left empty
    are dropped.
    """
    cleaned = []
    for token in text.split():
        stripped = re.sub(r'[^\w\s]', '', token.lower())
        cleaned.extend(stripped.split())
    return cleaned

### Remove StopWords
stop = set(stopwords.words('english'))
def remove_stopwords(text):
    """Return the tokens of ``text`` (an iterable of words) that are not in the module-level ``stop`` set."""
    kept = []
    for token in text:
        if token in stop:
            continue
        kept.append(token)
    return kept

### Stemming of Words
from nltk.stem.porter import PorterStemmer
st=PorterStemmer()
def Stemming(text):
    """Stem every token in ``text`` with the module-level stemmer ``st`` and return them as a list."""
    return list(map(st.stem, text))

### Recreating the sentence
def Recreate(text):
    """Join an iterable of tokens into one string separated by single spaces."""
    return str.join(" ", text)

def Cleaning(text):
    """Clean a raw string: strip punctuation, drop stop words and re-join the tokens.

    Stemming is not applied here; the ``Stemming`` call stays disabled.
    """
    tokens = remove_stopwords(remove_punctuations(text))
    # text_stemmed=Stemming(text_stopword_removed)
    return Recreate(tokens)

In [None]:
print("*******************************************************")
print("**************  SentimentIntensityAnalyzer ************")
print("*******************************************************")

In [None]:
# Imported here so this cell runs on a fresh kernel: no earlier cell imports
# either name.
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer

vader_model = SentimentIntensityAnalyzer()
nlp = spacy.load('en')

def sentiment_classification(scores):
    """Map a score dict to a sentiment label using its ``'compound'`` value.

    Returns ``'positive'`` for compound >= 0.05, ``'negative'`` for
    compound <= -0.05 and ``'neutral'`` otherwise.
    """
    compound = scores['compound']
    if compound >= 0.05:
        return 'positive'
    if compound <= -0.05:
        return 'negative'
    return 'neutral'

In [None]:
# All headlines as a plain Python list. .tolist() walks the column in order and
# avoids the label-based lookup dataset['headline'][i], which raises KeyError
# whenever the DataFrame index is not exactly 0..n-1.
sentences = dataset['headline'].tolist()

In [None]:
system_output= []
for sent in sentences[:10]:
    scores = vader_model.polarity_scores(sent)
    system_output.append(sentiment_classification(scores))
    print()
    print('Input sentence:', sent)
    print('Vader output:', scores)

Input sentence: former versace store clerk sues over secret 'black code' for minority shoppers
Vader output: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Input sentence: the 'roseanne' revival catches up to our thorny political mood, for better and worse
Vader output: {'neg': 0.272, 'neu': 0.576, 'pos': 0.152, 'compound': -0.3182}

Input sentence: mom starting to fear son's web series closest thing she will have to grandchild
Vader output: {'neg': 0.198, 'neu': 0.802, 'pos': 0.0, 'compound': -0.4939}

Input sentence: boehner just wants wife to listen, not come up with alternative debt-reduction ideas
Vader output: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Input sentence: j.k. rowling wishes snape happy birthday in the most magical way
Vader output: {'neg': 0.0, 'neu': 0.629, 'pos': 0.371, 'compound': 0.6486}

Input sentence: advancing the world's women
Vader output: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Input sentence: the fascinating case for eating lab-grown meat
Vader output: {'neg': 0.0, 'neu': 0.632, 'pos': 0.368, 'compound': 0.5423}

Input sentence: this ceo will send your kids to school, if you work for his company
Vader output: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Input sentence: top snake handler leaves sinking huckabee campaign
Vader output: {'neg': 0.0, 'neu': 0.769, 'pos': 0.231, 'compound': 0.2023}

Input sentence: friday's morning email: inside trump's presser for the ages
Vader output: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [None]:
# Count headlines per source site and per label with vectorised pandas
# operations. This avoids a Python loop and the label-based lookup
# dataset[col][i], which breaks on a non-default index.
links = dataset['article_link']
num_huffington = int(links.str.contains("huffingtonpost", regex=False).sum())
num_onion = int(links.str.contains("theonion", regex=False).sum())
num_sarcastic = int((dataset['is_sarcastic'] == 1).sum())
# Every row whose label is not 1 counts as non-sarcastic.
num_non_sarcastic = len(dataset) - num_sarcastic

In [None]:
print("*******************************************************")
print("**************  Insight Of the given Data* ************")
print("*******************************************************")

In [None]:
# data to plot
n_groups = 2
links = dataset['article_link']
sarcastic = dataset['is_sarcastic'] == 1
# One boolean mask per site, in the order of the x tick labels below.
site_masks = [links.str.contains(site, regex=False, na=False)
              for site in ("huffingtonpost", "theonion")]

# Per-site totals and per-site label counts, computed from the data. The old
# expression len(dataset) - num_sarcastic - num_non_sarcastic was always 0.
group_news = tuple(int(mask.sum()) for mask in site_masks)
group_sarcasm = tuple(int((mask & sarcastic).sum()) for mask in site_masks)
group_non_sarcasm = tuple(int((mask & ~sarcastic).sum()) for mask in site_masks)

# create plot
fig, ax = plt.subplots()
index = np.arange(n_groups)
bar_width = 0.20
opacity = 0.8

rects1 = ax.bar(index, group_news, bar_width,
                alpha=opacity,
                color='y',
                label='Total number of headlines')

rects2 = ax.bar(index + bar_width, group_sarcasm, bar_width,
                alpha=opacity,
                color='c',
                label='Is Sarcastic')

rects3 = ax.bar(index + 2*bar_width, group_non_sarcasm, bar_width,
                alpha=opacity,
                color='m',
                label='Is Not Sarcastic')

ax.set_xlabel('Websites')
ax.set_ylabel('Number of headlines')
ax.set_title('Number of headlines by websites')
ax.set_xticks(index + bar_width)
ax.set_xticklabels(('Huffington', 'The Onion'))
ax.legend()

fig.tight_layout()
# Save before show(): once the figure has been shown it may already be closed,
# and saving afterwards can write an empty image.
fig.savefig('Sarcasm_news_data1')
plt.show()

In [None]:
print("*******************************************************")
print("************** Function to Clean the Text  ************")
print("*******************************************************")

In [None]:
def clean_text(text):
    """Normalise a raw string and return its stemmed tokens joined by spaces.

    Steps, in order: lower-case and split on whitespace; drop NLTK English stop
    words and tokens shorter than 3 characters; replace characters outside the
    allowed set with spaces; expand common contractions; pad selected symbols
    with spaces; apply a few fixed rewrites; collapse whitespace runs;
    Snowball-stem every remaining token.

    Args:
        text: the string to clean.

    Returns:
        The cleaned, stemmed text as a single space-separated string.
    """
    # Imported locally so the function does not depend on cell execution order
    # and keeps working if the global name `stopwords` has been rebound.
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer

    # The former `text.translate(string.punctuation)` call was removed: str.translate
    # treats its argument as a table indexed by code point, so it never removed
    # punctuation and instead turned control characters such as "\n" into
    # punctuation. Punctuation is handled by the regexes below.
    ## Convert words to lower case and split them
    text = text.lower().split()

    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]

    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # NOTE(review): in a regex "\0" is the NUL character, so this matches NUL + "s",
    # not the literal "0s" - confirm the intended pattern.
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

In [None]:
# NOTE(review): this rebinds `dataset` to a tab-separated file read with
# header=None, while the following cells index `dataset` by the column names
# 'is_sarcastic' and 'headline'. Confirm whether this load is still wanted.
dataset = pd.read_csv('train.txt',sep='\t',header=None)
dataset

Cleaning data and preprocessing..................

In [None]:
# Class balance of the label column. The Series is passed as `x=` explicitly
# because current seaborn releases treat the first positional argument as
# `data`, not `x`.
sns.countplot(x=dataset['is_sarcastic'])

In [None]:
# Lower-cased, punctuation-free headline text: tokenise, then re-join with spaces.
dataset['clean_headline'] = dataset['headline'].apply(
    lambda headline: Recreate(remove_punctuations(headline))
)
dataset.head()

In [None]:
# Cleaning the text
import re
import nltk
import spacy  # used below but not imported by any earlier cell
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

corpus = []
nlp = spacy.load('en')
for headline in dataset['headline']:
    doc = nlp(headline)
    # Keep lower-cased lemmas of tokens longer than 3 characters that are not
    # stop words, punctuation or digits. The unused label-based lookup
    # dataset['headline'][i] was dropped: its value was overwritten before use.
    tokens = [word.lemma_.lower() for word in doc
        if len(word) > 3 and not (word.is_stop or word.is_punct or word.is_digit)]
    corpus.append(" ".join(tokens)) # add it to the corpus

In [None]:
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer
vader_model = SentimentIntensityAnalyzer()

In [None]:
# One row per cleaned headline holding the 'neg', 'neu', 'pos' and 'compound'
# values returned by vader_model.polarity_scores, in that order.
score_keys = ('neg', 'neu', 'pos', 'compound')
sentiments = []
for headline in corpus:
    polarity = vader_model.polarity_scores(headline)
    sentiments.append([polarity[key] for key in score_keys])

sentiment_array = np.array(sentiments)

In [None]:
print((sentiment_array.shape))

In [None]:
print("**********************************************")
print("************** Bag of Words Model ************")
print("**********************************************")

In [None]:
# bag of words model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Select the label by name: a positional lookup (iloc[:, 2]) silently picks a
# different column whenever the column order changes or a column is added.
y = dataset['is_sarcastic']
features_n = range(2800, 3500, 100) # list of different max vectors to try
scores = []  # error rate for each entry of features_n

for i in features_n:

    #Td-idf
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3),max_features = i)
    X = tf.fit_transform(corpus).toarray() #get the preprocessed text

    #add sentiment scores to feature vector
    X = np.concatenate((X,sentiment_array),axis=1)

    #Splitting
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

    #Logistic regression (our main model)
    classifier = LogisticRegression(random_state=0, solver='lbfgs')

    #Multinomial Naive Bayes (only for the last part)
    #classifier2 = MultinomialNB().fit(X_train, y_train)

    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    #Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    # Rows are true labels and columns are predicted labels, both in sorted
    # label order, so the flattened 2x2 matrix reads TN, FP, FN, TP.
    TN, FP, FN, TP = cm.ravel()
    error_rate = (FP+FN)/(TP+TN+FP+FN) # calculating the error rate based on confusion matrix results
    scores.append(error_rate)

In [None]:
# Show the feature-matrix dimensions; type(X.shape) is always `tuple` and says
# nothing about the data.
print(X.shape)

In [None]:
#Printing out the optimal max features value and plot the results
lowest_error = min(scores)
optimal_n = features_n[scores.index(lowest_error)]
print("The optimal number of max vectors is %d with an error rate of %.3f" % (optimal_n, lowest_error))
plt.plot(features_n, scores)
plt.xlabel('Number of Max Vectors')
plt.ylabel('Error Rate')
plt.show()
#plt.savefig('Sarcasm_Error')

In [None]:
print('Confusion matrix\n',confusion_matrix(y_test, y_pred))

In [None]:
print('Classification_report\n',classification_report(y_test, y_pred))

In [None]:
scores

In [None]:
# confusion_matrix orders rows and columns by sorted label value, i.e. 0 first
# and 1 second. In this dataset is_sarcastic == 1 marks the sarcastic
# headlines, so the tick labels must be listed in that order.
classes = ['Not Sarcastic', 'Sarcastic']
plt.rcParams["figure.figsize"] = (7,5)
normalize = True
cm = confusion_matrix(y_test,y_pred)
if normalize:
    # Row-normalise so each cell is the fraction of its true class; previously
    # the flag only changed the number format and raw counts were shown.
    cm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

fig, ax = plt.subplots()
im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.GnBu)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(xticks=np.arange(cm.shape[1]),
       yticks=np.arange(cm.shape[0]),
       # ... and label them with the respective list entries
       xticklabels=classes, yticklabels=classes,
       title= "Sarcasm Outcomes",
       ylabel='True label',
       xlabel='Predicted label')
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.  # cells darker than this get white text for contrast
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")
fig.autofmt_xdate()
#plt.savefig('Sarcasm_Confusion_Matrix')

In [None]:
def important_features_per_class(vectorizer,classifier,n=80):
    """Print the ``n`` features with the highest per-class counts of a two-class model.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or,
            on older scikit-learn releases, ``get_feature_names()``.
        classifier: fitted classifier exposing ``classes_`` and
            ``feature_count_`` (one row of counts per class).
        n: number of features to print for each class.
    """
    class_labels = classifier.classes_
    # get_feature_names() was removed from scikit-learn; prefer the replacement
    # and fall back to the old name only when the new one does not exist.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    # Sort (count, name) pairs in descending order and keep the first n.
    topn_class1 = sorted(zip(classifier.feature_count_[0], feature_names),reverse=True)[:n]
    topn_class2 = sorted(zip(classifier.feature_count_[1], feature_names),reverse=True)[:n]
    print("Important words in non-sarcastic documents")
    for coef, feat in topn_class1:
        print(class_labels[0], coef, feat)
    print("-----------------------------------------")
    print("Important words in sarcastic documents")
    for coef, feat in topn_class2:
        print(class_labels[1], coef, feat)

In [None]:
# get_feature_names() was removed from scikit-learn; prefer the replacement and
# fall back to the old name only when the new one does not exist.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# Pair each vectorizer feature with its coefficient. zip() stops at the shorter
# input, so any coefficients beyond the vectorizer's features are ignored.
feature_to_coef = {
    word: coef for word, coef in zip(feature_names, classifier.coef_[0])
}

# Five largest coefficients
for best_positive in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1], 
    reverse=True)[:5]:
    print (best_positive)
    
print("-----------------------------------------")

# Five smallest coefficients
for best_negative in sorted(
    feature_to_coef.items(), 
    key=lambda x: x[1])[:5]:
    print (best_negative)

In [None]:
sentences = dataset['clean_headline'].values  
y = dataset['is_sarcastic'].values             

In [None]:
#split to train/test
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences, y, test_size=0.25, random_state=1000
)

#Build simple counts
# The vocabulary is learned from the training sentences only, then applied to both splits.
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
X_train, X_test = (vectorizer.transform(split) for split in (sentences_train, sentences_test))

## Drop `article_link` from dataset. ( 4 marks)
We only need the headline text and the is_sarcastic column for this project, so we can drop the article link column here.

In [None]:
# `df` is the frame the later modelling cells read ('headline', 'is_sarcastic').
# It was never defined, so derive it from `dataset`. drop() returns a new frame,
# which keeps `dataset` intact and makes this cell safe to re-run.
df = dataset.drop(columns='article_link')
df.head()

## Get the Length of each line and find the maximum length. ( 8 marks)
As different lines have different lengths, we need to pad our sequences to the maximum length.

In [0]:
import random

# NOTE(review): `data`, TEXT, LABEL, BATCH_SIZE and `device` are not defined in
# any cell visible here - confirm they are set up before this cell runs.
pos = data.TabularDataset(
    path='sarcasm_headlines.txt', format='csv',
    csv_reader_params={'delimiter':"\t"},
    fields=[('text', TEXT),
            ('label', LABEL)])

# random.seed() returns None, so passing its result as random_state left both
# splits unseeded. Seed the generator first and pass its state instead.
random.seed(421)
split_state = random.getstate()

# Split data into 90/10 training/test
trainandval, test_data = pos.split(split_ratio=0.90, random_state=split_state)

# Of the remaining training data, 80/20 train/validation
train_data, valid_data = trainandval.split(split_ratio=0.80, random_state=split_state)

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,sort_key=lambda x: len(x.text),
    device=device)

In [None]:
class RNN(nn.Module):
    """Embedding -> (optionally bidirectional) multi-layer LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: LSTM hidden size per direction.
        output_dim: size of the linear head's output.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: drop probability for the LSTM (between layers) and for the
            dropout applied to the embeddings and to the final hidden state.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, 
                           dropout=dropout)
        # The head sees one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * (2 if bidirectional else 1), output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """Return logits of shape ``(batch, output_dim)`` for token indices ``x``.

        ``x`` is passed to ``nn.LSTM`` without ``batch_first``, i.e. in the
        LSTM's default ``(seq_len, batch)`` layout.
        """
        embedded = self.dropout(self.embedding(x))
        output, (hidden, cell) = self.rnn(embedded)
        if self.bidirectional:
            # Last layer's forward and backward final states, concatenated.
            hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        else:
            hidden = hidden[-1,:,:]
        # No squeeze(0): it removed the batch dimension for single-sample
        # batches, which broke the callers' .squeeze(1).
        return self.fc(self.dropout(hidden))

In [None]:
#Maximum vocabulary, choose word vectors
# Both vocabularies are built from the training split only.
TEXT.build_vocab(train_data,max_size=750, vectors="glove.twitter.27B.100d")
LABEL.build_vocab(train_data)
#Network Hyperparameters
INPUT_DIM = len(TEXT.vocab)  # passed to RNN as vocab_size
EMBEDDING_DIM = 100  # presumably chosen to match the 100d vectors requested above - TODO confirm
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.925  # NOTE(review): unusually high drop probability - confirm this is intended

In [None]:
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

In [None]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.001)
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
  """Return the fraction of predictions that match ``y``.

  ``preds`` are raw logits: they are passed through a sigmoid and rounded to
  0/1 before being compared with ``y``. The result is a 0-dim tensor.
  """
  probabilities = torch.sigmoid(preds)
  hits = probabilities.round().eq(y).float()  # 1.0 where prediction == target
  return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
  """Run one optimisation pass over ``iterator``.

  Each batch must expose ``text`` and ``label``. Returns a tuple
  ``(loss, accuracy)``, each averaged over the number of batches.
  """
  model.train()
  total_loss, total_acc = 0, 0
  for batch in iterator:
    optimizer.zero_grad()
    logits = model(batch.text).squeeze(1)
    batch_loss = criterion(logits, batch.label)
    batch_acc = binary_accuracy(logits, batch.label)
    batch_loss.backward()
    optimizer.step()
    total_loss += batch_loss.item()
    total_acc += batch_acc.item()
  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
  """Score ``model`` on ``iterator`` in eval mode with gradients disabled.

  Each batch must expose ``text`` and ``label``. Returns a tuple
  ``(loss, accuracy)``, each averaged over the number of batches.
  """
  model.eval()
  total_loss, total_acc = 0, 0
  with torch.no_grad():
    for batch in iterator:
      logits = model(batch.text).squeeze(1)
      total_loss += criterion(logits, batch.label).item()
      total_acc += binary_accuracy(logits, batch.label).item()
  n_batches = len(iterator)
  return total_loss / n_batches, total_acc / n_batches

## Modelling

## Import required modules required for modelling.

In [0]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential

## Set Different Parameters for the model. ( 4 marks)

In [0]:
max_features = 10000
# The unfilled placeholder left this assignment as a syntax error. 40 is the
# padding length the tokenising cells further down already use.
maxlen = 40
embedding_size = 200

## Apply Keras Tokenizer on the headline column of your data.  ( 8 marks)
Hint - First create a tokenizer instance using Tokenizer(num_words=max_features).
Then fit this tokenizer instance on your data column df['headline'] using .fit_on_texts().

In [None]:
print("**************************************************")
print("************** Apply Keras Tokenizer *************")
print("**************************************************")

In [0]:
from nltk.tokenize import word_tokenize
# Token list for every entry of `corpus`, in the same order.
text_data = [word_tokenize(sent) for sent in corpus]

In [None]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
# NOTE: this rebinds `corpus` from the list of cleaned strings to bag-of-words
# vectors, so cells that expect strings must run before this one.
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
# Use a context manager so the file is flushed and closed deterministically.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [None]:
import gensim
NUM_TOPICS = 30
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model1.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(12, '0.230*"trump" + 0.079*"donald" + 0.061*"house" + 0.051*"white"')
(19, '0.056*"open" + 0.046*"office" + 0.036*"teacher" + 0.032*"happen"')
(17, '0.115*"obama" + 0.064*"leave" + 0.027*"explain" + 0.022*"believe"')
(14, '0.063*"star" + 0.050*"girl" + 0.038*"force" + 0.030*"dead"')
(25, '0.044*"give" + 0.043*"self" + 0.031*"candidate" + 0.031*"couple"')
(0, '0.054*"work" + 0.046*"stop" + 0.029*"word" + 0.027*"father"')
(27, '0.063*"want" + 0.059*"come" + 0.050*"tell" + 0.045*"trump"')
(13, '0.049*"million" + 0.035*"food" + 0.030*"sign" + 0.027*"award"')
(28, '0.074*"know" + 0.061*"need" + 0.056*"kill" + 0.036*"bush"')
(1, '0.059*"plan" + 0.039*"fight" + 0.036*"baby" + 0.034*"announce"')
(16, '0.094*"good" + 0.071*"family" + 0.052*"friend" + 0.035*"movie"')
(2, '0.065*"like" + 0.063*"nation" + 0.062*"look" + 0.059*"thing"')
(23, '0.067*"clinton" + 0.045*"health" + 0.038*"hillary" + 0.034*"care"')
(29, '0.044*"fire" + 0.029*"employee" + 0.025*"tweet" + 0.025*"race"')
(7, '0.041*"hope" + 0.039*"die" + 0.032*"mother" + 0.026*"line"')
(10, '0.155*"woman" + 0.091*"area" + 0.056*"change" + 0.026*"climate"')
(21, '0.056*"state" + 0.053*"president" + 0.037*"night" + 0.035*"deal"')
(11, '0.050*"election" + 0.048*"student" + 0.034*"college" + 0.030*"body"')
(24, '0.054*"party" + 0.050*"video" + 0.043*"bill" + 0.032*"bring"')
(3, '0.098*"report" + 0.077*"life" + 0.056*"watch" + 0.051*"make"')

In [None]:
import json
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
def parse_data(file):
    """Yield one decoded JSON value per non-blank line of a JSON-lines file.

    Args:
        file: path to a UTF-8 text file with one JSON document per line.

    Yields:
        The object decoded from each line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager closes the file even if the consumer stops early.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            yield json.loads(line)

# turn a generator into a list
data = list(parse_data('Sarcasm_Headlines_Dataset.json'))
    

In [None]:
sentences = [item['headline'] for item in data]
labels = [item['is_sarcastic'] for item in data]
urls = [item['article_link'] for item in data]

#Stopwords list from https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# Stored under its own name, as a frozenset: the name `stopwords` already refers
# to the nltk corpus module imported earlier (rebinding it breaks
# stopwords.words(...)), and set membership is O(1).
YOAST_STOPWORDS = frozenset([
             "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
             "any", "are", "as", "at", "be", "because", "been", "before", "being", "below",
             "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down",
             "during", "each", "few", "for", "from", "further", "had", "has", "have",
             "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers",
             "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm",
             "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me",
             "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
             "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
             "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
             "their", "theirs", "them", "themselves", "then", "there", "there's", "these",
             "they", "they'd", "they'll", "they're", "they've", "this", "those", "through",
             "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where",
             "where's", "which", "while", "who", "who's", "whom", "why", "why's",
             "with", "would", "you", "you'd", "you'll", "you're", "you've", "your",
             "yours", "yourself", "yourselves"])

# Drop stop words from every headline (whitespace tokenisation, exact match).
sentences = [
    ' '.join(w for w in sentence.strip().split() if w not in YOAST_STOPWORDS)
    for sentence in sentences
]

In [None]:
# out of vocabulary words token <OOV>
# Tokenizer object used to tokenize sentences
tokenizer = Tokenizer(oov_token="<OOV>")
# Tokenize a list of sentences
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
# unique words
len(word_index)

# Encode a list os sentences to use the tokens
sequences = tokenizer.texts_to_sequences(sentences)

# Add  0s at the end of sequence to match the length of the
# longest seqeunce
padded = pad_sequences(sequences, padding='post')

padded.shape

In [None]:
print("Has been created successfully.............................")

## Define X and y for your model

In [0]:
print("********************************************")
print("************** Define X and y  *************")
print("********************************************")
X = tokenizer.texts_to_sequences(df['headline'])
X = pad_sequences(X, maxlen = maxlen)
y = np.asarray(df['is_sarcastic'])

print("Number of Samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])

## Get the Vocabulary size ( 4 marks)
Hint : You can use tokenizer.word_index.

In [None]:
print("*****************************************************")
print("************** Size Od the Vocabulary   *************")
print("*****************************************************")

In [0]:
maxlen = 40

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

## Word Embedding

In [None]:
print("****************************************************")
print("**************  Glove Word Embeddings **************")
print("****************************************************")

In [0]:
glove_file = project_path + "glove.6B.zip"

In [0]:
#Extract Glove embedding zip file
from zipfile import ZipFile
with ZipFile(glove_file, 'r') as z:
  z.extractall()

In [None]:
maxlen = 40

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_train)
X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

# Pad sequences with zeros
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

# Get the Word Embeddings using Embedding file as given below.

In [0]:
EMBEDDING_FILE = './glove.6B.200d.txt'

# word -> float32 vector, one entry per non-blank line of the embedding file.
embeddings = {}
# Explicit UTF-8 and a context manager: the platform default encoding may not
# decode the file, and the handle was previously never closed.
with open(EMBEDDING_FILE, encoding='utf8') as embedding_file:
    for o in embedding_file:
        if not o.strip():
            continue
        # Split once: first field is the word, the rest are the components.
        word, *values = o.rstrip().split(" ")
        embeddings[word] = np.asarray(values, dtype='float32')



# Create a weight matrix for words in training docs

In [0]:
# `num_words` was never defined. One row per tokenizer index plus the reserved
# index 0 guarantees every i in word_index is a valid row.
num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, 200))

# Words without a pre-trained vector keep their all-zero row.
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

len(embeddings.values())

## Create and Compile your Model  ( 14 marks)
Hint - Use Sequential model instance and then add Embedding layer, Bidirectional(LSTM) layer, then dense and dropout layers as required. 
In the end add a final dense layer with sigmoid activation for binary classification.


In [0]:
### Embedding layer for hint 
## model.add(Embedding(num_words, embedding_size, weights = [embedding_matrix]))
### Bidirectional LSTM layer for hint 
## model.add(Bidirectional(LSTM(128, return_sequences = True)))

Defining a baseline model

In [None]:
from sklearn.linear_model import LogisticRegression

#Try a simple classifier
# NOTE(review): the closest assignment of X_train/X_test above this cell is the
# pad_sequences output, so the regression would be fit on padded word indices
# rather than on bag-of-words counts — confirm this is the intended baseline.
classifier = LogisticRegression(solver='lbfgs', max_iter=200)
classifier.fit(X_train, y_train)
# score() is the mean accuracy on the held-out split; the remaining metrics are
# computed from the hard class predictions.
score = classifier.score(X_test, y_test)
y_pred = classifier.predict(X_test)
print("Accuracy:", score)
print("Precision: %1.3f" % precision_score(y_test, y_pred))
print("Recall: %1.3f" % recall_score(y_test, y_pred))
print("F1: %1.3f\n" % f1_score(y_test, y_pred))

In [None]:
# Writes a diagram of `model` to model_1.png.
# NOTE(review): the only import of plot_model visible in this part of the
# notebook is in a later cell, and `model` is not assigned above this cell
# here — confirm both are in scope on a fresh Run All.
plot_model(model, to_file='model_1.png')

In [None]:
# Callback object passed to model.fit via callbacks=[csv_logger] in later cells.
# NOTE(review): no CSVLogger import is visible in this part of the notebook —
# confirm it is imported earlier.
csv_logger = CSVLogger("model_history_log.csv")

In [None]:
# Trains for 50 epochs in batches of 20, holding out 30% of (X, y) for validation.
# NOTE(review): `model`, `X` and `y` are not assigned in this part of the
# notebook before this cell — confirm where they come from.
model.fit(X,y,epochs=50,batch_size=20,validation_split=0.3,callbacks=[csv_logger])

Loading GloVe pre-trained embeddings

In [None]:
# Rebuilds embedding_matrix from the 50-dimensional GloVe file. An earlier cell
# rebinds embedding_matrix to a 200-column array, so this cell has to run before
# the models below, which pass embedding_dim = 50 together with this matrix.
# NOTE(review): create_embedding_matrix is not defined in this part of the notebook.
embedding_dim = 50
embedding_matrix = create_embedding_matrix('glove.6B.50d.txt', tokenizer.word_index, embedding_dim)

In [None]:
# Console banner marking the start of the CNN section.
print("*****************************************")
print("**************   CNN Model **************")
print("*****************************************")

CNN Model

In [None]:
embedding_dim = 50

# 1-D convolutional classifier: GloVe-initialised (and still trainable)
# embeddings -> Conv1D -> global max pooling -> two dense+dropout stages ->
# single sigmoid unit for the binary label.
cnn = Sequential([
    layers.Embedding(
        vocab_size,
        embedding_dim,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=True,
    ),
    layers.Conv1D(128, 5, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid'),
])
cnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
cnn.summary()

In [None]:
# Train the CNN for 5 epochs in mini-batches of 10.
# Note: the test split is passed as validation_data, so the per-epoch
# validation metrics are computed on the same data used for the final
# test-set evaluation.
history = cnn.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

In [None]:
# Accuracy comes from the compiled 'accuracy' metric on each split.
loss, accuracy = cnn.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: %1.3f" % accuracy)
loss, accuracy = cnn.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  %1.3f" % accuracy)
# Precision/recall/F1 are computed from hard labels obtained by thresholding the
# predicted probability at 0.7.
# NOTE(review): this cut-off is applied only here, not inside evaluate(), so the
# accuracy above and the metrics below may be based on different decision
# thresholds — confirm this is intended.
y_pred = cnn.predict(X_test)
y_pred = [1 if pred > 0.7 else 0 for pred in y_pred]

print("Precision: %1.3f" % precision_score(y_test, y_pred))
print("Recall: %1.3f" % recall_score(y_test, y_pred))
print("F1: %1.3f\n" % f1_score(y_test, y_pred))
plot_history(history)

In [None]:
# ROC plot for the CNN on the test split.
# NOTE(review): plot_roc is not defined in this part of the notebook.
plot_roc(cnn, X_test, y_test)

In [None]:
# Console banner marking the start of the bidirectional LSTM section.
print("*****************************************")
print("*****   Bidirectional(LSTM) layer *******")
print("*****************************************")

LSTM

In [None]:
embedding_dim = 50

# Stacked bidirectional LSTM classifier on top of GloVe-initialised (and still
# trainable) embeddings, ending in a single sigmoid unit for the binary label.
lstm = Sequential()
lstm.add(layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=maxlen, trainable=True))
# The first recurrent layer returns the full sequence so the second one can
# consume it; the second returns only its final state.
lstm.add(layers.Bidirectional(layers.LSTM(16, return_sequences=True, recurrent_dropout=0.1, dropout=0.1)))
lstm.add(layers.Bidirectional(layers.LSTM(32, recurrent_dropout=0.1, dropout=0.1)))
lstm.add(layers.Dense(1, activation='sigmoid'))
lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called directly
# (as in the CNN cell) rather than wrapped in print(), which emitted a stray "None".
lstm.summary()

# Note: the test split is passed as validation_data, so the per-epoch
# validation metrics are computed on the test data.
history = lstm.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=64)

In [None]:
# `accuracy` holds the training-split value until it is printed below and then
# rebound to the test-split value.
loss, accuracy = lstm.evaluate(X_train, y_train, verbose=False)
# Hard labels for precision/recall/F1 use a 0.7 probability cut-off.
# NOTE(review): this cut-off is applied only here, not inside evaluate() —
# confirm the accuracy and the other metrics are meant to differ in threshold.
y_pred = lstm.predict(X_test)
y_pred = [1 if pred > 0.7 else 0 for pred in y_pred]

print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = lstm.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
print("Precision: %1.3f" % precision_score(y_test, y_pred))
print("Recall: %1.3f" % recall_score(y_test, y_pred))
print("F1: %1.3f\n" % f1_score(y_test, y_pred))
plot_history(history)

In [None]:
# ROC plot for the LSTM on the test split.
# NOTE(review): plot_roc is not defined in this part of the notebook.
plot_roc(lstm, X_test, y_test)

In [None]:
# Console banner marking the start of the interactive testing section.
print("**************")
print("Testing models")
print("**************")

In [None]:
# Session.predict reads the global `model`; point it at the trained LSTM.
model = lstm

class Session:
    """Scores free text with the notebook-level `model`.

    The class keeps no state of its own: it reads the globals `tokenizer`,
    `maxlen` and `model` at call time.
    """

    def prepareInput(self, text):
        """Encode one string as a single-row, post-padded batch of word indices."""
        token_ids = tokenizer.texts_to_sequences([text])
        return pad_sequences(token_ids, padding='post', maxlen=maxlen)

    def predict(self, text):
        """Return a label string with the predicted probability appended.

        The text is labelled sarcastic only when the probability is strictly
        greater than 0.7.
        """
        batch = self.prepareInput(text)
        probability = model.predict(batch)[0][0]
        if probability > 0.7:
            return "Sarcastic {}".format(probability)
        return "Not sarcastic {}".format(probability)

    def reply(self, text):
        """Chat-facing alias for predict()."""
        return self.predict(text)

In [None]:
session = Session()

print ('[sarcasmBot]: Hi! Write something sarcastic, or not?')
print ('[sarcasmBot]: Type \'end\' to exit.')

# Read lines until the user types 'end'. The sentinel is checked before the
# text is scored, so 'end' itself is never classified or echoed back.
# The input is held in `user_text` rather than `inp`, because a later cell
# builds a model with Model(inputs=[inp, af], ...) and must not find a string there.
while True:
    user_text = input('[User]: ')
    if user_text.strip() == "end":
        break
    print ('[sarcasmBot]:', session.reply(user_text))
    print ('[sarcasmBot]: write more sarcastic stuff, or not?')

## Fit your model with a batch size of 100 and validation_split = 0.2, and state the validation accuracy (10 marks)


In [None]:
# Console banner marking the start of the validation-accuracy section.
print("*************************************")
print("*****  The validation accuracy ******")
print("*************************************")

In [0]:
# Training hyper-parameters read by the model.fit call in the next cell.
batch_size = 100
epochs = 5

In [None]:
# Two-input functional model.
# NOTE(review): inp, af, predictions, X, senti_data and y are not defined in
# this part of the notebook — confirm they are built earlier.
# The Keras keyword is `outputs` (plural).
model = Model(inputs=[inp, af], outputs=predictions)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Everything after the data is passed by keyword: the original call placed
# positional arguments after a keyword argument (a SyntaxError), and fit()'s
# third positional parameter is batch_size, so a positional `epochs` would have
# been read as the batch size.
model.fit([X, senti_data], [y],
          epochs=epochs,
          batch_size=batch_size,
          validation_split=0.2,
          callbacks=[csv_logger])

In [None]:
from keras.utils import plot_model
import os

# Directory appended to PATH before plotting; the path is a Windows
# Graphviz 2.38 install location.
graphviz_bin = 'C:/Program Files (x86)/Graphviz2.38/bin/'
# Append only once: re-running the cell previously added another copy of the
# directory to PATH each time.
if graphviz_bin not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin
# Writes a diagram of `model` to model_3.png.
plot_model(model, to_file='model_3.png')

In [None]:
# Train for N_EPOCHS epochs and keep a checkpoint of the best validation accuracy.
# NOTE(review): torch, train, evaluate, train_iterator, valid_iterator,
# optimizer and criterion are not defined in this part of the notebook, and the
# nearest visible assignment of `model` is a Keras Model(...) while this loop
# calls model.state_dict() — confirm which model this cell is meant to train.
N_EPOCHS=100
bestmodelvalue=0
for epoch in range(N_EPOCHS):
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  # `>=` means a tie with the best accuracy so far also overwrites the checkpoint.
  if valid_acc >= bestmodelvalue:
    torch.save(model.state_dict(), "sarcasm_detect_model.pt")
    bestmodelvalue=valid_acc
  print(f'Epoch: {epoch+1:02} | Train Acc: {train_acc*100:.2f}% Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# Load the weights saved in sarcasm_detect_model.pt, switch the model to eval
# mode and report accuracy on the test iterator.
# NOTE(review): torch, evaluate, test_iterator and criterion are not defined in
# this part of the notebook.
model.load_state_dict(torch.load("sarcasm_detect_model.pt"))
model.eval()
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print("Test Accuracy: ",test_acc)