In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive/ (forcing a fresh mount) so the project files are reachable.
drive.mount('/content/drive/', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
import os
import pandas as pd
import numpy as np
from zipfile import ZipFile

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Flatten, Bidirectional, GlobalMaxPool1D, TimeDistributed
from tensorflow.keras.models import Model, Sequential

In [3]:
# Show the kernel's current working directory before switching to the data folder.
os.getcwd()

'/content'

In [0]:
# Switch to the project data folder on the mounted Drive.
# The absolute path keeps this cell re-runnable: a relative path only resolves
# while the kernel is still in /content and fails once we are inside the folder.
os.chdir('/content/drive/My Drive/NLP_Academic Project/Data')

In [5]:
# Confirm that the working directory is now the data folder.
os.getcwd()

'/content/drive/My Drive/NLP_Academic Project/Data'

In [6]:
# List the contents of the current (data) folder via the shell.
!ls

glove.6B.200d.txt  glove.6B.300d.txt  Sarcasm_Headlines_Dataset.json


In [7]:
# Load the line-delimited JSON dataset (one record per line) and preview the first ten rows.
df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
df.head(n=10)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
5,0,my white inheritance,https://www.huffingtonpost.com/entry/my-white-...
6,0,5 ways to file your taxes with less stress,https://www.huffingtonpost.com/entry/5-ways-to...
7,1,richard branson's global-warming donation near...,https://www.theonion.com/richard-bransons-glob...
8,1,shadow government getting too large to meet in...,https://politics.theonion.com/shadow-governmen...
9,0,lots of parents know this scenario,https://www.huffingtonpost.comhttp://pubx.co/6...


In [8]:
# The article URL is not needed for binary sarcasm classification, so it is dropped.
# Rebinding `df` with errors='ignore' (instead of an inplace drop) keeps this cell
# idempotent: running it a second time no longer raises KeyError for the missing column.
df = df.drop(columns='article_link', errors='ignore')
df.head(10)

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
5,0,my white inheritance
6,0,5 ways to file your taxes with less stress
7,1,richard branson's global-warming donation near...
8,1,shadow government getting too large to meet in...
9,0,lots of parents know this scenario


In [0]:
# Full text of the first headline (the table preview truncates long strings).
df.loc[0, 'headline']

'thirtysomething scientists unveil doomsday clock of hair loss'

In [0]:
df['is_sarcastic'].value_counts()

# The classes are reasonably balanced, though not exactly equal: 14985 headlines are labelled 0 and 13634 are labelled 1 (roughly 52% / 48%).

0    14985
1    13634
Name: is_sarcastic, dtype: int64

In [0]:
# (rows, columns) of the frame after dropping the link column.
df.shape

(28619, 2)

In [9]:
# Length of every headline in characters (len() on a string counts characters, not words).
seq_length_headline = list(map(len, df['headline']))
max_seq_headline = max(seq_length_headline)
max_seq_headline

# The longest headline has 926 characters. Headlines are later padded with zeros so that every encoded sequence has the same length.

926

In [10]:
# Count headlines per character-length bucket (50-character steps, open-ended above 200).
Zero_Fifty = Fifty_Hundred = Hundered_OneHundredFifty = 0
Hundered_TwoHundred = TwoHundred_Above = 0

for n_chars in seq_length_headline:
  if 0 < n_chars <= 50:
    Zero_Fifty += 1
  elif 50 < n_chars <= 100:
    Fifty_Hundred += 1
  elif 100 < n_chars <= 150:
    Hundered_OneHundredFifty += 1
  elif 150 < n_chars <= 200:
    # NOTE: despite its name and printed label, this bucket covers 151-200 characters.
    Hundered_TwoHundred += 1
  elif n_chars > 200:
    TwoHundred_Above += 1

# One summary line per bucket.
print(f"Number of headlines with 'Zero-Fifty' characters are {Zero_Fifty}")
print(f"Number of headlines with 'Fifty-Hundred' characters are {Fifty_Hundred}")
print(f"Number of headlines with 'Hundered-OneHundredFifty' characters are {Hundered_OneHundredFifty}")
print(f"Number of headlines with 'Hundered-TwoHundred' characters are {Hundered_TwoHundred}")
print(f"Number of headlines with 'TwoHundredAbove' characters are {TwoHundred_Above}")

Number of headlines with 'Zero-Fifty' characters are 7990
Number of headlines with 'Fifty-Hundred' characters are 19622
Number of headlines with 'Hundered-OneHundredFifty' characters are 993
Number of headlines with 'Hundered-TwoHundred' characters are 8
Number of headlines with 'TwoHundredAbove' characters are 6


In [0]:
# Vocabulary cap; passed to the Tokenizer as num_words.
max_features = 10000
# Only 14 headlines are longer than 150 characters, so maxlen is set to 150.
# NOTE(review): maxlen is later applied by pad_sequences to tokenized (word-index) sequences,
# whereas the 150 threshold comes from character counts — confirm this length is intended.
maxlen = 150
# Embedding dimensionality; matches the glove.6B.200d vectors loaded later.
embedding_size = 200

In [0]:
# Extract the GloVe archive only when it is actually needed and available.
# The guard makes the cell safe under "Run All": it is a no-op when the 200d
# vectors are already on disk or when the zip archive is not present.
if not os.path.exists('glove.6B.200d.txt') and os.path.exists('glove.6B.zip'):
    with ZipFile('glove.6B.zip') as gloveEmbeddingFile:
        gloveEmbeddingFile.extractall()

In [0]:
# Tokenizer configured to lower-case text, with num_words capped at max_features.
tokenizer = Tokenizer(num_words=max_features,lower=True)

In [0]:
# Build the tokenizer's word index from all headlines.
tokenizer.fit_on_texts(df['headline'])

In [14]:
# Keep handles on the fitted tokenizer's lookup tables for the later cells.
word_to_index_mapping_dict = tokenizer.word_index
index_to_word_mapping_dict = tokenizer.index_word

# Printing the complete dictionaries floods the notebook with tens of thousands
# of entries, so only their size and a small sample are shown.
print("Distinct words:", len(word_to_index_mapping_dict))
print("Sample word -> index entries:", list(word_to_index_mapping_dict.items())[:10])
print("Sample index -> word entries:", list(index_to_word_mapping_dict.items())[:10])
print(tokenizer.document_count)


28619


In [15]:
# Encode every headline as a list of word indices, then pad each list to `maxlen` entries.
sequences = tokenizer.texts_to_sequences(df['headline'])
X = pad_sequences(sequences, maxlen=maxlen)
# Labels as a NumPy array aligned with the rows of X.
y = df['is_sarcastic'].to_numpy()

# Sanity check: sample/label counts and the first encoded example.
print("Number of Samples:", len(X))
print(X[0])
print("Number of Labels: ", len(y))
print(y[0])

Number of Samples: 28619
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  354 3166 7473 2643    2  660 1118]
Number of Labels:  28619
1


In [0]:
from sklearn.utils import shuffle

In [0]:
# Hold out 20% of the samples for evaluation; fixed seeds keep the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=4
)
# Shuffle each partition again, keeping features and labels aligned.
X_train, y_train = shuffle(X_train, y_train, random_state=2)
X_test, y_test = shuffle(X_test, y_test, random_state=2)

In [18]:
# +1 leaves room for index 0, the value used for padding in the padded sequences.
# NOTE(review): this counts every distinct word in the corpus, although the Tokenizer
# was created with num_words=max_features (10000) — confirm the larger embedding table is intended.
vocab_size = len(word_to_index_mapping_dict)+1 # Vocab size
print(vocab_size)

30885


In [0]:
EMBEDDING_FILE = './glove.6B.200d.txt'

# Parse the GloVe text file into {word: float32 vector}.
# Each line holds a word followed by its vector components, separated by single spaces.
embeddings = {}
# The context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding avoids decode errors where the platform default differs.
with open(EMBEDDING_FILE, encoding='utf-8') as glove_file:
    for line in glove_file:
        fields = line.rstrip().split(" ")  # split each line only once
        if len(fields) < 2:
            continue  # blank or malformed line: no vector to store
        embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [0]:
# One row per vocabulary index and one column per embedding dimension, initialised to zeros.
embedding_matrix = np.zeros((vocab_size,embedding_size))

In [0]:
# Copy the GloVe vector of each known word into the matrix row given by its tokenizer index.
# Words that GloVe does not contain are skipped, so their rows are left unchanged.
for token, row_index in word_to_index_mapping_dict.items():
    glove_vector = embeddings.get(token)
    if glove_vector is None:
        continue
    embedding_matrix[row_index] = glove_vector

In [22]:
# main model
# Functional-API graph: GloVe-initialised embedding -> bidirectional LSTM ->
# per-timestep dense layer -> flatten -> dense -> single sigmoid output.
# Intermediate tensors use `x` and the input tensor is `inputs`, so the builtin
# `input` is not shadowed and `model` only ever refers to the final Model object.
inputs = Input(shape=(maxlen,))
x = Embedding(vocab_size, embedding_size, weights=[embedding_matrix], input_length=maxlen)(inputs)
x = Bidirectional(LSTM(100, return_sequences=True, dropout=0.50), merge_mode='concat')(x)
x = TimeDistributed(Dense(100, activation='relu'))(x)
x = Flatten()(x)
x = Dense(100, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)

model = Model(inputs, output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would emit an extra "None" line.
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 200)          6177000   
_________________________________________________________________
bidirectional (Bidirectional (None, 150, 200)          240800    
_________________________________________________________________
time_distributed (TimeDistri (None, 150, 100)          20100     
_________________________________________________________________
flatten (Flatten)            (None, 15000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               1500100   
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101   

In [0]:
# model.fit(X_train,y_train,validation_data=(X_test, y_test), epochs = 10, verbose = 2)

# In the 10-epoch run logged below, the validation loss reaches its minimum at epoch 4 (0.2923) and rises afterwards, so the final model is trained for 4 epochs.

Epoch 1/10
179/179 - 218s - loss: 0.4832 - accuracy: 0.7593 - val_loss: 0.3601 - val_accuracy: 0.8372
Epoch 2/10
179/179 - 221s - loss: 0.3216 - accuracy: 0.8581 - val_loss: 0.3038 - val_accuracy: 0.8657
Epoch 3/10
179/179 - 218s - loss: 0.2542 - accuracy: 0.8935 - val_loss: 0.2933 - val_accuracy: 0.8747
Epoch 4/10
179/179 - 215s - loss: 0.2033 - accuracy: 0.9184 - val_loss: 0.2923 - val_accuracy: 0.8798
Epoch 5/10
179/179 - 218s - loss: 0.1674 - accuracy: 0.9333 - val_loss: 0.2970 - val_accuracy: 0.8809
Epoch 6/10
179/179 - 214s - loss: 0.1395 - accuracy: 0.9444 - val_loss: 0.3178 - val_accuracy: 0.8810
Epoch 7/10
179/179 - 215s - loss: 0.1126 - accuracy: 0.9560 - val_loss: 0.3427 - val_accuracy: 0.8791
Epoch 8/10
179/179 - 217s - loss: 0.0902 - accuracy: 0.9644 - val_loss: 0.3863 - val_accuracy: 0.8789
Epoch 9/10
179/179 - 214s - loss: 0.0783 - accuracy: 0.9704 - val_loss: 0.4405 - val_accuracy: 0.8716
Epoch 10/10
179/179 - 218s - loss: 0.0634 - accuracy: 0.9758 - val_loss: 0.4728 - 

<tensorflow.python.keras.callbacks.History at 0x7f522e03e278>

In [23]:
# Train for 4 epochs and log validation metrics after each epoch.
# NOTE(review): the split passed as validation data is the same X_test/y_test used for the
# final report, and it also informed the choice of 4 epochs — consider a separate validation split.
model.fit(
    X_train,
    y_train,
    epochs=4,
    validation_data=(X_test, y_test),
    verbose=2,
)

Epoch 1/4
716/716 - 323s - loss: 0.4289 - accuracy: 0.7957 - val_loss: 0.3182 - val_accuracy: 0.8627
Epoch 2/4
716/716 - 322s - loss: 0.2762 - accuracy: 0.8833 - val_loss: 0.2778 - val_accuracy: 0.8826
Epoch 3/4
716/716 - 321s - loss: 0.2031 - accuracy: 0.9161 - val_loss: 0.2818 - val_accuracy: 0.8835
Epoch 4/4
716/716 - 316s - loss: 0.1529 - accuracy: 0.9388 - val_loss: 0.3154 - val_accuracy: 0.8840


<tensorflow.python.keras.callbacks.History at 0x7f150e4928d0>

In [24]:
from sklearn.metrics import classification_report,confusion_matrix
# Model outputs for the test split: one sigmoid value per sample.
Y_pred = model.predict(X_test)
print(Y_pred)

[[0.9604064 ]
 [0.0060088 ]
 [0.11559191]
 ...
 [0.7436794 ]
 [0.9970542 ]
 [0.93283486]]


In [0]:
# Binarise the sigmoid outputs: values above 0.5 become 1, everything else (including exactly 0.5) becomes 0.
y_pred = [1 if row[0] > 0.5 else 0 for row in Y_pred]

In [0]:
# Pair every true label with its predicted label as (actual, predicted) tuples.
temp_list = list(zip(y_test, y_pred))

In [27]:
# Side-by-side table of true and predicted labels for a quick visual check.
columns = ['y_actual', 'y_pred']
Predicted_df = pd.DataFrame(data=temp_list, columns=columns)
Predicted_df.head()

Unnamed: 0,y_actual,y_pred
0,1,1
1,0,0
2,0,0
3,1,1
4,1,0


In [28]:
# Turn the predicted labels into a NumPy array.
y_pred = np.array(y_pred)
y_pred

array([1, 0, 0, ..., 1, 1, 1])

In [29]:
# True labels of the test split, shown for comparison with y_pred.
y_test

array([1, 0, 0, ..., 1, 1, 1])

In [30]:
# Per-class precision, recall and F1 on the test split.
report = classification_report(y_test, y_pred)
print('  Classification Report:\n', report, '\n')

  Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.91      0.89      3023
           1       0.89      0.86      0.87      2701

    accuracy                           0.88      5724
   macro avg       0.88      0.88      0.88      5724
weighted avg       0.88      0.88      0.88      5724
 



In [0]:
# The model shows no strong bias toward either class: precision, recall and F1-score are similar for sarcastic and non-sarcastic headlines (all between 0.86 and 0.91), and the weighted averages are 0.88.