<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/Sarcasm%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Connecting to Kaggle

In [1]:
from google.colab import files

files.upload()


! mkdir ~/.kaggle


! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [2]:
# Download the sarcasm news-headlines dataset archive with the Kaggle CLI.
!kaggle datasets download -d rmisra/news-headlines-dataset-for-sarcasm-detection

Downloading news-headlines-dataset-for-sarcasm-detection.zip to /content
  0% 0.00/3.30M [00:00<?, ?B/s]
100% 3.30M/3.30M [00:00<00:00, 53.8MB/s]


In [3]:
!unzip /content/news-headlines-dataset-for-sarcasm-detection.zip

Archive:  /content/news-headlines-dataset-for-sarcasm-detection.zip
  inflating: Sarcasm_Headlines_Dataset.json  
  inflating: Sarcasm_Headlines_Dataset_v2.json  


### Importing Dependencies

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Load the v2 headlines file; lines = True reads one JSON record per line.
dataset = pd.read_json(
    "/content/Sarcasm_Headlines_Dataset_v2.json",
    lines = True,
)

In [6]:
# Peek at the first five records.
dataset.head(5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [7]:
# Column dtypes and non-null counts (checks for missing values).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [8]:
# Show the raw text of the first headline (row label 0).
dataset.loc[0, "headline"]

'thirtysomething scientists unveil doomsday clock of hair loss'

### Separating Features (X) and Labels (y)

In [9]:
# Model input: the headline text column.
X = dataset.loc[:, "headline"]

In [10]:
# Target: the is_sarcastic column (int64 per dataset.info()).
y = dataset.loc[:, "is_sarcastic"]

### Splitting Data into Train and Test

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

In [13]:
# Convert every split to a NumPy array before tokenization and training.
X_train, X_test, y_train, y_test = (
    np.array(split) for split in (X_train, X_test, y_train, y_test)
)

### Tokenization

In [14]:
# vocab_size    -> Tokenizer num_words and Embedding input dimension
# embedding_dim -> width of each learned token vector
# max_length    -> sequence length passed to pad_sequences
vocab_size, embedding_dim, max_length = 10_000, 16, 150

In [15]:
# Word-level tokenizer capped at vocab_size entries, with "<oov>" as the
# out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token = "<oov>",
    num_words = vocab_size,
)

In [16]:
# Build the vocabulary from the training headlines only, so the test split
# does not influence the word index.
tokenizer.fit_on_texts(X_train)

In [17]:
# Printing the whole word_index floods the notebook output; report its size
# and display only the first 20 (token, index) pairs instead.
print(f"vocabulary size: {len(tokenizer.word_index)}")
list(tokenizer.word_index.items())[:20]



In [18]:
# Encode each training headline as a list of integer token ids.
train_sequence = tokenizer.texts_to_sequences(X_train)

In [19]:
# Bring every training sequence to max_length, cutting over-long ones at the end.
train_padded = pad_sequences(
    sequences = train_sequence,
    maxlen = max_length,
    truncating = "post",
)

In [20]:
# Encode the test headlines with the vocabulary fitted on the training split.
test_sequence = tokenizer.texts_to_sequences(X_test)

In [21]:
# Truncate on the same side ("post") as train_padded; without the argument
# pad_sequences falls back to truncating = "pre", so over-long test headlines
# would be cut differently from the training data.
test_padded = pad_sequences(test_sequence, maxlen = max_length, truncating = "post")

In [22]:
# Sanity check: labels should be a NumPy array at this point.
type(y_train)

numpy.ndarray

### Defining a Basic Neural Network

In [23]:
# Baseline classifier, assembled layer by layer:
#   token embedding -> average over the sequence axis -> dense ReLU -> sigmoid
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(512, activation = "relu"))
model.add(tf.keras.layers.Dense(1, activation = "sigmoid"))

In [24]:
# Binary cross-entropy pairs with the single sigmoid output unit.
model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = ["accuracy"],
)

In [25]:
# Layer-by-layer output shapes and parameter counts of the baseline model.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 512)               8704      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 513       
Total params: 169,217
Trainable params: 169,217
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Train for 10 epochs; the held-out split is used as validation data each epoch.
model.fit(
    x = train_padded,
    y = y_train,
    epochs = 10,
    validation_data = (test_padded, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3f4e374790>

### Classifying New Text

In [27]:
# Hand-written examples used to spot-check the trained models.
sentences = [
    "Oh is it?",
    "Silence is golden. Duct tape is silver.",
]

In [28]:
# Encode the example sentences with the tokenizer fitted on the training data.
predict_sequences = tokenizer.texts_to_sequences(sentences)

In [29]:
# Pad/truncate the examples exactly like the training sequences.
padded_seq = pad_sequences(
    predict_sequences,
    maxlen = max_length,
    truncating = "post",
)

In [30]:
# One sigmoid score per sentence; scores nearer 1 correspond to the
# is_sarcastic == 1 label the model was trained on.
model.predict(padded_seq)

array([[0.0086363],
       [0.664812 ]], dtype=float32)

### Implementing a Bidirectional LSTM

In [31]:
# Recurrent classifier: embedding -> two stacked bidirectional LSTMs -> dense head.
model1 = tf.keras.models.Sequential()
model1.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length))
# return_sequences = True hands the full sequence to the second LSTM layer.
model1.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences = True)))
model1.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model1.add(tf.keras.layers.Dense(64, activation = "relu"))
model1.add(tf.keras.layers.Dense(1, activation = "sigmoid"))

In [32]:
# Layer-by-layer output shapes and parameter counts of the LSTM model.
model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 16)           160000    
_________________________________________________________________
bidirectional (Bidirectional (None, 150, 128)          41472     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 308,609
Trainable params: 308,609
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Same loss, optimizer and metric as the baseline, so the two models are comparable.
model1.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = ["accuracy"],
)

In [34]:
# Train the LSTM model with the same data and epoch budget as the baseline.
model1.fit(
    x = train_padded,
    y = y_train,
    epochs = 10,
    validation_data = (test_padded, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3f4df9e050>

In [36]:
# Score the same example sentences with the LSTM model.
model1.predict(padded_seq)

array([[0.00936779],
       [0.2414308 ]], dtype=float32)