<a href="https://colab.research.google.com/github/VickkiMars/NLP_Mastery/blob/main/Sarcasm_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00

**Sarcasm** is a form of irony in which a person says or writes the opposite of what they mean, often with a mocking or contemptuous tone. Sarcasm is often used to express humor, criticism, or disapproval. For example, if someone says "Oh, great!" after dropping their ice cream, they are probably being sarcastic.

This project aims to detect sarcasm in given pieces of text. It aims at determining the best tokenization type for sarcasm detection.

Tokenization styles that will be used include: **BPE**, **WordPiece**, **SentencePiece**

In [166]:
from datasets import load_dataset

ds = load_dataset("nikesh66/Sarcasm-dataset", split='train')

In [167]:
ds

Dataset({
    features: ['Tweet', 'Sarcasm (yes/no)'],
    num_rows: 5000
})

# Text Preprocessing

In [168]:
ds = ds.train_test_split(test_size=0.2)

In [169]:
train, test = ds['train'], ds['test']

Finding Null Values

In [170]:
train['Sarcasm (yes/no)'][0] == 'no'

False

In [171]:
def find_null(data):
  null_found = False
  for i in range(len(data)):
    if bool(train[i]['Tweet']) == False:
      print(f"Data: {data}\nEntry with index: {i} is null")
      null_found = True
  if not null_found:
    print(f"NO NULL VALUES FOUND FOR {data.__str__()}\n")

find_null(train);find_null(test)

NO NULL VALUES FOUND FOR Dataset({
    features: ['Tweet', 'Sarcasm (yes/no)'],
    num_rows: 4000
})

NO NULL VALUES FOUND FOR Dataset({
    features: ['Tweet', 'Sarcasm (yes/no)'],
    num_rows: 1000
})



# Convert the datasets to Tensors

In [193]:
train_tweets, train_labels = train['Tweet'], train['Sarcasm (yes/no)']
test_tweets, test_labels = test['Tweet'], test['Sarcasm (yes/no)']

In [194]:
from tensorflow.keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")

# Fit the tokenizer on the texts
tokenizer.fit_on_texts(train_tweets)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(train_tweets)

# Word index (each word is mapped to an integer)
word_index = tokenizer.word_index

print("Word Index:", word_index)
print("Sequences:", sequences)


Word Index: {'<OOV>': 1, 'i': 2, 'the': 3, 'for': 4, 'are': 5, 'a': 6, 'great': 7, 'genuinely': 8, "can't": 9, 'wait': 10, 'more': 11, 'of': 12, 'love': 13, 'just': 14, 'better': 15, 'always': 16, 'had': 17, 'ever': 18, 'wonderful': 19, 'this': 20, 'am': 21, 'excited': 22, 'artists': 23, 'amazing': 24, 'concert': 25, 'doctors': 26, 'musicians': 27, 'engineers': 28, 'vegetarians': 29, 'meal': 30, 'when': 31, 'happens': 32, 'athletes': 33, 'because': 34, 'really': 35, 'make': 36, 'everything': 37, 'best': 38, 'experience': 39, 'not': 40, 'everyone': 41, 'loves': 42, 'right': 43, 'oh': 44, 'another': 45, 'what': 46, 'needed': 47, 'scientists': 48, 'nothing': 49, 'than': 50, 'moment': 51, 'time': 52, 'with': 53, 'game': 54, 'truly': 55, 'inspiring': 56, 'conference': 57, 'workshop': 58, 'movie': 59, 'find': 60, 'to': 61, 'be': 62, 'so': 63, 'said': 64, 'no': 65, 'one': 66, 'book': 67, 'my': 68, 'new': 69, 'phone': 70, 'writers': 71, 'have': 72, 'been': 73}
Sequences: [[41, 42, 3, 25, 43], 

In [195]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [196]:
train_sequences = tokenizer.texts_to_sequences(train_tweets)
train_padded = pad_sequences(train_sequences, maxlen=53, padding='post')

## Tokenize using SentencePiece Tokenization

In [197]:
from keras.layers import Embedding, LSTM, Dense, Dropout

In [198]:
@tf.keras.utils.register_keras_serializable()
class SarcasmDetectionModel(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, max_length, **kwargs): # Accept **kwargs
        super(SarcasmDetectionModel, self).__init__(**kwargs) # Pass **kwargs to super
        # Layers
        self.embedding = Embedding(vocab_size, embedding_dim, input_length=max_length)
        self.lstm = LSTM(64, return_sequences=False)
        self.dense1 = Dense(32, activation='relu')
        self.dropout = Dropout(0.2)
        self.output_layer = Dense(1, activation='sigmoid')
        self.max_length = max_length

    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.lstm(x)
        x = self.dense1(x)
        x = self.dropout(x)
        return self.output_layer(x)

    def get_config(self):  # Add get_config method
        config = super(SarcasmDetectionModel, self).get_config()
        config.update({
            'vocab_size': self.embedding.input_dim,
            'embedding_dim': self.embedding.output_dim,
            'max_length': self.max_length,
        })
        return config


    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [199]:
sarcasm_model = SarcasmDetectionModel(vocab_size=5000, embedding_dim=128, max_length=53)

In [200]:
import numpy as np

In [201]:
train_labels = [0 if label == 'no' else 1 for label in train_labels]
test_labels = [0 if label == 'no' else 1 for label in test_labels]

train_labels, test_labels = np.array(train_labels), np.array(test_labels)

In [202]:
sarcasm_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
sarcasm_model.fit(train_padded, train_labels, epochs=20)


Epoch 1/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.4994 - loss: 0.6940
Epoch 2/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4917 - loss: 0.6936
Epoch 3/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.4978 - loss: 0.6936
Epoch 4/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.5132 - loss: 0.6934
Epoch 5/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5106 - loss: 0.6935
Epoch 6/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5118 - loss: 0.6930
Epoch 7/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.4754 - loss: 0.6932
Epoch 8/20
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.5013 - loss: 0.6932
Epoch 9/20
[1m125/125[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7eef847840a0>

In [186]:
test_sequences = tokenizer.texts_to_sequences(test_tweets)
test_padded = pad_sequences(test_sequences, maxlen=53, padding='post')

# Use the model for prediction
prediction = sarcasm_model.predict(test_padded)

import numpy as np
from sklearn.metrics import accuracy_score


predicted_labels = (prediction > 0.5).astype(int)

accuracy = accuracy_score(test_labels, predicted_labels)

# Print accuracy
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Test Accuracy: 100.00%


In [187]:
pred1 = "Oh, great! Another meeting that could have been an email. My productivity is really going to skyrocket now."
pred = "i am a boy"
pred = tokenizer.texts_to_sequences([pred])
pred = pad_sequences(pred, maxlen=53, padding='post')
sarcasm_model.predict(pred)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step


array([[0.99999964]], dtype=float32)