<a href="https://colab.research.google.com/github/Vishal35198/Sarcasam-detection/blob/main/Sarcasm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [135]:
import pandas as pd
import numpy as np

In [136]:
!kaggle datasets download saurabhbagchi/sarcasm-detection-through-nlp

Dataset URL: https://www.kaggle.com/datasets/saurabhbagchi/sarcasm-detection-through-nlp
License(s): CC0-1.0
sarcasm-detection-through-nlp.zip: Skipping, found more recently modified local copy (use --force to force download)


In [137]:
import zipfile
zip_ref = zipfile.ZipFile('/content/sarcasm-detection-through-nlp.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [138]:
df = pd.read_json('/content/Sarcasm_Headlines_Dataset.json', lines=True)

In [139]:
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [140]:
df.dropna(inplace=True)

In [141]:
df.shape

(26709, 3)

In [142]:
df.drop('article_link', axis=1, inplace=True)

In [143]:
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [144]:
(df['is_sarcastic'].value_counts()/len(df))*100

is_sarcastic
0    56.104684
1    43.895316
Name: count, dtype: float64

# NLP WORK

Import the Dependencies

In [145]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re
import matplotlib.pyplot as plt

# Data Preparation

In [146]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [147]:
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    text = text.strip()
    tokens = word_tokenize(text.lower())
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [148]:
df['processed_text'] = df['headline'].apply(preprocess_text)
df.head()

Unnamed: 0,headline,is_sarcastic,processed_text
0,former versace store clerk sues over secret 'b...,0,"[former, versace, store, clerk, sues, secret, ..."
1,the 'roseanne' revival catches up to our thorn...,0,"[roseanne, revival, catches, thorny, political..."
2,mom starting to fear son's web series closest ...,1,"[mom, starting, fear, sons, web, series, close..."
3,"boehner just wants wife to listen, not come up...",1,"[boehner, wants, wife, listen, come, alternati..."
4,j.k. rowling wishes snape happy birthday in th...,0,"[jk, rowling, wishes, snape, happy, birthday, ..."


##Build a Vocabulary

In [149]:
all_words = [word for tokens in df['processed_text'] for word in tokens]
word_counts = Counter(all_words)
vocab = sorted(word_counts,key = word_counts.get , reverse = True)
voacb_to_int = {word: idx+1 for idx, word in enumerate(vocab)}

def convert_text_to_int(text):
  return [voacb_to_int[word] for word in text if word in voacb_to_int]

In [150]:
df['text_to_int'] = df['processed_text'].apply(convert_text_to_int)

In [151]:
df.head(2)

Unnamed: 0,headline,is_sarcastic,processed_text,text_to_int
0,former versace store clerk sues over secret 'b...,0,"[former, versace, store, clerk, sues, secret, ...","[204, 14544, 692, 3436, 2128, 267, 33, 2025, 2..."
1,the 'roseanne' revival catches up to our thorn...,0,"[roseanne, revival, catches, thorny, political...","[8165, 3167, 2570, 8166, 296, 2759, 156, 846]"


# Dataset and DataLoader

In [152]:
class SarcasmDataset(Dataset):
  def __init__(self, texts , labels):
    self.texts = texts
    self.labels = labels

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    # Convert list to LongTensor
    text = torch.tensor(self.texts[idx], dtype=torch.long)
    label = torch.tensor(self.labels[idx], dtype=torch.long)
    return text, label

def pad_sequences(sequences, max_length):
  return sequences + [0] * (max_length - len(sequences))

In [153]:
max_len = max(df['text_to_int'].apply(len))
max_len

27

In [154]:
# pad the sequences using the pad_Sequence function
df['text_to_int'] = df['text_to_int'].apply(lambda x: pad_sequences(x, max_len))

x_train,x_test , y_train , y_test = train_test_split(df['text_to_int'].tolist(), df['is_sarcastic'].tolist(), test_size=0.2, random_state=42)

train_dataset = SarcasmDataset(x_train, y_train)
test_dataset = SarcasmDataset(x_test, y_test)

In [155]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the Model

In [156]:
class RNN(nn.Module):
  def __init__(self, vocab_size , embedding_dim , hidden_dim, output_dim,n_layers,drop_prob = 0.5):
    super(RNN, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_dim,n_layers,batch_first= True , dropout=drop_prob)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(drop_prob)


  def forward(self, x):
    embedded = self.embedding(x)
    rnn_out, _ = self.rnn(embedded)
    output = self.fc(self.dropout(rnn_out[:,-1, :]))
    return output

# intialize the model
vocab_size = len(voacb_to_int) + 1
embedding_dim = 100
hidden_dim = 256
output_dim = 2
n_layers = 2
model = RNN(vocab_size, embedding_dim, hidden_dim, output_dim , n_layers)

In [157]:
for texts ,labels in train_loader:
  print(type(texts))
  print(type(labels))
  break

<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [158]:
df['is_sarcastic'].isin([0, 1]).all(), "Labels must be 0 or 1"

(True, 'Labels must be 0 or 1')

# Training the Model

In [159]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts, labels = texts.long(), labels.long()
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')


Epoch 1, Loss: 0.7093660200605849
Epoch 2, Loss: 0.6980711987275563
Epoch 3, Loss: 0.6999930094280643
Epoch 4, Loss: 0.7021761951510778
Epoch 5, Loss: 0.70031125120774
Epoch 6, Loss: 0.7012205267559268
Epoch 7, Loss: 0.6989493041695235
Epoch 8, Loss: 0.6998784116463747
Epoch 9, Loss: 0.6996780141385016
Epoch 10, Loss: 0.7000466304089494


In [160]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        # Convert lists to LongTensor
        texts, labels = torch.LongTensor(texts), torch.LongTensor(labels)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

print(f'Accuracy: {accuracy_score(all_labels, all_preds)}')
print(classification_report(all_labels, all_preds))


Accuracy: 0.5608386372145264
              precision    recall  f1-score   support

           0       0.56      1.00      0.72      2996
           1       0.00      0.00      0.00      2346

    accuracy                           0.56      5342
   macro avg       0.28      0.50      0.36      5342
weighted avg       0.31      0.56      0.40      5342



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [161]:
torch.save(model.state_dict(), 'sarcasam.pth')

In [179]:
sample = train_dataset[22]
sample

(tensor([18107, 18108,   467,   106,   563,  4970,  2044,  5063,   781,  1327,
           356,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]),
 tensor(0))

In [180]:
output = model(sample[0].unsqueeze(0))
_,preds = torch.max(output, 1)
print(preds.item())
print(preds.item() == sample[1].item())

0
True
