In [94]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd

In [95]:
news_df = pd.read_json("Datasets/Sarcasm_Headlines_Dataset.json", lines=True)
twitter_df = pd.read_json("Datasets/data_without_hashtags.json")

In [96]:
news_df.head()
column_name_to_remove = 'article_link'
news_df = news_df.drop(columns=[column_name_to_remove])

news_df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [97]:
twitter_df.head()

Unnamed: 0,text,isSarcastic
0,@0430yes i hope youre lurking rn. i want to li...,0
1,05 really taught me a valuable lesson I'm neve...,0
2,"@098BERRY Never had a voice to protest, so you...",0
3,@0hMySt4rs Rest in peace & love to you and you...,0
4,100 days until Christmas! 🌲 #too soon ready yet,0


In [98]:
news_df = news_df.rename(columns={"is_sarcastic": "isSarcastic"})
news_df = news_df.rename(columns={"headline": "text"})
news_df.head()

Unnamed: 0,text,isSarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [99]:
twitter_df = twitter_df.sample(frac=1).reset_index(drop=True)
twitter_df.head()

Unnamed: 0,text,isSarcastic
0,Nervous as hell for no good reason.,0
1,Manchester and Salford in top five of national...,0
2,sad how some people don't realize the good peo...,0
3,"Wow i just had the best dream ever, it neeeds ...",0
4,Yay school my favorite thing to do with my lif...,1


In [100]:
combined_df = pd.concat([news_df, twitter_df], ignore_index=True)
dataset = combined_df

In [71]:
#undersampling here

# Count the number of instances in each class
class_counts = dataset['isSarcastic'].value_counts()

# Find the class with more items
majority_class = class_counts.idxmax()

# Find the class with fewer items
minority_class = class_counts.idxmin()

# Count the number of instances in the minority class
minority_class_count = class_counts[minority_class]

# Sample the majority class to match the number of instances in the minority class
majority_class_sampled = dataset[dataset['isSarcastic'] == majority_class].sample(n=minority_class_count, random_state=42)

# Concatenate the sampled majority class with the minority class
balanced_data = pd.concat([majority_class_sampled, dataset[dataset['isSarcastic'] == minority_class]])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = balanced_data
# Now, 'balanced_data' contains a balanced dataset where both classes have the same number of instances

sarcastic_counts = dataset['isSarcastic'].value_counts()

# Display the result
print("Number of rows for each value in the 'isSarcastic' column:")
print(sarcastic_counts)

Number of rows for each value in the 'isSarcastic' column:
isSarcastic
0    30212
1    30212
Name: count, dtype: int64


In [101]:
import re

def remove_user_mentions(text):
    # Define the regex pattern to match @ mentions followed by numbers
    pattern = re.compile(r'@\d+')

    # Remove @ mentions using the pattern
    return pattern.sub('person', text)

# Load the DataFrame
#df = pd.read_json("data_without_hashtags.json")


# Apply remove_user_mentions function to the 'text' column
dataset['text'] = dataset['text'].apply(remove_user_mentions)

# Save the updated DataFrame if needed
dataset.to_json("updated_data_without_mentions.json", orient='records', lines=True)

In [102]:
import json
import re


# Example mapping of abbreviations to their full forms
abbreviation_mapping = {
    'OMG': 'oh my god',
    'DM': 'direct message',
    'BTW': 'by the way',
    'BRB': 'be right back',
    'RT': 'retweet',
    'FTW': 'for the win',
    'QOTD': 'quote of the day',
    'IDK': 'I do not know',
    'ICYMI': 'in case you missed it',
    'IRL': 'in real life',
    'IMHO': 'in my humble opinion',
    'IMO': 'I do not know',
    'LOL': 'laugh out loud',
    'LMAO': 'laugh my ass off',
    'NTS': 'note to self',
    'F2F': 'face to face',
    'B4': 'before',
    'DM': 'direct message',
    'CC': 'carbon copy',
    'SMH': 'shaking my head',
    'STFU': 'shut the fuck up',
    'BFN': 'by for now',
    'AFAIK': 'as far as I know',
    'TY': 'thank you',
    'YW': 'you are welcome',
    'THX': 'thanks',
    'TIL': 'today I learned',
    'AMA': 'ask me anything',
    'JK': 'just kidding',
    'NSFW': 'Not Safe for Work',
    'OOTD': 'outfit of the day',
    'TLDR': 'too long did not read',
    'TL;DR': 'too long; did not read',
    'GIF': 'graphics interchange format'
}

# Function to replace abbreviations
def replace_abbreviations(text):
    tokens = text.split()
    for i, token in enumerate(tokens):
        if token.upper() in abbreviation_mapping:
            tokens[i] = abbreviation_mapping[token.upper()]
    return ' '.join(tokens)

# Apply functions to remove hashtags and replace abbreviations to the entire 'text' column
dataset['text'] = dataset['text'].apply(lambda x: x.upper())  # Convert text to uppercase
dataset['text'] = dataset['text'].apply(replace_abbreviations)

# Restore original capitalization
original_capitalization = lambda x: ''.join([a if b.islower() else a.lower() for a, b in zip(x, dataset['text'][0])])
dataset['text'] = dataset['text'].apply(original_capitalization)

# Save the updated DataFrame to a JSON file
dataset.to_json('abbreviations_removed.json', orient='records', lines=True)

In [103]:
import pandas as pd
import emoji
import re
import json

#df = pd.read_json("data_without_hashtags.json")
df = dataset

# Function to replace emojis with words
def replace_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))  # Ensure emojis are separated by spaces


def replace_emoticons(text):
    emoticon_dict = {
    ':)': 'smile',
    ':(': 'frown',
    ':D': 'big smile',
    ':P': 'tongue out',
    ';)': 'wink',
    ':O': 'surprise',
    ':|': 'neutral',
    ':/': 'uncertain',
    ":'(": 'tears of sadness',
    ":'D": 'tears of joy',
    ':*': 'kiss',
    ':@': 'angry',
    ':x': 'mouth shut',
    ':3': 'cute',
    ':$': 'embarrassed',
    ":')": 'single tear',
    ':p': 'tongue out'
}


    # #Construct regex pattern using re.escape() to escape special characters
    # pattern = re.compile(r'(' + '|'.join(re.escape(emoticon) for emoticon in emoticon_dict.keys()) + ')', re.IGNORECASE)

    # # Replace emoticons using the pattern
    # return pattern.sub(lambda match: emoticon_dict.get(match.group().lower(), match.group()), text)

    # Convert emoticon keys to lowercase
    emoticon_dict_lower = {key.lower(): value for key, value in emoticon_dict.items()}

    # Construct regex pattern using re.escape() to escape special characters
    pattern = re.compile(r'(' + '|'.join(re.escape(emoticon) for emoticon in emoticon_dict_lower.keys()) + ')', re.IGNORECASE)

    # Replace emoticons using the pattern
    return pattern.sub(lambda match: emoticon_dict_lower.get(match.group().lower(), match.group()), text)




# Apply functions to replace emojis and emoticons and update DataFrame columns
df['text'] = df['text'].apply(replace_emojis)
df['text'] = df['text'].apply(replace_emoticons)

# Display the DataFrame
print(df)

data_dict = df.to_dict()

with open('removedEmoji.json', 'w') as f:
    json.dump(data_dict, f, indent=4)

                                                    text  isSarcastic
0      former versace store clerk sues over secret 'b...            0
1      the 'roseanne' revival catches up to our thorn...            0
2      mom starting to fear son's web series closest ...            1
3      boehner just wants wife to listen, not come up...            1
4      j.k. rowling wishes snape happy birthday in th...            0
...                                                  ...          ...
66484  @rioferdy5 personmag @paulpogba waste of money...            0
66485  technically i agree. they usually don't get ca...            1
66486  @theweeknd i miss you. come over and sing to m...            0
66487  but they hardly get proper controversial on th...            0
66488  wearing an outfit you like can make a day 10x ...            1

[66489 rows x 2 columns]


In [104]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['isSarcastic'], test_size=0.2, random_state=42)

In [92]:

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences


# Tokenize and vectorize the training text data using Tokenizer and pad_sequences
max_length = 140
tokenizer = Tokenizer()   #lower=False
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, maxlen=max_length)

# Tokenize and vectorize the testing text data using the same Tokenizer
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=max_length)

In [105]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from keras_preprocessing.sequence import pad_sequences

max_length = 140

# Initialize TweetTokenizer
tweetTokenizer = TweetTokenizer()

# Tokenize training text data
X_train_tokenized = [tweetTokenizer.tokenize(text) for text in X_train]

# Tokenize testing text data
X_test_tokenized = [tweetTokenizer.tokenize(text) for text in X_test]

# Create Tokenizer instance
tokenizer = Tokenizer()

# Fit tokenizer on training text data
tokenizer.fit_on_texts(X_train_tokenized)

# Convert text data to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train_tokenized)
X_test_sequences = tokenizer.texts_to_sequences(X_test_tokenized)

# Pad sequences
X_train = pad_sequences(X_train_sequences, maxlen=max_length)
X_test = pad_sequences(X_test_sequences, maxlen=max_length)

# Display shapes of resulting matrices
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (53191, 140)
Shape of X_test: (13298, 140)


In [106]:
from keras.layers import CuDNNLSTM
from keras.models import Sequential
from keras.layers import Embedding, Dense
from keras.optimizers import Adam

embedding_dim = 100

# Define the vocabulary size based on the actual number of unique words in the training data
vocab_size = len(tokenizer.word_index) + 1

max_length = 140
optimizer = Adam(learning_rate=0.000009)
m1 = Sequential()
m1.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
m1.add(CuDNNLSTM(units=150))
m1.add(Dense(units=64))
m1.add(Dense(units=64))
m1.add(Dense(units=1, activation='sigmoid'))

m1.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
m1.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 140, 100)          4527300   
                                                                 
 cu_dnnlstm_5 (CuDNNLSTM)    (None, 150)               151200    
                                                                 
 dense_15 (Dense)            (None, 64)                9664      
                                                                 
 dense_16 (Dense)            (None, 64)                4160      
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 4,692,389
Trainable params: 4,692,389
Non-trainable params: 0
_________________________________________________________________


In [107]:
# Train the model
m1.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = m1.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Loss: 0.5574001669883728, Accuracy: 76.39%


In [108]:
from sklearn.metrics import precision_score, recall_score

# Predict on validation data
y_val_pred_prob_m1 = m1.predict(X_test)
y_val_pred_m1 = (y_val_pred_prob_m1 > 0.5).astype(int)  # Threshold for binary classification

# Assuming y_test is in binary format (0 or 1)
y_val_true_m1 = y_test

# Calculate precision and recall for binary classification
precision_m1 = precision_score(y_val_true_m1, y_val_pred_m1)
recall_m1 = recall_score(y_val_true_m1, y_val_pred_m1)

# print the results
print(f'Precision: {precision_m1:.4f}')
print(f'Recall: {recall_m1:.4f}')

Precision: 0.7654
Recall: 0.7065
