In [111]:
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd

In [112]:
from sklearn.model_selection import train_test_split

# Read the dataset
dataset = pd.read_json("../data_without_hashtags.json")

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(dataset['text'], dataset['isSarcastic'], test_size=0.2, random_state=42)

In [None]:
dataset = pd.read_json("../data_without_hashtags.json")

In [None]:
################################################################################
#Preprocessing section here

In [101]:
#undersampling here

# Count the number of instances in each class
class_counts = dataset['isSarcastic'].value_counts()

# Find the class with more items
majority_class = class_counts.idxmax()

# Find the class with fewer items
minority_class = class_counts.idxmin()

# Count the number of instances in the minority class
minority_class_count = class_counts[minority_class]

# Sample the majority class to match the number of instances in the minority class
majority_class_sampled = dataset[dataset['isSarcastic'] == majority_class].sample(n=minority_class_count, random_state=42)

# Concatenate the sampled majority class with the minority class
balanced_data = pd.concat([majority_class_sampled, dataset[dataset['isSarcastic'] == minority_class]])

# Shuffle the balanced dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = balanced_data
# Now, 'balanced_data' contains a balanced dataset where both classes have the same number of instances

sarcastic_counts = dataset['isSarcastic'].value_counts()

# Display the result
print("Number of rows for each value in the 'isSarcastic' column:")
print(sarcastic_counts)

Number of rows for each value in the 'isSarcastic' column:
isSarcastic
0    18488
1    18488
Name: count, dtype: int64


In [None]:
######################################
#remove @ with Person


In [97]:
import re

def remove_user_mentions(text):
    # Define the regex pattern to match @ mentions followed by numbers
    pattern = re.compile(r'@\d+')

    # Remove @ mentions using the pattern
    return pattern.sub('person', text)

# Load the DataFrame
#df = pd.read_json("data_without_hashtags.json")


# Apply remove_user_mentions function to the 'text' column
dataset['text'] = dataset['text'].apply(remove_user_mentions)

# Save the updated DataFrame if needed
dataset.to_json("updated_data_without_mentions.json", orient='records', lines=True)


In [None]:
#######################################################################
#replace abbreviations

In [98]:
import json
import re


# Example mapping of abbreviations to their full forms
abbreviation_mapping = {
    'OMG': 'oh my god',
    'DM': 'direct message',
    'BTW': 'by the way',
    'BRB': 'be right back',
    'RT': 'retweet',
    'FTW': 'for the win',
    'QOTD': 'quote of the day',
    'IDK': 'I do not know',
    'ICYMI': 'in case you missed it',
    'IRL': 'in real life',
    'IMHO': 'in my humble opinion',
    'IMO': 'I do not know',
    'LOL': 'laugh out loud',
    'LMAO': 'laugh my ass off',
    'NTS': 'note to self',
    'F2F': 'face to face',
    'B4': 'before',
    'DM': 'direct message',
    'CC': 'carbon copy',
    'SMH': 'shaking my head',
    'STFU': 'shut the fuck up',
    'BFN': 'by for now',
    'AFAIK': 'as far as I know',
    'TY': 'thank you',
    'YW': 'you are welcome',
    'THX': 'thanks',
    'TIL': 'today I learned',
    'AMA': 'ask me anything',
    'JK': 'just kidding',
    'NSFW': 'Not Safe for Work',
    'OOTD': 'outfit of the day',
    'TLDR': 'too long did not read',
    'TL;DR': 'too long; did not read',
    'GIF': 'graphics interchange format'
}

# Function to replace abbreviations
def replace_abbreviations(text):
    tokens = text.split()
    for i, token in enumerate(tokens):
        if token.upper() in abbreviation_mapping:
            tokens[i] = abbreviation_mapping[token.upper()]
    return ' '.join(tokens)

# Apply functions to remove hashtags and replace abbreviations to the entire 'text' column
dataset['text'] = dataset['text'].apply(lambda x: x.upper())  # Convert text to uppercase
dataset['text'] = dataset['text'].apply(replace_abbreviations)

# Restore original capitalization
original_capitalization = lambda x: ''.join([a if b.islower() else a.lower() for a, b in zip(x, dataset['text'][0])])
dataset['text'] = dataset['text'].apply(original_capitalization)

# Save the updated DataFrame to a JSON file
dataset.to_json('abbreviations_removed.json', orient='records', lines=True)

In [None]:
###########################################################
#hashtag removed

In [3]:
import re
import pandas as pd
import json

# Example DataFrame with 'text' column containing Twitter data
#dataset = pd.DataFrame({'text': ["This is a tweet with #hashtags", "Another tweet with #morehashtags", "Yet another tweet with #hashtags"]})

# Function to remove hashtags from a single text
def remove_hashtags(text):
    pattern = r'\#\w+'
    return re.sub(pattern, '', text)

# Apply the function to remove hashtags to the entire 'text' column
dataset['text'] = dataset['text'].apply(remove_hashtags)


data_dict = dataset.to_dict()

with open('all#Removed.json', 'w') as f:
    json.dump(data_dict, f, indent=4)

In [None]:
############################################################################
#replace emji and emoticon

In [99]:
import pandas as pd
import emoji
import re
import json

#df = pd.read_json("data_without_hashtags.json")
df = dataset

# Function to replace emojis with words
def replace_emojis(text):
    return emoji.demojize(text, delimiters=(" ", " "))  # Ensure emojis are separated by spaces


def replace_emoticons(text):
    emoticon_dict = {
    ':)': 'smile',
    ':(': 'frown',
    ':D': 'big smile',
    ':P': 'tongue out',
    ';)': 'wink',
    ':O': 'surprise',
    ':|': 'neutral',
    ':/': 'uncertain',
    ":'(": 'tears of sadness',
    ":'D": 'tears of joy',
    ':*': 'kiss',
    ':@': 'angry',
    ':x': 'mouth shut',
    ':3': 'cute',
    ':$': 'embarrassed',
    ":')": 'single tear',
    ':p': 'tongue out'
}


    # #Construct regex pattern using re.escape() to escape special characters
    # pattern = re.compile(r'(' + '|'.join(re.escape(emoticon) for emoticon in emoticon_dict.keys()) + ')', re.IGNORECASE)

    # # Replace emoticons using the pattern
    # return pattern.sub(lambda match: emoticon_dict.get(match.group().lower(), match.group()), text)

    # Convert emoticon keys to lowercase
    emoticon_dict_lower = {key.lower(): value for key, value in emoticon_dict.items()}

    # Construct regex pattern using re.escape() to escape special characters
    pattern = re.compile(r'(' + '|'.join(re.escape(emoticon) for emoticon in emoticon_dict_lower.keys()) + ')', re.IGNORECASE)

    # Replace emoticons using the pattern
    return pattern.sub(lambda match: emoticon_dict_lower.get(match.group().lower(), match.group()), text)




# Apply functions to replace emojis and emoticons and update DataFrame columns
df['text'] = df['text'].apply(replace_emojis)
df['text'] = df['text'].apply(replace_emoticons)

# Display the DataFrame
print(df)

data_dict = df.to_dict()

with open('removedEmoji.json', 'w') as f:
    json.dump(data_dict, f, indent=4)

                                                    text  isSarcastic
0      personyes i hope youre lurking rn. i want to l...            0
1      05 really taught me a valuable lesson i'm neve...            0
2      personberry never had a voice to protest, so y...            0
3      personhmyst4rs rest in peace & love to you and...            0
4      100 days until christmas!  evergreen_tree  #to...            0
...                                                  ...          ...
39775  @zendaya i could see the makeup artists giving...            1
39776  @ziggiwatkins11 slvr... that's great name #not...            1
39777  @zoso4986 @nero he is the fag we need but not ...            1
39778  zuma sounding like kanye west right now trying...            1
39779  @zzucru @uwdawgpack so true. students - stick ...            1

[39780 rows x 2 columns]


In [None]:
#####################################################################


In [67]:
%pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
   ---------------------------------------- 0.0/289.9 kB ? eta -:--:--
   ---- ----------------------------------- 30.7/289.9 kB 1.3 MB/s eta 0:00:01
   ---------------- ----------------------- 122.9/289.9 kB 1.8 MB/s eta 0:00:01
   ----------------------------------- ---- 256.0/289.9 kB 2.2 MB/s eta 0:00:01
   ----------------------------

In [100]:
import pandas as pd
import contractions

# Sample DataFrame
# dataset = pd.DataFrame({
#     'text': ["I didn't go to the party yesterday.",
#              "She can't believe what happened."]
# })

# Function to expand contractions
def expand_contractions(text):
    return contractions.fix(text)

# Apply the function to expand contractions
dataset['text'] = dataset['text'].apply(expand_contractions)

In [None]:
#################################################################################

In [113]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from keras_preprocessing.sequence import pad_sequences

# Read the dataset
# dataset = pd.read_json("data_without_hashtags.json")

# Example data
#X_train = ["This is a tweet!", "Another tweet here."]
#X_test = ["Yet another tweet!", "And one more tweet."]

# Define max_length
max_length = 140

# Initialize TweetTokenizer
tweetTokenizer = TweetTokenizer()

# Tokenize training text data
X_train_tokenized = [tweetTokenizer.tokenize(text) for text in X_train]

# Tokenize testing text data
X_test_tokenized = [tweetTokenizer.tokenize(text) for text in X_test]

# Create Tokenizer instance
tokenizer = Tokenizer()

# Fit tokenizer on training text data
tokenizer.fit_on_texts(X_train_tokenized)

# Convert text data to sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train_tokenized)
X_test_sequences = tokenizer.texts_to_sequences(X_test_tokenized)

# Pad sequences
X_train = pad_sequences(X_train_sequences, maxlen=max_length)
X_test = pad_sequences(X_test_sequences, maxlen=max_length)

# Display shapes of resulting matrices
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)


Shape of X_train: (31824, 140)
Shape of X_test: (7956, 140)


In [119]:
from keras.layers import CuDNNLSTM
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.optimizers import Adam

embedding_dim = 100

# Define the vocabulary size based on the actual number of unique words in the training data
vocab_size = len(tokenizer.word_index) + 1

max_length = 140

optimizer = Adam(learning_rate=0.000009)
m1 = Sequential()
m1.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
m1.add(CuDNNLSTM(units=150))
m1.add(Dense(units=64))
m1.add(Dense(units=64))
m1.add(Dense(units=1, activation='sigmoid'))

m1.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
m1.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 140, 100)          3342900   
                                                                 
 cu_dnnlstm_14 (CuDNNLSTM)   (None, 150)               151200    
                                                                 
 dense_38 (Dense)            (None, 64)                9664      
                                                                 
 dense_39 (Dense)            (None, 64)                4160      
                                                                 
 dense_40 (Dense)            (None, 1)                 65        
                                                                 
Total params: 3,507,989
Trainable params: 3,507,989
Non-trainable params: 0
_________________________________________________________________


In [115]:
# Train the model
m1.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = m1.evaluate(X_test, y_test)
print(f'Loss: {loss}, Accuracy: {accuracy * 100:.2f}%')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss: 1.236772894859314, Accuracy: 74.21%


In [116]:
from sklearn.metrics import precision_score, recall_score

# Predict on validation data
y_val_pred_prob_m1 = m1.predict(X_test)
y_val_pred_m1 = (y_val_pred_prob_m1 > 0.5).astype(int)  # Threshold for binary classification

# Assuming y_test is in binary format (0 or 1)
y_val_true_m1 = y_test

# Calculate precision and recall for binary classification
precision_m1 = precision_score(y_val_true_m1, y_val_pred_m1)
recall_m1 = recall_score(y_val_true_m1, y_val_pred_m1)

# print the results
print(f'Precision: {precision_m1:.4f}')
print(f'Recall: {recall_m1:.4f}')

Precision: 0.7080
Recall: 0.7541


In [118]:
from sklearn.metrics import f1_score

f1_m1 = f1_score(y_val_true_m1, y_val_pred_m1)
print(f'F1-score: {f1_m1:.4f}')


F1-score: 0.7304
