In [15]:
# ! pip uninstall tensorflow
# ! pip install swifter

In [5]:
import tensorflow as tf


In [23]:
# Importing libraries
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D,Dropout,LSTM,Bidirectional
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer,one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences


import pandas as pd
import numpy as np
import re,nltk,swifter
import matplotlib.pyplot as plt
import seaborn as sn
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [7]:
# Show the GPUs TensorFlow can see (an empty list means none is visible).
# The deprecated tf.test.is_gpu_available() call was dropped: its return value
# was never displayed, because only a cell's last expression is rendered.
tf.config.list_physical_devices('GPU')

In [8]:
# Read the sarcasm CSV, replace every missing value with an empty string and
# keep six columns; built as one chain so re-running the cell is idempotent.
df = (
    pd.read_csv(r'../input/sarcasm/train-balanced-sarcasm.csv')
    .fillna('')
    [['label','comment','author','score','created_utc','parent_comment']]
)
df.head()

In [9]:
# English stop words to remove, minus negations and a few position words that
# are deliberately kept as tokens.
stops = set(stopwords.words('english')) - {'no','not','nor','against','above','below','off','own'}

# Built once and reused: creating a TweetTokenizer for every comment is wasted work.
_tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

# Contraction expansions, applied in this order (specific forms before the
# generic n't / 't). Case-insensitive so "Won't" becomes "will not" rather
# than "Wo not".
_CONTRACTIONS = [
    (re.compile(pattern, re.IGNORECASE), expansion)
    for pattern, expansion in (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
]


def clean_text(comment):
    """Normalise one raw comment into a lower-case, space-separated token string.

    Steps, in order: strip URLs, ``<...>`` tags, ``@handles`` and digit runs;
    expand common English contractions; blank out the literal two-character
    sequences ``\\r``, ``\\"`` and ``\\n``; replace every remaining
    non-alphanumeric run with a space; tokenize and drop tokens in ``stops``.

    Args:
        comment: The raw comment; coerced with ``str()``.

    Returns:
        The cleaned text, or ``''`` when no token survives.
    """
    text = str(comment)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',text)
    text = re.sub("<.*?>", " ", text)
    # Handles are removed before digits; otherwise "@123abc" loses its digits
    # first, no longer matches the handle pattern, and "abc" leaks through.
    text = re.sub(r"@[A-Za-z0-9]+"," ",text)
    text = re.sub(r"[0-9]+"," ",text)
    # Treat the typographic apostrophe like the ASCII one so that "don’t" is
    # expanded as well.
    text = text.replace("\u2019", "'")
    for pattern, expansion in _CONTRACTIONS:
        text = pattern.sub(expansion, text)
    # These match a literal backslash followed by r, " or n (not control
    # characters), so the trailing letter does not survive as a token.
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+',' ', text)
    tokens = _tweet_tokenizer.tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stops)


# Apply to the column directly instead of building a row object per comment.
df["cleaned_comment"] = df["comment"].swifter.apply(clean_text)


In [10]:
# Hold out 20% of the rows for validation (seeded, so the split is repeatable)
# and pull the cleaned text column out of each side.
train_x, val_x, train_y, val_y = train_test_split(
    df.drop(columns='label'),
    df['label'],
    test_size=0.20,
    random_state=123,
)
train_txt, val_txt = train_x['cleaned_comment'], val_x['cleaned_comment']

# Tokenization

In [11]:
# Fit the word index on the training text only, then encode both splits.
tokenizer = Tokenizer(num_words=6000)
tokenizer.fit_on_texts(train_txt)
cnn_train = tokenizer.texts_to_sequences(train_txt)
cnn_val = tokenizer.texts_to_sequences(val_txt)
# word_index holds every word seen during fitting, but texts_to_sequences only
# emits indices below num_words. Size the embedding to the indices that can
# actually occur (index 0 is left for padding) instead of allocating rows for
# words that are never looked up.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
print(f"Vocab size:{vocab_size}")

# Pad sequences

In [12]:
# Bring every encoded comment to exactly `maxlen` ids, padding at the end.
maxlen = 100
Xcnn_train, Xcnn_val = (
    pad_sequences(encoded, padding='post', maxlen=maxlen)
    for encoded in (cnn_train, cnn_val)
)

# Build CNN model

In [13]:
# 1-D CNN text classifier: embedding -> one convolution -> global max pool ->
# two small dense layers -> sigmoid output for the binary label.
embedding_dim = 200
cnn_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlen),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
cnn_model.summary()

In [14]:
# Train for three epochs, reporting metrics on the held-out split after each.
cnn_model.fit(
    x=Xcnn_train,
    y=train_y,
    batch_size=10,
    epochs=3,
    validation_data=(Xcnn_val, val_y),
    verbose=True,
)

# Accuracy on the data the model was fitted on.
loss, accuracy = cnn_model.evaluate(x=Xcnn_train, y=train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy on the held-out split (the same data passed as validation_data above).
loss, accuracy = cnn_model.evaluate(x=Xcnn_val, y=val_y, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

# LSTM

In [47]:
# No stop words are removed for the LSTM input. (An earlier variant removed
# only stop words of three characters or fewer, keeping the negations.)
# An empty set rather than `{}`, which is a dict literal.
stops_1 = set()

# Built once and reused: creating a TweetTokenizer for every comment is wasted work.
_tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

# Contraction expansions, applied in this order (specific forms before the
# generic n't / 't). Case-insensitive so "Won't" becomes "will not" rather
# than "Wo not".
_CONTRACTIONS = [
    (re.compile(pattern, re.IGNORECASE), expansion)
    for pattern, expansion in (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
]


# NOTE: this shadows the `clean_text` defined earlier in the notebook; this
# variant filters tokens against `stops_1`.
def clean_text(comment):
    """Normalise one raw comment into a lower-case, space-separated token string.

    Steps, in order: strip URLs, ``<...>`` tags, ``@handles`` and digit runs;
    expand common English contractions; blank out the literal two-character
    sequences ``\\r``, ``\\"`` and ``\\n``; replace every remaining
    non-alphanumeric run with a space; tokenize and drop tokens in ``stops_1``.

    Args:
        comment: The raw comment; coerced with ``str()``.

    Returns:
        The cleaned text, or ``''`` when no token survives.
    """
    text = str(comment)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',text)
    text = re.sub("<.*?>", " ", text)
    # Handles are removed before digits; otherwise "@123abc" loses its digits
    # first, no longer matches the handle pattern, and "abc" leaks through.
    text = re.sub(r"@[A-Za-z0-9]+"," ",text)
    text = re.sub(r"[0-9]+"," ",text)
    # Treat the typographic apostrophe like the ASCII one so that "don’t" is
    # expanded as well.
    text = text.replace("\u2019", "'")
    for pattern, expansion in _CONTRACTIONS:
        text = pattern.sub(expansion, text)
    # These match a literal backslash followed by r, " or n (not control
    # characters), so the trailing letter does not survive as a token.
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+',' ', text)
    tokens = _tweet_tokenizer.tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stops_1)


# Apply to the column directly instead of building a row object per comment.
df["cleaned_comment_1"] = df["comment"].swifter.apply(clean_text)


In [48]:
# Cleaned comments as a plain list in row order. `.tolist()` replaces a
# per-row label lookup in a Python loop, which was slow and only correct while
# the frame's index is exactly 0..n-1.
corpus = df['cleaned_comment_1'].tolist()
voc_size = 5000

# Hash each word to an integer id below voc_size (distinct words may collide).
# NOTE(review): Keras' one_hot is documented as using Python's built-in hash,
# which is not stable across interpreter runs -- confirm before reusing a
# saved model in another session.
onehot_ = [one_hot(words, voc_size) for words in corpus]

max_sent_length = 80
embedding_vector_features = 80

# Left-pad every comment to max_sent_length ids.
embedded_docs = pad_sequences(onehot_, padding='pre', maxlen=max_sent_length)

X_final = np.array(embedded_docs)
y_final = np.array(df['label'])

# 80/20 split with a fixed seed.
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(
    X_final, y_final, test_size=0.20, random_state=123
)


In [49]:
# Bidirectional LSTM classifier: embedding -> BiLSTM -> dropout -> sigmoid.
# The BiLSTM does not return sequences, so its output is already one vector
# per sample; the former Flatten() layer was a no-op and has been removed.
lstm_model = Sequential([
    Embedding(voc_size, embedding_vector_features, input_length=max_sent_length),
    Bidirectional(LSTM(128)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

In [50]:
# Train for ten epochs, reporting metrics on the held-out split after each.
lstm_model.fit(
    x=X_train_lstm,
    y=y_train_lstm,
    batch_size=10,
    epochs=10,
    validation_data=(X_test_lstm, y_test_lstm),
)

In [52]:
# Accuracy on the data the model was fitted on.
loss, accuracy = lstm_model.evaluate(x=X_train_lstm, y=y_train_lstm, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

# Accuracy on the held-out split (the same data passed as validation_data during fit).
loss, accuracy = lstm_model.evaluate(x=X_test_lstm, y=y_test_lstm, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))