<h1><b>Importing required Libraries

In [1]:
import numpy as np
import pandas as pd 

In [2]:
#Training Data
corpus_df = pd.read_csv('/content/Train_Data.csv')
corpus_df.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [3]:
#Training data shape
corpus_df.shape

(44262, 2)

<h2><b>Simple Text Pre-Processing

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [5]:
#parts of speech tagging
def pos_tag(doc):
  tagged_tokens=nltk.pos_tag(doc)
  return tagged_tokens

In [6]:
from nltk.corpus import wordnet
def pos_tag_wordnet(tagged_tokens):
  tag_map={'j':wordnet.ADJ,
           'v':wordnet.VERB,
           'r':wordnet.ADV}
  new_tagged_tokens = [(word,tag_map.get(tag[0].lower(),wordnet.NOUN))
                        for word,tag in tagged_tokens]

  return new_tagged_tokens

In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wnl=WordNetLemmatizer()

def normalize_document(doc):
  doc = re.sub(r'[^a-zA-Z\s]','',doc,re.I|re.A)
  doc = doc.lower()
  doc = doc.split()
  doc = [word for word in doc if word not in stopwords.words('english')] # remove all the stopwords
  tagged_tokens=pos_tag(doc)
  wordnet_tokens=pos_tag_wordnet(tagged_tokens)
  lemmatized_text=' '.join(wnl.lemmatize(word,tag) for word,tag in wordnet_tokens)
  return lemmatized_text

corpus_df['headline']=corpus_df['headline'].apply(normalize_document)
corpus_df.head(5)

Unnamed: 0,headline,is_sarcastic
0,supreme court vote legalize worldly vice,1
1,hungover man horrify learn make dozen plan las...,1
2,emilys list founder woman problem solver congress,0
3,send kid back school confidence,0
4,watch expert talk pesticide health,0


<h1><b>TF-IDF Model

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv=TfidfVectorizer(min_df=0. , max_df=1.,use_idf=True)


In [9]:
tv_matrix =tv.fit_transform(corpus_df['headline'])


In [10]:
tv_matrix=tv_matrix.toarray()

In [11]:
vocab=tv.get_feature_names()

In [12]:
df=pd.DataFrame(tv_matrix,columns=vocab)

In [13]:
y=corpus_df['is_sarcastic']
X=df

In [14]:
X

Unnamed: 0,aaa,aaron,aarp,aatish,ab,abandon,abandoned,abaya,abbas,abbey,abbi,abby,abc,abcs,abdeslam,abdomen,abdominable,abduct,abduction,abdul,abduljabbar,abdulrahman,abduls,abedin,abes,abhorrent,ability,abject,able,aboard,abolish,abominable,abomination,abort,abortion,abouncin,aboveground,abraham,abrams,abramson,...,zika,zimbabwe,zimmerman,zimmermans,zinc,zinfandel,zinger,zinke,zinn,zinnia,zion,zionism,zionist,zip,zipcode,zipline,ziplines,ziploc,zippori,zissu,ziyi,zod,zodiac,zoe,zoetrope,zogby,zohan,zombie,zone,zoo,zookeeper,zoolander,zoologist,zoom,zoroastrianism,zsa,zucker,zuckerberg,zuckerbergs,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44260,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
print(X.shape,y.shape)

(44262, 22866) (44262,)


In [16]:
np.all(np.isfinite(X))

True

In [17]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import text,sequence

tokenizer = Tokenizer(num_words=25000)
tokenizer.fit_on_texts(list(corpus_df['headline']))
seq = tokenizer.texts_to_sequences(corpus_df['headline'])
pad=sequence.pad_sequences(seq,maxlen=150)


In [18]:
from tensorflow.keras.layers import Dense,Activation, Embedding, LSTM, Bidirectional,Dropout
from tensorflow.keras import Input, Model
from tensorflow import keras
from keras.models import Sequential

model=Sequential()
model.add(Embedding(25000,128,input_length=150))
model.add(Bidirectional(LSTM(50)))
model.add(Dense(50,activation="relu"))
model.add(Dense(10,activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(1,activation="sigmoid"))
model.add(Dense(1))

In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 128)          3200000   
_________________________________________________________________
bidirectional (Bidirectional (None, 100)               71600     
_________________________________________________________________
dense (Dense)                (None, 50)                5050      
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
_________________________________________________________________
dropout (Dropout)            (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 2

In [20]:
y

0        1
1        1
2        0
3        0
4        0
        ..
44257    0
44258    0
44259    0
44260    0
44261    1
Name: is_sarcastic, Length: 44262, dtype: int64

In [21]:
model.compile(loss='binary_crossentropy',optimizer='rmsprop',metrics=['accuracy'])

In [26]:
model.fit(pad,y,batch_size=64,epochs=9,validation_split=0.2)

Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


<keras.callbacks.History at 0x7f61adab3b50>

In [27]:
model.evaluate(pad,y)



[0.34162282943725586, 0.9727757573127747]

In [28]:
test_data=pd.read_csv('/content/Test_Data.csv')
test_data

Unnamed: 0,headline
0,area stand-up comedian questions the deal with...
1,dozens of glowing exit signs mercilessly taunt...
2,perfect response to heckler somewhere in prop ...
3,gop prays for ossoff lossoff
4,trevor noah says the scary truth about trump's...
...,...
11061,house conservatives claim democrats have faile...
11062,area man having one of his little bursts of en...
11063,there is nothing libertarian about conservatives
11064,mike pompeo startled after seeing 'beware of h...


In [29]:
X_test = test_data['headline'].values
test_seq = tokenizer.texts_to_sequences(X_test)
test_pad = sequence.pad_sequences(test_seq, maxlen=150)

In [30]:
test_pad

array([[    0,     0,     0, ...,   204,   413, 10723],
       [    0,     0,     0, ...,  3281, 21953,   194],
       [    0,     0,     0, ..., 16244,  5752,  5737],
       ...,
       [    0,     0,     0, ...,   519,  4480,  9560],
       [    0,     0,     0, ...,   800,   903,  2022],
       [    0,     0,     0, ...,    79,   867,   853]], dtype=int32)

In [31]:
y_pred=model.predict(test_pad)

In [32]:
y_pred.shape

(11066, 1)

In [33]:
y_prediction=[]
for prediction in y_pred:
  if prediction>0.5:
    prediction=1
    y_prediction.append(prediction)
  else:
    prediction=0
    y_prediction.append(prediction)

In [34]:
submission =  pd.DataFrame({'prediction':y_prediction})
submission.head()

Unnamed: 0,prediction
0,1
1,1
2,1
3,0
4,0


In [35]:
submission.to_csv('submission.csv',index=False)