In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset; lines=True makes pandas parse the file as one JSON object per line.
data = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)

In [3]:
# Preview the first rows of the raw frame to see which columns were loaded.
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
# Keep only the text and the label: build a new frame without the article_link column.
df = data.drop(columns=['article_link'])

In [5]:
# Confirm the drop: only headline and is_sarcastic should remain.
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [6]:
# Dimensions of the working frame as (rows, columns).
df.shape

(26709, 2)

In [7]:
# describe() summarizes only the numeric column here (is_sarcastic); because the
# label is 0/1, its mean (~0.44 in the output) is the fraction of rows labelled 1.
df.describe()

Unnamed: 0,is_sarcastic
count,26709.0
mean,0.438953
std,0.496269
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [8]:
# Check each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      26709 non-null  object
 1   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 417.5+ KB


In [9]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

headline        0
is_sarcastic    0
dtype: int64

In [10]:
# Sanity check: df is a pandas DataFrame.
type(df)

pandas.core.frame.DataFrame

In [11]:
# Class balance: how many headlines carry each is_sarcastic label.
df['is_sarcastic'].value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [12]:
# List the column labels remaining in df.
df.columns

Index(['headline', 'is_sarcastic'], dtype='object')

In [13]:
# clean headline column using helper function
import re

def clean_headline(text):
  """Return ``text`` with every non-letter character replaced by a space.

  Args:
    text: Value to clean. It is converted with ``str()`` first, so non-string
      inputs (numbers, ``None``) are accepted.

  Returns:
    A string of the same length as ``str(text)`` that contains only ASCII
    letters and spaces. Replaced characters are not collapsed, so runs of
    punctuation or digits become runs of spaces.
  """
  # A single substitution is enough: digits are non-letters too, so they are
  # already replaced here. The former follow-up digit-stripping re.sub could
  # never match, and its non-raw pattern used invalid "\d" / "\s" escapes.
  return re.sub(r"[^a-zA-Z]", " ", str(text))

# Overwrite the headline column with its cleaned version, one value at a time.
df['headline'] = df['headline'].map(clean_headline)


In [14]:
# Re-inspect after cleaning: punctuation such as apostrophes, commas and periods
# now shows up as spaces in the headlines.
df.head()

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret b...,0
1,the roseanne revival catches up to our thorn...,0
2,mom starting to fear son s web series closest ...,1
3,boehner just wants wife to listen not come up...,1
4,j k rowling wishes snape happy birthday in th...,0


In [15]:
# Split the frame into the model input (headline text) and the target (is_sarcastic label).
X, y = df['headline'], df['is_sarcastic']

In [16]:
# tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

# num_words limits which word indices are emitted when texts are converted to
# sequences; words outside that limit are encoded as the oov_token placeholder
# rather than being dropped.
tokenizer = Tokenizer(num_words=10000, oov_token='xxxxxx')

In [17]:
# Build the tokenizer's vocabulary from the cleaned headlines.
tokenizer.fit_on_texts(X)

In [18]:
# word_index maps each word to its integer index. Its size (24899 in the output)
# exceeds the num_words=10000 given to the Tokenizer, so the mapping itself is not capped.
X_dict = tokenizer.word_index
print(len(X_dict))

24899


In [19]:
# Show only the first ten (word, index) pairs; the full mapping has 24,899
# entries and would flood the notebook output.
list(X_dict.items())[:10]



In [20]:
# text to sequences
# Encode each headline as the list of integer indices of its words, then show
# the first ten encoded headlines as a spot check.
X_seq = tokenizer.texts_to_sequences(X)
X_seq[:10]

[[304, 1, 643, 3115, 2291, 52, 362, 95, 2090, 7, 2584, 8131],
 [4, 8132, 3339, 2761, 25, 2, 161, 8133, 403, 2914, 7, 247, 10, 988],
 [124, 845, 2, 823, 225, 5, 2091, 582, 4622, 209, 90, 42, 50, 2, 1],
 [1335, 40, 218, 349, 2, 1687, 32, 308, 25, 11, 2915, 1388, 6763, 887],
 [744, 568, 3855, 916, 1, 558, 559, 6, 4, 98, 1258, 96],
 [1, 4, 68, 5, 63],
 [4, 6764, 350, 7, 450, 4222, 2180, 1336],
 [22, 429, 42, 1137, 36, 149, 2, 99, 86, 18, 151, 7, 37, 296],
 [244, 3598, 6765, 551, 5139, 1835, 140],
 [708, 5, 320, 345, 404, 12, 5, 1, 7, 4, 3856]]

In [21]:
# padding sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Make every sequence exactly 100 entries long: shorter ones get zeros appended
# at the end (padding='post'), as the output shows.
# NOTE(review): the truncation mode is left at the library default — confirm which
# end is cut if a headline ever exceeds maxlen.
padded_X_Seq = pad_sequences(X_seq, padding='post', maxlen=100)
padded_X_Seq[:3]

array([[ 304,    1,  643, 3115, 2291,   52,  362,   95, 2090,    7, 2584,
        8131,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0],
       [   4, 8132, 3339, 2761,   25,    2,  161, 8133,  403, 2914,    7,
         247,   10,  988,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    

In [22]:
# One row per headline, each padded to 100 columns.
padded_X_Seq.shape

(26709, 100)

In [23]:
# Check the container type of the padded input that is later passed to text_model.fit.
type(padded_X_Seq)

numpy.ndarray

In [24]:
# y is still a pandas Series at this point; it is converted to an ndarray in the next cell.
type(y)

pandas.core.series.Series

In [25]:
# convert Series into numpy ndarray
# Copy the labels into a NumPy array and flatten to one dimension in a single chained step.
y = np.array(y).flatten()

In [26]:
# One label per headline, held in a 1-D array.
y.shape

(26709,)

In [27]:
# Confirm the conversion: y is now a NumPy ndarray.
type(y)

numpy.ndarray

In [28]:
# deep learning model building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding

# Assemble the classifier one layer at a time.
text_model = Sequential()
# Each of the 100 token positions is looked up in a 10000-row table of 50-dim vectors
# (10000 matches the tokenizer's num_words).
text_model.add(Embedding(input_dim=10000, output_dim=50, input_length=100))
# Collapse the (100, 50) embedding output into a single 5000-long vector.
text_model.add(Flatten())
text_model.add(Dense(24, activation='relu'))
# A single sigmoid unit produces the sarcastic / not-sarcastic score.
text_model.add(Dense(1, activation='sigmoid'))

text_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           500000    
_________________________________________________________________
flatten (Flatten)            (None, 5000)              0         
_________________________________________________________________
dense (Dense)                (None, 24)                120024    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 620,049
Trainable params: 620,049
Non-trainable params: 0
_________________________________________________________________


In [29]:
# compile
# Binary cross-entropy pairs with the single sigmoid output unit for the 0/1
# is_sarcastic label; accuracy is the metric reported during training.
text_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [30]:
# train
# Hold out 20% of the samples for validation so every epoch also reports
# val_loss / val_accuracy. Without a held-out set only training accuracy is
# visible, and overfitting across the 32 epochs cannot be detected.
# Keras takes the validation slice from the end of the arrays, before shuffling.
text_model.fit(padded_X_Seq, y, epochs=32, batch_size=512, validation_split=0.2)

Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


<tensorflow.python.keras.callbacks.History at 0x7f2e4c43b490>