# Introduction
This notebook practices TensorFlow text classification by detecting sarcasm in news headlines.

# Load data

In [1]:
from google.colab import files
# Ask Colab for a file upload; the result is keyed by uploaded file name.
uploaded = files.upload()

# Echo each uploaded file name together with the size of its content.
for fn, content in uploaded.items():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content)))

Saving Sarcasm_Headlines_Dataset.json.zip to Sarcasm_Headlines_Dataset.json.zip
User uploaded file "Sarcasm_Headlines_Dataset.json.zip" with length 1670920 bytes


In [2]:
!unzip /content/Sarcasm_Headlines_Dataset.json.zip

Archive:  /content/Sarcasm_Headlines_Dataset.json.zip
  inflating: Sarcasm_Headlines_Dataset.json  


# Data understanding

In [3]:
import json
import tensorflow as tf
import pandas as pd

In [4]:
def parse_data(file):
  """Lazily parse a JSON Lines file.

  Args:
    file: Path of a text file holding one JSON document per line.

  Yields:
    The decoded Python object for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle even if the consumer stops early
  # or a line fails to parse; the encoding is explicit so decoding does not
  # depend on the platform default.
  with open(file, 'r', encoding='utf-8') as handle:
    for line in handle:
      # Blank lines (e.g. a trailing newline at EOF) carry no record.
      if line.strip():
        yield json.loads(line)

# Materialise every parsed record so the dataset can be handed to pandas.
data = [record for record in parse_data('./Sarcasm_Headlines_Dataset.json')]

In [5]:
# One row per parsed record; info() then summarises columns, dtypes and nulls.
dfTrain = pd.DataFrame(data=data)
dfTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   article_link  26709 non-null  object
 1   headline      26709 non-null  object
 2   is_sarcastic  26709 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [7]:
# Collect the index of every row whose headline is empty or whitespace-only.
# The headline column is read by name: unpacking itertuples() positionally
# bound `text` to the is_sarcastic label, so the headline was never checked.
# strip() is used instead of isspace() because ''.isspace() is False and a
# truly empty headline would otherwise be missed.
blanks = [
  index
  for index, text in zip(dfTrain.index, dfTrain['headline'])
  if not str(text).strip()
]
print(f'Number of observations withut text: {len(blanks)}')

Number of observations withut text: 0


In [9]:
# Number of headlines per class label.
dfTrain.groupby('is_sarcastic')[['headline']].count().reset_index()

Unnamed: 0,is_sarcastic,headline
0,0,14985
1,1,11724


In [6]:
# Preview the first five rows.
dfTrain.head(n=5)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


Observations:
* There are no missing values.
* There are no empty headlines.
* There are fewer sarcastic headlines (11,724) than non-sarcastic ones (14,985).

## Transformation

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
# Hyperparameters shared by the split, tokenizer, padding and model cells.

seed = 1             # random_state for the train/test split
vocabSize = 10_000   # Tokenizer num_words and Embedding input dimension
embeddingDim = 32    # size of each learned word vector
maxLength = 100      # length every sequence is padded or truncated to
truncType = 'post'   # over-long sequences lose tokens from the end

In [20]:
# Hold out 20% of the headlines for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  dfTrain['headline'],
  dfTrain['is_sarcastic'],
  test_size=0.2,
  random_state=seed,
)

# Sanity check: features and labels must have matching lengths per split.
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (21367,)
X_test shape: (5342,)
y_train shape: (21367,)
y_test shape: (5342,)


### Training data

In [21]:
# Words outside the kept vocabulary are mapped to the '<OOV>' token.
tokenizer = Tokenizer(
  num_words=vocabSize,
  oov_token='<OOV>',
)

In [22]:
# Build the word index from the training headlines only, so the vocabulary
# is not derived from the held-out split.
tokenizer.fit_on_texts(texts=X_train)

In [23]:
# Encode the training headlines as integer sequences and bring them all to
# maxLength in a single step.
trainSentences = pad_sequences(
  tokenizer.texts_to_sequences(X_train),
  maxlen=maxLength,
  truncating=truncType,
)
trainSentences

array([[   0,    0,    0, ...,    3, 7146,  961],
       [   0,    0,    0, ...,   15,  443, 3982],
       [   0,    0,    0, ...,  371,    1,   22],
       ...,
       [   0,    0,    0, ..., 1575, 2859, 1043],
       [   0,    0,    0, ...,    1,  403,  107],
       [   0,    0,    0, ...,   21, 5931, 1036]], dtype=int32)

In [24]:
# Expect (number of training headlines, maxLength) after padding.
trainSentences.shape

(21367, 100)

### Testing data

In [25]:
# Encode the held-out headlines with the tokenizer fitted on the training
# split, then pad/truncate them to maxLength.
testSentences = pad_sequences(
  tokenizer.texts_to_sequences(X_test),
  maxlen=maxLength,
  truncating=truncType,
)
testSentences

array([[   0,    0,    0, ...,   67,   81,  544],
       [   0,    0,    0, ...,   33, 5482,  701],
       [   0,    0,    0, ..., 1538,   19,  410],
       ...,
       [   0,    0,    0, ...,  263, 3814, 1032],
       [   0,    0,    0, ...,    5,    4,   81],
       [   0,    0,    0, ...,    6,    7,  493]], dtype=int32)

In [26]:
# Expect (number of test headlines, maxLength) after padding.
testSentences.shape

(5342, 100)

# Modeling

## Model using words

In [27]:
# Bag-of-embeddings classifier: embed each token, average over the sequence,
# then a small dense head ending in a sigmoid for the binary label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocabSize, embeddingDim, input_length=maxLength))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(6, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [28]:
# `learning_rate` is the supported keyword for the optimizer; the old `lr`
# alias is deprecated and rejected by newer Keras releases.
# from_logits=False matches the sigmoid output of the final Dense layer.
model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
  metrics=['accuracy'],
)

In [29]:
# Stop as soon as val_loss stops improving. restore_best_weights=True rolls
# the model back to the best epoch; without it the model keeps the weights of
# the final (worse) epoch that triggered the stop.
# NOTE(review): the test split doubles as the validation set here, so the
# reported validation metrics are not an unbiased estimate.
earlyStopping = tf.keras.callbacks.EarlyStopping(
  monitor='val_loss',
  restore_best_weights=True,
)
history = model.fit(
  trainSentences,
  y_train,
  validation_data=(testSentences, y_test),
  epochs=50,
  callbacks=[earlyStopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


In [30]:
# Shape of the model's first weight tensor, which belongs to the Embedding
# layer: (vocabSize, embeddingDim).
model.weights[0].shape

TensorShape([10000, 32])

# References

https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection?select=Sarcasm_Headlines_Dataset.json