# Lab 4 - Recurrent neural network

In [1]:
import re
import random
import numpy as np
import pandas as pd

from tensorflow import keras

from nltk.corpus import stopwords
from nltk import download

from sklearn.feature_extraction.text import TfidfVectorizer

## Data import - Clickbait title

In [2]:
df = pd.read_csv("clickbait_ds.csv", delimiter=';')
df.head()

Unnamed: 0,Text,Clickbait
0,Snoopy Got A Star On The Walk Of Fame And Seem...,1
1,15 Tweets That Are Too Real For Anyone Who Has...,1
2,"Try To Stay Calm '90s Kids, But The ""Full Hous...",1
3,P1 Pico Projector Stands Out From a Small Crowd,0
4,NBA: Gilbert Arenas and Javaris Crittenton sus...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Text        32000 non-null  object
 1    Clickbait  32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 500.1+ KB


In [4]:
def delete_stopwords(str_x):
    words = str_x.split(' ')
    neutral_words = ['one', 'say', 'page', 'know', 'go', 'back', 'take', 'see', 'look', 'article',
                     'edit', 'got', 'thing', 'want', 'make', 'people']
    new_words = list()
    for word in words:
        if word not in stopwords.words('english') or word not in neutral_words:
            new_words.append(word)
    return ' '.join(new_words)

## Preprocessing

In [5]:
download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ocean\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

df["Text"] = df["Text"].map(lambda x: re.sub(r'[^\w]', ' ', x))
df["Text"] = df["Text"].map(lambda x: x.lower())
df["Text"] = df["Text"].map(delete_stopwords)

## Model initialising

In [6]:
vectorizer = TfidfVectorizer()

In [7]:
X = vectorizer.fit_transform(df["Text"].values).toarray()
Y = df.iloc[:, 1].values

num_samples = 500 #X.shape[0]
train_samples = int(0.75*num_samples)

indexes = np.arange(num_samples)
random.shuffle(indexes)

In [8]:
X, Y = X[indexes], Y[indexes]
x_train, y_train = X[:train_samples, :], Y[:train_samples]
x_test, y_test = X[train_samples:, :], Y[train_samples:]

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(375, 22761) (375,)
(125, 22761) (125,)


## Model params

In [9]:
inputs = keras.Input(shape=(22761, 1))

x = keras.layers.LSTM(128)(inputs)

outputs = keras.layers.Dense(1, activation="softmax")(x)

In [10]:
rnn_model = keras.Model(inputs=inputs, outputs=outputs, name="rnn_model")
rnn_model.summary()

Model: "rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 22761, 1)]        0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               66560     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 66,689
Trainable params: 66,689
Non-trainable params: 0
_________________________________________________________________


In [11]:
rnn_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)

### Training

In [12]:
rnn_model.fit(x_train, y_train, batch_size=4, epochs=1, validation_split=0.2)



<tensorflow.python.keras.callbacks.History at 0x1ddb5245040>

### Eval

In [13]:
test_scores = rnn_model.evaluate(x_test, y_test, verbose=2)

print(f"Test loss: {test_scores[0]}  |  Test accuracy:", test_scores[1])
print(")

4/4 - 10s - loss: 0.0000e+00 - accuracy: 0.4240
Test loss: 0.0
Test accuracy: 0.42399999499320984
