<a href="https://colab.research.google.com/github/RohanDeySarkar/Kaggle-challenges/blob/master/text_classification_rnn_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import csv
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is present locally before it is read.
nltk.download('stopwords')

# English stop words kept in a set so membership checks are cheap.
STOPWORDS = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Parallel lists: articles[i] is the stop-word-filtered text of the CSV row
# whose first column (the category) is stored in labels[i].
articles = []
labels = []

# newline='' is what the csv module asks for so that line breaks embedded in
# quoted fields are handed to the parser untouched.
with open("/content/bbc-text.csv", 'r', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)  # skip the first row (presumably the column header)
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; indexing it would raise.
            continue
        labels.append(row[0])
        # Filter word by word rather than chaining str.replace(' word ', ' '):
        # the replace approach cannot match a stop word at the very start or
        # end of the text, leaves the second of two adjacent identical stop
        # words behind (the shared space is consumed by the first match), and
        # rescans the whole article once per stop word. Splitting on a single
        # space keeps the original spacing of the surviving words intact.
        article = ' '.join(
            word for word in row[1].split(' ') if word not in STOPWORDS
        )
        articles.append(article)

In [10]:
# Sanity check: the two parallel lists should report the same length.
print(len(labels), len(articles), sep='\n')

2225
2225


In [11]:
# Display the distinct values collected in `labels`.
set(labels)

{'business', 'entertainment', 'politics', 'sport', 'tech'}

In [12]:
# Preprocessing settings shared by the cells below.
max_length = 200                     # every token sequence is padded/cut to this length
padding_type = trunc_type = 'post'   # pad and truncate at the end of a sequence
oov_tok = '<OOV>'                    # placeholder token handed to the Tokenizer
training_portion = 0.8               # fraction of the articles used for training

In [13]:
# Sequential split: the first `train_size` rows train the model and the
# remainder is held out for validation (file order is kept, nothing is shuffled).
train_size = int(len(articles) * training_portion)

train_articles, validation_articles = articles[:train_size], articles[train_size:]
train_labels, validation_labels = labels[:train_size], labels[train_size:]

In [14]:
# vocab_size:    passed as the Tokenizer's num_words and the Embedding input size
# embedding_dim: reused as the embedding width, the LSTM units and the hidden Dense units
# num_epochs:    number of training epochs passed to model.fit
vocab_size, embedding_dim, num_epochs = 5000, 64, 10

In [15]:
# The vocabulary is learned from the training split only, so the validation
# articles stay unseen during fitting.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(texts=train_articles)

In [16]:
# Word -> integer-id mapping produced by fitting the tokenizer on the training articles.
word_index = tokenizer.word_index

In [17]:
# word_index 

In [18]:
# Display one stop-word-filtered training article (index 5) for a visual check.
train_articles[5]

'howard hits back mongrel jibe michael howard said claim peter hain tory leader acting like  attack mongrel  shows labour  rattled  opposition.  upbeat speech party spring conference brighton  said labour campaigning tactics proved tories hitting home. mr hain made claim tory tactics anti-terror bill debate.  something tells someone  somewhere little bit rattled   mr howard said. mr hain  leader commons  told bbc radio four today programme mr howard stance government anti-terrorism legislation putting country risk. accused tory leader behaving like  attack mongrel   playing opposition opposition sake .  mr howard told party labour would  anything  say anything  claim anything cling office costs .  far year compared fagin  shylock flying pig. morning peter hain even called mongrel.  know  something tells someone  somewhere little bit rattled.  environment secretary margaret beckett rejected mr howard comment  telling radio 4 pm programme labour  rattled .  real duty try get people focus

In [19]:
# word_index['howard']

In [20]:
# Inverse of word_index: integer id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_article(text):
    """Turn a sequence of token ids back into a space-separated string.

    Each id is looked up in ``reverse_word_index``; an id that has no entry
    there is rendered as '?'.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)

In [21]:
# Encode every training article as a list of integer token ids.
train_sequences = tokenizer.texts_to_sequences(texts=train_articles)

# Print one encoded article (same index as the sample shown earlier).
print(train_sequences[5])

[176, 1197, 29, 1, 1, 271, 176, 2, 572, 1037, 1, 303, 264, 1943, 28, 715, 1, 573, 32, 1, 806, 3062, 757, 50, 2168, 553, 4729, 2, 32, 1906, 3270, 1377, 327, 3063, 49, 3, 1, 24, 572, 303, 3270, 225, 1075, 239, 768, 295, 2968, 1061, 3271, 282, 700, 1, 3, 176, 2, 3, 1, 264, 716, 23, 37, 188, 103, 432, 345, 3, 176, 3386, 16, 225, 1444, 1339, 1423, 81, 859, 674, 303, 264, 1, 28, 715, 1, 234, 806, 806, 1, 3, 176, 23, 50, 32, 4, 670, 70, 670, 572, 670, 1, 206, 434, 183, 5, 733, 1, 1, 2215, 1, 1445, 1037, 1, 95, 146, 1, 177, 295, 2968, 1061, 3271, 282, 700, 1, 1427, 300, 3530, 1, 1266, 3, 176, 1198, 2216, 188, 84, 4298, 345, 32, 1, 194, 1237, 322, 25, 7, 896, 271, 176, 143, 723, 484, 105, 436, 81, 6, 30, 1, 1603, 275, 147, 832, 2969, 3064, 3, 176, 2, 303, 1093, 1128, 897, 706, 1036, 4985, 1, 825, 68, 1319, 5, 48, 5, 191, 372, 2428, 23, 50, 553, 3, 176, 606, 234, 1199, 1969, 266, 3530, 1, 848, 1442, 4115, 660, 250, 4986, 2970, 50, 15, 417, 366, 142, 29, 32, 393, 92, 2429, 1, 486, 1, 486, 1, 3, 7

In [22]:
# Bring every training sequence to exactly max_length ids.
train_padded = pad_sequences(
    train_sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [23]:
# Length before vs. after padding/truncation for the sample article.
print(len(train_sequences[5]), len(train_padded[5]), sep='\n')

355
200


In [24]:
# Encode the held-out articles with the tokenizer fitted on the training split,
# then pad/truncate them with the same settings as the training data.
validation_sequences = tokenizer.texts_to_sequences(texts=validation_articles)
validation_padded = pad_sequences(
    validation_sequences,
    maxlen=max_length,
    truncating=trunc_type,
    padding=padding_type,
)

In [25]:
# A second Tokenizer turns the category names into integer class ids.
# NOTE(review): Tokenizer ids presumably start at 1 (0 unused), which would
# explain the 6-unit output layer for 5 categories - confirm.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)

# One array per split, built in the same order as before (training first).
training_label_seq, validation_label_seq = (
    np.array(label_tokenizer.texts_to_sequences(split_labels))
    for split_labels in (train_labels, validation_labels)
)

In [26]:
# Embedding -> bidirectional LSTM -> hidden Dense -> softmax classifier,
# assembled layer by layer in the same order as a list-style Sequential.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
# NOTE(review): 6 outputs for 5 categories - presumably because the label ids
# produced by label_tokenizer start at 1, leaving index 0 unused; confirm.
model.add(tf.keras.layers.Dense(6, activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 390       
Total params: 394,694
Trainable params: 394,694
Non-trainable params: 0
_________________________________________________________________


In [27]:
# The sparse loss takes integer class ids directly, so the labels need no
# one-hot encoding.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [28]:
# Train for num_epochs, evaluating on the held-out split after every epoch;
# verbose=2 prints one summary line per epoch.
history = model.fit(
    x=train_padded,
    y=training_label_seq,
    epochs=num_epochs,
    validation_data=(validation_padded, validation_label_seq),
    verbose=2,
)

Epoch 1/10
56/56 - 36s - loss: 1.5773 - accuracy: 0.3489 - val_loss: 1.2505 - val_accuracy: 0.4584
Epoch 2/10
56/56 - 1s - loss: 0.7962 - accuracy: 0.7500 - val_loss: 0.5116 - val_accuracy: 0.8629
Epoch 3/10
56/56 - 1s - loss: 0.2481 - accuracy: 0.9455 - val_loss: 0.2755 - val_accuracy: 0.9124
Epoch 4/10
56/56 - 1s - loss: 0.0640 - accuracy: 0.9860 - val_loss: 0.2339 - val_accuracy: 0.9303
Epoch 5/10
56/56 - 1s - loss: 0.0152 - accuracy: 0.9972 - val_loss: 0.2071 - val_accuracy: 0.9506
Epoch 6/10
56/56 - 1s - loss: 0.0045 - accuracy: 1.0000 - val_loss: 0.2499 - val_accuracy: 0.9438
Epoch 7/10
56/56 - 1s - loss: 0.0253 - accuracy: 0.9944 - val_loss: 0.2613 - val_accuracy: 0.9258
Epoch 8/10
56/56 - 1s - loss: 0.0179 - accuracy: 0.9978 - val_loss: 0.3316 - val_accuracy: 0.9079
Epoch 9/10
56/56 - 1s - loss: 0.0046 - accuracy: 0.9994 - val_loss: 0.2601 - val_accuracy: 0.9438
Epoch 10/10
56/56 - 1s - loss: 0.0014 - accuracy: 1.0000 - val_loss: 0.2740 - val_accuracy: 0.9416


In [31]:
# Classify one unseen piece of text with the trained model.
txt = ["A WeWork shareholder has taken the company to court over the near-$1.7bn (£1.3bn) leaving package approved for ousted co-founder Adam Neumann."]

# Mirror the training-time cleanup: the training articles had English stop
# words stripped before tokenisation, so do the same here (case-insensitively,
# because this raw text is not lower-cased).
cleaned = [
    ' '.join(word for word in text.split(' ') if word.lower() not in STOPWORDS)
    for text in txt
]
seq = tokenizer.texts_to_sequences(cleaned)

# Pad/truncate exactly as the training data was. pad_sequences defaults to
# 'pre' for both, which would place the tokens at the opposite end of the
# window from where the model saw them during training.
padded = pad_sequences(seq, maxlen=max_length, padding=padding_type, truncating=trunc_type)
pred = np.argmax(model.predict(padded))

# Derive the id -> name mapping from the fitted label tokenizer instead of a
# hand-ordered list. This keeps the global `labels` (the dataset labels)
# intact for re-runs of earlier cells, cannot drift out of sync with the ids
# used for training, and avoids `labels[pred - 1]` silently wrapping round to
# the last class when pred is 0.
index_to_label = {index: name for name, index in label_tokenizer.word_index.items()}
output = index_to_label.get(int(pred), '<unknown>')
print(output)

entertainment
