In [117]:
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from NCModel import NCModel
import re
import pandas as pd
import matplotlib.pyplot as plt

In [118]:
# English stopwords, taken from the YoastSEO.js project:
# https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# All entries are lowercase; contractions keep their apostrophe.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and",
    "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here",
    "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
    "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves",
    "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then",
    "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've",
    "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when",
    "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why",
    "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
    "yourselves",
]

In [119]:
# removing non alphanumeric characters (whitespace is kept as a word separator)
def alpha_num(text):
    """Return ``text`` with everything except ASCII letters, digits and spaces removed.

    Any whitespace character (newline, tab, non-breaking space, ...) is first
    converted to a plain space.  Without that step such characters would be
    deleted outright and the words on either side of them would be glued
    together (e.g. ``"good\\nletting"`` -> ``"goodletting"``).

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string.  Runs of spaces are not collapsed here.
    """
    # Normalise every whitespace character to ' ' so it survives the filter below.
    spaced = re.sub(r'\s', ' ', text)
    return re.sub(r'[^A-Za-z0-9 ]', '', spaced)

# removing the stopwords from text
def remove_stopwords(text):
    """Drop every token of ``text`` that is listed in the module-level ``stopwords``.

    Tokens are obtained with ``str.split()`` (any run of whitespace separates
    tokens) and compared case-insensitively; the tokens that remain keep their
    original casing and are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        The remaining tokens joined by ``" "`` (empty string if none remain).
    """
    # Build the lookup set once per call instead of scanning the list for
    # every token.  It is rebuilt on each call so edits to ``stopwords`` made
    # in other cells are still honoured.
    stopword_set = set(stopwords)
    # ``split()`` already yields tokens without surrounding whitespace.
    return " ".join(
        token for token in text.split() if token.lower() not in stopword_set
    )

def preprocess(df):
    """Clean the ``title`` and ``body`` columns of ``df`` in place.

    Each of the two columns is lowercased, passed through ``alpha_num`` and then
    through ``remove_stopwords``.  Missing values are treated as empty strings;
    previously a missing title/body stayed NaN after ``.str.lower()`` and made
    ``alpha_num`` raise ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``title`` and ``body`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (its two columns are overwritten).
    """
    for column in ('title', 'body'):
        # fillna/astype guarantee that every cell handed to the helpers is a str.
        lowered = df[column].fillna('').astype(str).str.lower()
        df[column] = lowered.apply(alpha_num).apply(remove_stopwords)
    return df

# Limit GPU memory (optional — this section can be skipped)

In [120]:
# using_gpu_index = 0
# gpu_list = tf.config.experimental.list_physical_devices('GPU')
# if len(gpu_list) > 0:
#     try:
#         tf.config.experimental.set_virtual_device_configuration(
#             gpu_list[using_gpu_index],
#             [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)]  # limit the size of GPU memory
#         )
#     except RuntimeError as e:
#         print(e)
# else:
#     print("Got no GPUs")

# Data

In [121]:
# Load the already-preprocessed training set and display it.
train_csv_path = "./dataset/preprocessed_train.csv"
train_df = pd.read_csv(train_csv_path)
train_df

Unnamed: 0,id,category,content
0,234324,31,handle wrongway driver coming clevelands highw...
1,255943,29,29 southern california zips make us list prici...
2,94883,21,getting youth back work policy lessons around ...
3,73321,0,katniss jon snows worst enemy hunger games got...
4,149036,0,george clooney talks love amal s devastatingly...
...,...,...,...
199995,197218,17,bobbi linden texas mom buys ipad gets box stuf...
199996,12917,2,letting go feeling goodletting go isnt easy es...
199997,115704,9,oklahoma teachers union calls end walkoutschoo...
199998,318999,26,sc activists concerned toxins former upstate p...


In [122]:
# parameters
vocab_size = 7000       # passed to Tokenizer(num_words=...) and to the model as input_dim
max_length = 200        # length every sequence is padded / truncated to
trunc_type = 'post'     # pad_sequences(truncating=...)
padding_type = 'post'   # pad_sequences(padding=...)
oov_tok = "<OOV>"       # Tokenizer(oov_token=...)
split_seed = 42         # fixed seed so the split below is the same on every run

# features: preprocessed text; labels: category ids as a column vector of shape (n, 1)
train_x = train_df['content'].to_numpy()
train_y = train_df['category'].to_numpy().reshape(-1, 1)

# train/validation split (80/20), stratified on the category label.
# random_state is pinned: without it every run draws a different split, so the
# fitted vocabulary and the reported metrics cannot be reproduced.
train_x, valid_x, train_y, valid_y = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    stratify=train_y,
    random_state=split_seed,
)

# tokenize sentences: the tokenizer is fitted on the training texts only
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)

In [123]:
# Training texts -> integer id sequences -> fixed-length array of width max_length.
train_sequences = tokenizer.texts_to_sequences(train_x)
train_x = tf.keras.preprocessing.sequence.pad_sequences(
    train_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

In [124]:
# number of training samples (rows of the padded array)
train_x.shape[0]

160000

In [125]:
# txl = [len(i) for i in train_x]
# txl.sort()
# # print(txl)
# print([min(txl),txl[45000],txl[135000],max(txl)])
# plt.boxplot()
# plt.show()

In [126]:
# Validation texts -> integer id sequences -> fixed-length array of width max_length,
# using the tokenizer and padding settings defined for the training data.
valid_sequences = tokenizer.texts_to_sequences(valid_x)
valid_x = tf.keras.preprocessing.sequence.pad_sequences(
    valid_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Model Construction

In [131]:
# Build and compile the classifier (32 output classes, integer labels).
# NOTE(review): this cell's execution count (131) is higher than the train (128)
# and evaluate (129) cells below, i.e. it was last run after them. Restart the
# kernel and run all cells in order before relying on the recorded outputs.
model = NCModel(
    input_dim=vocab_size,
    input_length=max_length,
    classes=32,
)
model.build((None, max_length))
model.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=Adam(),
    metrics=['accuracy'],
)
model.summary()

Model: "nc_model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     multiple                  224000    
                                                                 
 lstm_8 (LSTM)               multiple                  8320      
                                                                 
 dense_8 (Dense)             multiple                  1056      
                                                                 
Total params: 233,376
Trainable params: 233,376
Non-trainable params: 0
_________________________________________________________________


# Train

In [128]:
# Train for 10 epochs with mini-batches of 32; the returned object is kept in `history`.
history = model.fit(
    train_x,
    train_y,
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Validation

In [129]:
# Score the model on the held-out validation split; return_dict=True yields a
# {metric name: value} mapping.
model.evaluate(
    valid_x,
    valid_y,
    return_dict=True,
)



{'loss': 1.304148554801941, 'accuracy': 0.6411499977111816}