In [1]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.models import Sequential
from keras.layers import Embedding,Dense,GlobalAveragePooling1D
from NCModel import NCModel
import re
import numpy as np
import pandas as pd

In [2]:
# English stopwords, copied from the Yoast SEO list:
# https://github.com/Yoast/YoastSEO.js/blob/develop/src/config/stopwords.js
# All entries are lowercase and in alphabetical order; laid out one initial letter per row.
stopwords = [
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could",
    "did", "do", "does", "doing", "down", "during",
    "each",
    "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's",
    "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
    "let's",
    "me", "more", "most", "my", "myself",
    "nor",
    "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own",
    "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such",
    "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too",
    "under", "until", "up",
    "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
]

In [3]:
# Matches any single character that is not an ASCII letter, an ASCII digit, or a space.
_NON_ALNUM_OR_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return `text` with every non-alphanumeric, non-space character deleted.

    Characters are removed outright rather than replaced by a space, so the
    pieces on either side of a removed character are joined together
    (e.g. "90s-inspired" becomes "90sinspired").

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, containing only A-Z, a-z, 0-9 and spaces.
    """
    return _NON_ALNUM_OR_SPACE.sub('', text)

# drop every whitespace-separated token whose lowercase form is a stopword
def remove_stopwords(text):
    """Return `text` without the tokens listed in the module-level `stopwords`.

    The text is split on runs of whitespace, each token is compared
    case-insensitively against `stopwords`, and the surviving tokens are
    re-joined with single spaces (so original spacing is not preserved).

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens, in their original case and order, joined by " ".
    """
    kept_tokens = [token for token in text.split() if token.lower() not in stopwords]
    return " ".join(kept_tokens)

def preprocess(df):
    """Return a copy of `df` whose 'title' column has been normalised.

    Steps applied to each title:
      1. Missing titles are replaced by the empty string.
      2. The text is lowercased.
      3. Stopwords are removed while punctuation is still present, so the
         apostrophe contractions in the stopword list ("i'm", "he's", ...)
         can actually match. Stripping punctuation first turns "i'm" into
         "im", which is not in the list and therefore used to survive.
      4. Non-alphanumeric characters are stripped with `alpha_num`.
      5. Stopwords are removed again to catch tokens that only become
         stopwords once attached punctuation is gone (e.g. "the," -> "the").

    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame with a 'title' column of strings (missing values allowed).

    Returns:
        A new DataFrame identical to `df` except for the cleaned 'title' column.
    """
    titles = df['title'].fillna('').str.lower()
    # Pass 1: apostrophes are still intact, so contractions are matched here.
    titles = titles.apply(remove_stopwords)
    titles = titles.apply(alpha_num)
    # Pass 2: plain stopwords that had punctuation glued to them.
    titles = titles.apply(remove_stopwords)
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(title=titles)

# Limit GPU memory (optional; this section can be skipped)

In [4]:
# using_gpu_index = 0
# gpu_list = tf.config.experimental.list_physical_devices('GPU')
# if len(gpu_list) > 0:
#     try:
#         tf.config.experimental.set_virtual_device_configuration(
#             gpu_list[using_gpu_index],
#             [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4096)]  # limit the size of GPU memory
#         )
#     except RuntimeError as e:
#         print(e)
# else:
#     print("Got no GPUs")

# Data

In [5]:
# Read the training CSV and clean its 'title' column in a single chain, then display the frame.
train_df = pd.read_csv("./dataset/train.csv").pipe(preprocess)
train_df

Unnamed: 0,id,category,title,body
0,234324,31,handle wrongway driver coming clevelands highways,"CLEVELAND, Ohio (WOIO) -In the past year there..."
1,255943,29,29 southern california zips make us list prici...,It s no secret that Southern California homes ...
2,94883,21,getting youth back work policy lessons around ...,"Without the right skills, people will languish..."
3,73321,0,katniss jon snows worst enemy hunger games got...,Team Katniss Everdeen vs. Team Jon Snow.
4,149036,0,george clooney talks love amal s devastatingly...,The actor described his love for his wife on D...
...,...,...,...,...
199995,197218,17,bobbi linden texas mom buys ipad gets box stuf...,Courtney Akers wanted a tablet for her 15th bi...
199996,12917,2,letting go feeling good,"Letting go isn't easy, especially in relations..."
199997,115704,9,oklahoma teachers union calls end walkout,Schools will reopen soon after negotiations st...
199998,318999,26,sc activists concerned toxins former upstate p...,Members of an environmental group are concerne...


In [6]:
# Read the test CSV and clean its 'title' column in a single chain, then display the frame.
test_df = pd.read_csv("./dataset/test.csv").pipe(preprocess)
test_df

Unnamed: 0,id,title,body
0,0,gif shows schuylkill rivers dramatic transform...,The skyline around is such an iconic part of...
1,1,20 pieces 90sinspired wall art,It s no secret that were 90s-obsessed here at...
2,2,white house comes effort block whitecollar cri...,"Sweeping changes would affect food safety, the..."
3,3,maya apocalypse 2012 end world valladolid mexi...,The young and aggressively tanned festival goe...
4,4,parents remarried need include stepparents wed...,While a bride and groom's parents have pretty ...
...,...,...,...
99995,99995,stephen smith finds feeling sorry patriots tom...,Tom Brady has been quite successful over the c...
99996,99996,im milk milks,"Wait, you thought Maurice Sendak's books were ..."
99997,99997,republican friends please please break,I need you to hear me out even though you may ...
99998,99998,not woods yet beach erosion threat may continu...,"GRAND HAVEN, MI -- Lakeshore homeowners will..."


In [7]:
# Text-vectorisation hyper-parameters shared by the tokenizer, padding and model cells.
oov_tok = "<OOV>"       # placeholder token handed to the Tokenizer for out-of-vocabulary words
vocab_size = 7000       # Tokenizer num_words and the model's input_dim
max_length = 100        # every title sequence is padded to this many tokens
padding_type = 'post'   # padding side passed to pad_sequences
trunc_type = 'post'     # intended truncation side for sequences longer than max_length

In [8]:
# Titles are the model input; labels are reshaped to a column vector of shape (n_samples, 1).
train_x = train_df['title'].to_numpy()
train_y = train_df['category'].to_numpy().reshape(-1, 1)
# test_x stays as raw title strings here; it is not tokenized or padded in this cell.
test_x = test_df['title'].to_numpy()

# train/validation split (80/20, stratified by category).
# A fixed random_state keeps the split identical across kernel restarts.
train_x, valid_x, train_y, valid_y = train_test_split(
    train_x, train_y, test_size=0.2, stratify=train_y, random_state=42)

# fit the tokenizer on the training titles only, so the validation titles do not influence the vocabulary
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)
word_index = tokenizer.word_index

# convert train dataset to sequences and pad them.
# truncating is passed explicitly: without it pad_sequences falls back to 'pre'
# and the configured trunc_type would be silently ignored.
train_x = tokenizer.texts_to_sequences(train_x)
train_x = tf.keras.preprocessing.sequence.pad_sequences(
    train_x, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# convert valid dataset to sequences and pad them with the same settings
valid_x = tokenizer.texts_to_sequences(valid_x)
valid_x = tf.keras.preprocessing.sequence.pad_sequences(
    valid_x, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Model Construction

In [9]:
# Instantiate the project's NCModel and build it for batches of max_length-token sequences.
# classes=32 is presumably the number of news categories (labels up to 31 appear in the data preview) - TODO confirm.
model = NCModel(input_dim=vocab_size, input_length=max_length, classes=32)
model.build((None, max_length))

# Sparse categorical cross-entropy takes the integer class labels directly; Adam runs with its default settings.
loss_fn = SparseCategoricalCrossentropy()
optimizer = Adam()
model.compile(loss=loss_fn, optimizer=optimizer, metrics=['accuracy'])

model.summary()

Model: "nc_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  448000    
                                                                 
 conv1d (Conv1D)             multiple                  24704     
                                                                 
 batch_normalization (BatchN  multiple                 512       
 ormalization)                                                   
                                                                 
 activation (Activation)     multiple                  0         
                                                                 
 conv1d_1 (Conv1D)           multiple                  49280     
                                                                 
 batch_normalization_1 (Batc  multiple                 512       
 hNormalization)                                          

# Train

In [10]:
# Sanity check: shape of the padded training matrix, (number of training titles, max_length).
train_x.shape

(160000, 100)

In [None]:
# Train for up to 20 epochs. The held-out split is passed as validation_data so each
# epoch also reports val_loss / val_accuracy, which makes overfitting visible during training.
# Validation data is only evaluated, never used to update weights
# (assuming NCModel keeps the standard Keras Model.fit behaviour).
history = model.fit(
    train_x, train_y,
    batch_size=32,
    epochs=20,
    validation_data=(valid_x, valid_y),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

# Validation

In [66]:
# Score the trained model on the held-out validation split.
# return_dict=True yields the metrics as a {name: value} dict, displayed as the cell output.
valid_metrics = model.evaluate(valid_x, valid_y, return_dict=True)
valid_metrics



{'loss': 1.3421082496643066, 'accuracy': 0.6345000267028809}