In [1]:
import keras
from sklearn.model_selection import train_test_split
import numpy as np
import os
import string
import re
import io
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from zipfile import ZipFile
import requests
import gzip
import tensorflow as tf
from keras import backend as K

In [2]:
# Download and unpack the BBC full-text archive only when the extracted "bbc"
# directory is not already present. `!wget` is an IPython shell escape, so this
# cell needs a `wget` executable on the PATH.
if not os.path.isdir("bbc"):
    !wget "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"
    ZipFile("bbc-fulltext.zip").extractall()
# Download the gzipped GloVe vector file only when it is not already on disk;
# `-O` stores it under the local name that later cells open.
if not os.path.isfile("glove.6B.50d.txt.gz"):
    !wget "https://github.com/kmr0877/IMDB-Sentiment-Classification-CBOW-Model/blob/master/glove.6B.50d.txt.gz?raw=true" -O "glove.6B.50d.txt.gz"

In [3]:
# List the article files of each category.
# os.listdir returns entries in arbitrary order, so the names are sorted: this keeps
# the row order of the corpus (and therefore the seeded train/test split built from
# it) identical across machines and runs.
business_text_files = sorted(os.listdir("bbc/business"))
entertainment_text_files = sorted(os.listdir("bbc/entertainment"))
politics_text_files = sorted(os.listdir("bbc/politics"))
tech_text_files = sorted(os.listdir("bbc/tech"))
sports_text_files = sorted(os.listdir("bbc/sport"))

In [4]:
def read_text(file,directory):
    """Return the full contents of ``directory + "/" + file`` as a string.

    The file is decoded as UTF-8 first. If that raises ``UnicodeDecodeError`` the
    file is read again as ISO-8859-14, because at least one article in the corpus
    is stored in that encoding.

    Args:
        file: Name of the file inside ``directory``.
        directory: Directory that contains ``file``.

    Returns:
        The decoded text of the file.
    """
    file_path = directory + "/" + file
    try:
        # Name the encoding explicitly: the default used by open() depends on the
        # platform locale, which would make the fallback below fire inconsistently.
        with open(file_path, 'r', encoding="utf-8") as f:
            text = f.read()

    # at least one file is ISO-8859-14 encoded. That could cause some issues unless accounted for
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding="ISO-8859-14") as f:
            text = f.read()
    return text

def _load_category(file_names, directory):
    """Read every file named in ``file_names`` from ``directory``, keeping their order."""
    return [read_text(name, directory=directory) for name in file_names]


# One list of raw article strings per category.
business_texts = _load_category(business_text_files, "bbc/business")
entertainment_texts = _load_category(entertainment_text_files, "bbc/entertainment")
politics_texts = _load_category(politics_text_files, "bbc/politics")
tech_texts = _load_category(tech_text_files, "bbc/tech")
sport_texts = _load_category(sports_text_files, "bbc/sport")

In [5]:
# The position of a category in these parallel lists is its integer class label.
article_types = ["business", "entertainment", "politics", "tech", "sports"]
all_texts = [
    business_texts,
    entertainment_texts,
    politics_texts,
    tech_texts,
    sport_texts,
]
text_idx = list(range(5))
# Maps integer label -> human-readable category name.
class_dict = dict(enumerate(article_types))

In [6]:
# Lower-case English stop words (including apostrophe contractions). process_text
# drops any whitespace-delimited word whose lower-cased, stripped form appears here.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
             "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do",
             "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have",
             "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its",
             "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other",
             "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's",
              "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves",
             "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those",
             "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've",
             "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours",
             "yourself", "yourselves" ]

In [7]:
# Flatten the per-category article lists into (text, label) rows, category by category.
labelled_rows = [
    [article, label]
    for label, articles in zip(text_idx, all_texts)
    for article in articles
]
df = pd.DataFrame(labelled_rows, columns=["text", "label"])

# 80/20 split with a fixed seed; each part is copied so that columns can be added
# to it later without touching `df`.
train_part, test_part = train_test_split(df, train_size=.8, random_state=111)
df_train = train_part.copy()
df_test = test_part.copy()

In [8]:
def process_text(text, stop_words=None):
    """Normalise ``text`` and drop stop words.

    Steps, in order:
      1. every newline becomes a single space, so words on different lines are
         separated like any other words;
      2. round brackets, square brackets and single/double quotes are removed;
      3. every run of one or more periods becomes ``". "``;
      4. the result is split on single spaces and each piece whose lower-cased,
         stripped form is a stop word is dropped; the rest are re-joined with
         single spaces (empty pieces produced by consecutive spaces are kept).

    Args:
        text: The raw string to clean.
        stop_words: Optional iterable of lower-case words to remove. Defaults to
            the module-level ``stopwords`` list.

    Returns:
        The cleaned string.
    """
    if stop_words is None:
        stop_words = stopwords
    # A set gives constant-time membership tests for the per-word filter below.
    excluded = set(stop_words)

    # str.replace takes a literal, not a regex, so the newline is replaced directly.
    flattened = text.replace("\n", " ")
    without_brackets = re.sub(r"[()\[\]\"']", "", flattened)
    spaced_periods = re.sub(r"\.+", ". ", without_brackets)

    kept_words = [
        word
        for word in spaced_periods.split(" ")
        if word.lower().strip() not in excluded
    ]
    return " ".join(kept_words)

def remove_punctuation(text):
    """Return ``text`` with every character listed in ``string.punctuation`` deleted."""
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [9]:
# Split every training article on "." and keep the non-empty, stripped pieces;
# the tokenizer vocabulary is fitted on the training split only.
stripped_pieces = (
    piece.strip()
    for article in df_train.text.values
    for piece in article.split(".")
)
all_sentences = [piece for piece in stripped_pieces if piece != ""]

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(all_sentences)

# word -> integer index, and the inverse lookup index -> word.
word_index = tokenizer.word_index
reverse_idx = dict(zip(word_index.values(), word_index.keys()))

# Every article is padded or truncated to this many tokens.
maxlen = 500

In [10]:
def _tokenize_and_pad(frame):
    """Store padded token-id sequences in ``frame["tokenized"]`` and return them stacked.

    Each text is converted to token ids with the fitted ``tokenizer``; sequences are
    then padded/truncated at the end to ``maxlen`` and stacked into one 2-D array
    with one row per article.
    """
    frame["tokenized"] = frame.text.apply(lambda text: tokenizer.texts_to_sequences([text])[0])
    padded = pad_sequences(frame.tokenized.values, maxlen=maxlen, truncating="post", padding="post")
    frame["tokenized"] = list(padded)
    return np.vstack(frame["tokenized"].values)


train_X = _tokenize_and_pad(df_train)
test_X = _tokenize_and_pad(df_test)

# One-hot encode the integer class labels.
train_y = to_categorical(df_train.label.values)
test_y = to_categorical(df_test.label.values)

In [11]:
# Decompress the whole GloVe file in binary mode and decode it to one UTF-8 string.
with gzip.open("glove.6B.50d.txt.gz", "rb") as f:
    embedding_list = str(f.read(), "utf-8")

In [12]:
# Width of every embedding vector; also the second dimension of embedding_matrix.
embedding_dim = 50

# Parse the GloVe text: each line is "<word> <v1> ... <v50>", separated by single spaces.
embedding_vectors = {}
for embedding_line in embedding_list.split("\n"):
    word, *components = embedding_line.split(" ")
    # Skip the blank line left by the trailing newline (it would otherwise create a
    # bogus "" entry) and any line that does not carry a full vector.
    if len(components) != embedding_dim:
        continue
    # Convert to numbers once here instead of relying on implicit string-to-float
    # casting when the values are written into the matrix.
    embedding_vectors[word] = np.asarray(components, dtype=float)

# +1 because the tokenizer's word indices start at 1; row 0 stays all zeros.
vocab_size = len(word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Rows of words that have no pre-trained vector are left as zeros.
for word, i in word_index.items():
    vector = embedding_vectors.get(word)
    if vector is not None:
        embedding_matrix[i, :] = vector

In [13]:
# NOTE(review): this cell repeats the matrix construction at the end of the previous
# cell and yields the same vocab_size / embedding_matrix; it looks redundant.
vocab_size = 1 + len(word_index)

embedding_matrix = np.zeros((vocab_size, 50))

for word, i in word_index.items():
    vector = embedding_vectors.get(word)
    if vector is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = vector

In [53]:
## Model 1 ##
# Frozen embedding layer initialised from embedding_matrix, followed by two Conv1D
# stages, global average pooling and a small dense head with a 5-way softmax.

model1 = keras.models.Sequential()
model1.add(keras.layers.Embedding(vocab_size, 50, input_length=maxlen, weights=[embedding_matrix],
                                  trainable=False, mask_zero=True))
model1.add(keras.layers.Conv1D(64, 10, activation='relu'))
model1.add(keras.layers.MaxPooling1D(4))
model1.add(keras.layers.Conv1D(96, 5, activation='relu'))
model1.add(keras.layers.GlobalAveragePooling1D())
model1.add(keras.layers.Dense(15, activation="relu"))
model1.add(keras.layers.Dropout(.2))
model1.add(keras.layers.Dense(5, activation="softmax"))

model1.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
model1.summary()

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 500, 50)           1481250   
_________________________________________________________________
conv1d_28 (Conv1D)           (None, 491, 64)           32064     
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 122, 64)           0         
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 118, 96)           30816     
_________________________________________________________________
global_average_pooling1d_17  (None, 96)                0         
_________________________________________________________________
dense_37 (Dense)             (None, 15)                1455      
_________________________________________________________________
dropout_33 (Dropout)         (None, 15)              

In [54]:
# Train model1, validating against the held-out split after each epoch, then save it.
model1_fit_options = dict(
    validation_data=(test_X, test_y),
    epochs=23,
    batch_size=32,
    steps_per_epoch=55,
    validation_steps=32,
    validation_batch_size=13,
    workers=5,
)
model1.fit(train_X, train_y, **model1_fit_options)

model1.save("cnn-model-glove")

Epoch 1/23
Epoch 2/23
Epoch 3/23
Epoch 4/23
Epoch 5/23
Epoch 6/23
Epoch 7/23
Epoch 8/23
Epoch 9/23
Epoch 10/23
Epoch 11/23
Epoch 12/23
Epoch 13/23
Epoch 14/23
Epoch 15/23
Epoch 16/23
Epoch 17/23
Epoch 18/23
Epoch 19/23
Epoch 20/23
Epoch 21/23
Epoch 22/23
Epoch 23/23
INFO:tensorflow:Assets written to: cnn-model-glove/assets


In [22]:
## Model 2 ##
# Embedding layer trained from scratch (no pre-trained weights), averaged over the
# sequence, followed by two dense+dropout stages and a 5-way softmax.
model2 = keras.models.Sequential()
model2.add(keras.layers.Embedding(vocab_size, 50, input_length=maxlen, mask_zero=True))
model2.add(keras.layers.GlobalAveragePooling1D())
model2.add(keras.layers.Dense(22, activation="relu"))
model2.add(keras.layers.Dropout(.2))
model2.add(keras.layers.Dense(11, activation="relu"))
model2.add(keras.layers.Dropout(.2))
model2.add(keras.layers.Dense(5, activation="softmax"))

model2.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['accuracy'])
model2.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 50)           1481250   
_________________________________________________________________
global_average_pooling1d_3 ( (None, 50)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 22)                1122      
_________________________________________________________________
dropout_6 (Dropout)          (None, 22)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 11)                253       
_________________________________________________________________
dropout_7 (Dropout)          (None, 11)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 5)                

In [23]:
# Train model2, validating against the held-out split after each epoch, then save it.
model2_fit_options = dict(
    validation_data=(test_X, test_y),
    epochs=20,
    batch_size=32,
    steps_per_epoch=55,
    validation_steps=32,
    validation_batch_size=13,
    workers=5,
)
model2.fit(train_X, train_y, **model2_fit_options)

model2.save("cnn-model-gloveless")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: cnn-model-gloveless/assets


In [55]:
# Embedding matrix learned by model2, fetched once as a NumPy array.
weights = model2.layers[0].get_weights()[0]

# Map each word to its embedding row. Index 0 is skipped because reverse_idx has no
# entry for it (vocab_size is len(word_index) + 1). Rows are taken from `weights`
# instead of slicing the layer variable once per word.
word_vectors = {reverse_idx[i]: weights[i] for i in range(1, vocab_size)}

# Write one tab-separated vector per line to vecs.tsv and the matching word on the
# same line number of meta.tsv. The with-block closes both files even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, vector in word_vectors.items():
        # All-zero rows are left out of both files.
        if not np.all(vector == 0):
            out_m.write(word + "\n")
            out_v.write('\t'.join([str(x) for x in vector]) + "\n")

In [56]:
def predict(text, model,verbose=True):
    """Classify ``text`` with ``model`` and return the winning class and its score.

    The text has its punctuation removed, is passed through ``process_text``,
    converted to token ids with the fitted ``tokenizer`` and padded/truncated at
    the end to ``maxlen`` before being handed to ``model.predict``.

    Args:
        text: The statement to classify.
        model: An object whose ``predict`` accepts the padded batch and returns
            one row of class scores per input.
        verbose: When true, print the statement, the predicted class and its
            score as a percentage.

    Returns:
        A ``(class_name, score)`` tuple for the highest-scoring class.
    """
    if verbose:
        print("#####################################\nAnalyzing Statement:\n"+text)

    cleaned = process_text(remove_punctuation(text))
    token_ids = tokenizer.texts_to_sequences([cleaned])[0]
    model_input = pad_sequences([token_ids], maxlen=maxlen, truncating="post", padding="post")

    # A batch of one goes in, so the first row holds the scores for `text`.
    class_scores = model.predict(model_input)[0]
    best = np.argmax(class_scores)
    class_prediction, highest_probability = class_dict[best], class_scores[best]

    if verbose:
        print("\nClass:",class_prediction,"\nLikelihood:",str(highest_probability*100)+"%")
        print("#####################################\n\n")
    return class_prediction, highest_probability


In [57]:
# Try model2 on a handful of short statements; predict() prints each result.
example_statements = (
    "Liverpool wins the match!",
    "TV",
    "Democracy",
    "nvidia graphics card",
    "video driver",
    "luxury",
    "stocks",
)
for statement in example_statements:
    predict(statement, model=model2)
print("Done")

#####################################
Analyzing Statement:
Liverpool wins the match!

Class: sports 
Likelihood: 100.0%
#####################################


#####################################
Analyzing Statement:
TV

Class: entertainment 
Likelihood: 100.0%
#####################################


#####################################
Analyzing Statement:
Democracy

Class: politics 
Likelihood: 99.9991774559021%
#####################################


#####################################
Analyzing Statement:
nvidia graphics card

Class: tech 
Likelihood: 99.99639987945557%
#####################################


#####################################
Analyzing Statement:
video driver

Class: tech 
Likelihood: 99.95846152305603%
#####################################


#####################################
Analyzing Statement:
luxury

Class: business 
Likelihood: 100.0%
#####################################


#####################################
Analyzing Statement:
stocks

Class: 