In [1]:
import keras
from sklearn.model_selection import train_test_split
import numpy as np
import os
import string
import re
import io
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from zipfile import ZipFile
import requests
import gzip

In [2]:
!wget "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"
!wget "https://github.com/kmr0877/IMDB-Sentiment-Classification-CBOW-Model/blob/master/glove.6B.50d.txt.gz?raw=true" -O "glove.6B.50d.txt.gz"

--2021-07-31 17:14:20--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874078 (2.7M) [application/zip]
Saving to: ‘bbc-fulltext.zip’


2021-07-31 17:14:23 (1.15 MB/s) - ‘bbc-fulltext.zip’ saved [2874078/2874078]

--2021-07-31 17:14:23--  https://github.com/kmr0877/IMDB-Sentiment-Classification-CBOW-Model/blob/master/glove.6B.50d.txt.gz?raw=true
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/kmr0877/IMDB-Sentiment-Classification-CBOW-Model/raw/master/glove.6B.50d.txt.gz [following]
--2021-07-31 17:14:24--  https://github.com/kmr0877/IMDB-Sentiment-Classification-CBOW-Model/raw/master/glove.6B.50d.txt.gz
Reusing existing connection to github.com:443.
HTTP reque

In [3]:
# Unpack the downloaded archive into the working directory (later cells read
# the articles from "bbc/<category>"). The context manager closes the archive
# handle even if extraction raises.
with ZipFile("bbc-fulltext.zip") as corpus_archive:
    corpus_archive.extractall()

In [4]:
# File names of every article, one list per category.
# os.listdir returns entries in arbitrary (filesystem-dependent) order; sorting
# fixes the row order of the DataFrame built later, so the seeded
# train/test split selects the same articles on every machine.
business_text_files = sorted(os.listdir("bbc/business"))
entertainment_text_files = sorted(os.listdir("bbc/entertainment"))
politics_text_files = sorted(os.listdir("bbc/politics"))
tech_text_files = sorted(os.listdir("bbc/tech"))
sports_text_files = sorted(os.listdir("bbc/sport"))

In [5]:
def read_text(file,directory):
    """Return the contents of the file ``file`` located in ``directory``.

    The file is decoded as UTF-8 first. If that fails it is read again as
    ISO-8859-14, because at least one article in the corpus uses that encoding.

    Args:
        file: Name of the file inside ``directory``.
        directory: Path of the directory that contains the file.

    Returns:
        The decoded text of the file.
    """
    file_path = os.path.join(directory, file)
    try:
        # Name the codec explicitly: with the platform default (for example
        # cp1252 on Windows) the bytes could decode "successfully" into the
        # wrong characters and the fallback below would never be reached.
        with open(file_path, 'r', encoding="utf-8") as f:
            text = f.read()

    # at least one file is ISO-8859-14 encoded. That could cause some issues unless accounted for
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding="ISO-8859-14") as f:
            text = f.read()
    return text

def _read_category(directory, file_names):
    """Read every listed file from ``directory``; texts keep the order of ``file_names``."""
    return [read_text(name, directory=directory) for name in file_names]


# Raw article texts, one list per category.
business_texts = _read_category("bbc/business", business_text_files)
entertainment_texts = _read_category("bbc/entertainment", entertainment_text_files)
politics_texts = _read_category("bbc/politics", politics_text_files)
tech_texts = _read_category("bbc/tech", tech_text_files)
sport_texts = _read_category("bbc/sport", sports_text_files)

In [6]:
# Category text lists; a list's position here is the integer label of its class.
all_texts = [
    business_texts,
    entertainment_texts,
    politics_texts,
    tech_texts,
    sport_texts,
]

# Human-readable class names, in the same order as all_texts.
article_types = ["business","entertainment","politics","tech","sports"]
text_idx = list(range(len(article_types)))

# Integer label -> class name.
class_dict = {label: name for label, name in zip(text_idx, article_types)}

In [7]:
# Lower-case English stop words (contractions included), kept as a list.
# Written as whitespace-separated text and split once into the list.
stopwords = """
a about above after again against all am an and any are as at
be because been before being below between both but by could did do
does doing down during each few for from further had has have
having he he'd he'll he's her here here's hers herself him himself
his how how's i i'd i'll i'm i've if in into is it it's its
itself let's me more most my myself nor of on once only or other
ought our ours ourselves out over own same she she'd she'll she's
should so some such than that that's the their theirs them themselves
then there there's these they they'd they'll they're they've this those
through to too under until up very was we we'd we'll we're we've
were what what's when when's where where's which while who who's whom
why why's with would you you'd you'll you're you've your yours
yourself yourselves
""".split()

In [8]:
# One row per article: the raw text and the integer label of its category.
df = pd.DataFrame(
    [
        [article, category_label]
        for category_texts, category_label in zip(all_texts, text_idx)
        for article in category_texts
    ],
    columns=["text","label"],
)

# Seeded 80/20 split. Both parts are copied so the columns added to them in
# later cells are written to independent frames.
train_part, test_part = train_test_split(df, train_size=.8, random_state=111)
df_train = train_part.copy()
df_test = test_part.copy()

In [9]:
def process_text(text):
    """Normalise ``text`` and drop stop words.

    Steps, in order:
      1. every run of whitespace (newlines and tabs included) becomes one space;
      2. parentheses, square brackets and quote characters are deleted;
      3. every run of periods becomes ". ";
      4. the text is split on single spaces and words whose lower-cased,
         stripped form is in the module-level ``stopwords`` are discarded.

    Args:
        text: The string to clean.

    Returns:
        The remaining words joined by single spaces.
    """
    # The previous str.replace("\n|\w+", " ") looked for that literal
    # character sequence (str.replace is not a regex), so newlines were never
    # removed and a stop word next to a line break stayed glued to its
    # neighbour ("cat\nthe"), escaping the filter below.
    flattened = re.sub(r"\s+", " ", text)
    without_brackets = re.sub(r"[()\[\]\"']", "", flattened)
    spaced_periods = re.sub(r"\.+", ". ", without_brackets)
    kept_words = [word for word in spaced_periods.split(" ")
                  if word.lower().strip() not in stopwords]
    return " ".join(kept_words)

def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a character to None deletes that character.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

In [10]:
# Break every training article into "."-separated sentences, trimmed of
# surrounding whitespace; empty fragments are dropped.
all_sentences = [
    fragment
    for article in df_train.text.values
    for fragment in map(str.strip, article.split("."))
    if fragment != ""
]

# Vocabulary is learned from the training split only; unknown words map to "<OOV>".
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(all_sentences)

# word -> id, and the inverse id -> word lookup.
word_index = tokenizer.word_index
reverse_idx = dict(zip(word_index.values(), word_index.keys()))

# Number of token ids kept per article (longer sequences are cut, shorter ones padded).
maxlen = 500

In [11]:
def _encode_articles(frame):
    """Turn ``frame.text`` into a 2-D array of token ids, ``maxlen`` per article.

    Sequences longer than ``maxlen`` lose their tail; shorter ones are padded
    at the end.
    """
    id_sequences = [tokenizer.texts_to_sequences([article])[0] for article in frame.text]
    return pad_sequences(id_sequences, maxlen=maxlen, truncating="post", padding="post")


# Store one padded id array per row, for both splits.
df_train["tokenized"] = list(_encode_articles(df_train))
df_test["tokenized"] = list(_encode_articles(df_test))

# Model inputs: the per-row arrays stacked into one matrix per split.
train_X = np.vstack(df_train["tokenized"].values)
test_X = np.vstack(df_test["tokenized"].values)

# Model targets: one-hot encoded integer labels.
train_y = to_categorical(df_train.label.values)
test_y = to_categorical(df_test.label.values)

In [12]:
# Read the gzip-compressed GloVe file in one go and decode it into a single
# UTF-8 string; it is parsed line by line in a later cell.
with gzip.open("glove.6B.50d.txt.gz", "rb") as glove_file:
    glove_bytes = glove_file.read()
embedding_list = glove_bytes.decode("utf-8")

In [13]:
# Width of the pre-trained vectors (the "50d" GloVe file).
embedding_dim = 50

# Parse the GloVe text: each record is "<token> <v1> ... <v50>", space separated.
embedding_vectors = {}
for embedding_line in embedding_list.split("\n"):
    token, *components = embedding_line.split(" ")
    # Skip the empty record left after the final newline, and any other line
    # that does not carry exactly embedding_dim values, instead of storing a
    # bogus entry that would only fail later during matrix assignment.
    if len(components) != embedding_dim:
        continue
    # Convert once to a numeric array rather than keeping lists of strings.
    embedding_vectors[token] = np.asarray(components, dtype="float32")

# +1 because the tokenizer's ids start at 1; row 0 stays all-zero and lines up
# with the 0 used as padding value.
vocab_size = len(word_index.keys()) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Copy the pre-trained vector of every vocabulary word that GloVe knows;
# words without a GloVe entry keep an all-zero row.
for word, i in word_index.items():
    vector = embedding_vectors.get(word)
    if vector is not None:
        embedding_matrix[i, :] = vector

In [14]:
# vocab_size and embedding_matrix are built in the previous cell. Rebuilding the
# identical matrix here a second time added nothing, so this cell only
# validates the result.
assert vocab_size == len(word_index.keys()) + 1
assert embedding_matrix.shape == (vocab_size, 50)

# Fraction of vocabulary words that received a pre-trained vector
# (a row that is entirely zero had no match in the GloVe file).
embedding_coverage = np.count_nonzero(embedding_matrix.any(axis=1)) / max(len(word_index), 1)
embedding_coverage

In [15]:
## Model 1 ##
# Frozen pre-trained embeddings -> Conv1D / max-pool / Conv1D -> global average
# -> small dense head with a 5-way softmax.
# NOTE(review): the Embedding emits a padding mask (mask_zero=True) but is
# followed directly by Conv1D — confirm the mask has any effect in this stack.
model1_layers = [
    keras.layers.Embedding(
        vocab_size,
        50,
        input_length=maxlen,
        weights=[embedding_matrix],
        trainable=False,
        mask_zero=True,
    ),
    keras.layers.Conv1D(64, 10, activation='relu'),
    keras.layers.MaxPooling1D(4),
    keras.layers.Conv1D(128, 5, activation='relu'),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(20, activation="relu"),
    keras.layers.Dropout(.2),
    keras.layers.Dense(5, activation="softmax"),
]
model1 = keras.models.Sequential(model1_layers)

# One-hot targets, hence categorical cross-entropy.
model1.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           1481250   
_________________________________________________________________
conv1d (Conv1D)              (None, 491, 64)           32064     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 122, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 118, 128)          41088     
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 20)                2580      
_________________________________________________________________
dropout (Dropout)            (None, 20)                0

In [16]:
# Train model1 for 50 epochs, evaluating on the held-out split after each one.
# steps_per_epoch / validation_steps are left unset on purpose: the previous
# hard-coded values (55 and 32) tied the cell to one dataset size and capped
# every validation pass at 32 * 13 rows; without them Keras derives the step
# counts from the array lengths and uses every row.
# `workers` was dropped as well: it only applies to generator / Sequence
# input, and train_X is a plain NumPy array.
model1.fit(train_X, train_y, validation_data=(test_X, test_y),
           epochs=50, batch_size=32, validation_batch_size=13)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fd356b32490>

In [17]:
## Model 2 ##
# Trainable embeddings (no pre-trained weights) averaged over the sequence,
# then a small dense head with dropout and a 5-way softmax.
model2_layers = [
    keras.layers.Embedding(
        vocab_size,
        50,
        input_length=maxlen,
        mask_zero=True,
    ),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dropout(.2),
    keras.layers.Dense(20, activation="relu"),
    keras.layers.Dropout(.2),
    keras.layers.Dense(5, activation="softmax"),
]
model2 = keras.models.Sequential(model2_layers)

# One-hot targets, hence categorical cross-entropy.
model2.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 50)           1481250   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 50)                0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 20)                1020      
_________________________________________________________________
dropout_2 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 105       
Total params: 1,482,375
Trainable params: 1,482,375
Non-trainable params: 0
____________________________________________

In [18]:
# Train model2 for 50 epochs, evaluating on the held-out split after each one.
# steps_per_epoch / validation_steps are left unset on purpose: the previous
# hard-coded values (55 and 32) tied the cell to one dataset size and capped
# every validation pass at 32 * 13 rows; without them Keras derives the step
# counts from the array lengths and uses every row.
# `workers` was dropped as well: it only applies to generator / Sequence
# input, and train_X is a plain NumPy array.
model2.fit(train_X, train_y, validation_data=(test_X, test_y),
           epochs=50, batch_size=32, validation_batch_size=13)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fd308220220>

In [19]:
# Embedding matrix learned by model2's first layer, fetched once as a NumPy
# array (one row per token id).
weights = model2.layers[0].get_weights()[0]

# word -> learned vector. Ids start at 1 (id 0 has no entry in reverse_idx).
# Rows are taken from the array above instead of indexing the layer's weight
# tensor and converting it once per word.
word_vectors = {reverse_idx[i]: weights[i] for i in range(1, vocab_size)}

# Write two parallel TSV files: vecs.tsv holds one tab-separated vector per
# line, meta.tsv the matching word on the same line number. All-zero vectors
# are skipped. The with-block closes both files even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, vector in word_vectors.items():
        if not np.all(vector == 0):
            out_m.write(word + "\n")
            out_v.write('\t'.join([str(x) for x in vector]) + "\n")

In [20]:
def predict(text, model=model2,verbose=True):
    """Classify ``text`` and return the most likely article category.

    The text has its punctuation removed, is passed through ``process_text``,
    converted to token ids with the fitted ``tokenizer`` and padded/truncated
    to ``maxlen`` before being given to ``model``.

    NOTE(review): the training cells tokenize the raw article text, while this
    function removes stop words first — confirm that difference is intended.

    Args:
        text: The statement to classify.
        model: Model whose ``predict`` yields one score per class
            (defaults to ``model2``).
        verbose: When true, print the statement, the predicted class and its
            likelihood.

    Returns:
        A ``(class_name, probability)`` tuple for the highest-scoring class.
    """
    if verbose:
        print("#####################################\nAnalyzing Statement:\n"+text)

    # Same encoding pipeline as the model input: clean -> ids -> fixed length.
    cleaned = process_text(remove_punctuation(text))
    token_ids = tokenizer.texts_to_sequences([cleaned])[0]
    model_input = pad_sequences([token_ids], maxlen=maxlen, truncating="post", padding="post")

    # Scores for the single input row; the winner is the arg-max class.
    likelihoods = model.predict(model_input)[0]
    best_idx = np.argmax(likelihoods)
    class_prediction = class_dict[best_idx]
    highest_probability = likelihoods[best_idx]

    if verbose:
        print("\nClass:",class_prediction,"\nLikelihood:",str(highest_probability*100)+"%")
        print("#####################################\n\n")
    return class_prediction, highest_probability


In [21]:
# Try the classifier (default model, verbose output) on a few short phrases.
demo_statements = (
    "Liverpool wins the match!",
    "TV",
    "Democracy",
    "nvidia",
    "video driver",
    "luxury",
    "stocks",
)
for statement in demo_statements:
    predict(statement)
print("Done")

#####################################
Analyzing Statement:
Liverpool wins the match!

Class: sports 
Likelihood: 100.0%
#####################################


#####################################
Analyzing Statement:
TV

Class: entertainment 
Likelihood: 100.0%
#####################################


#####################################
Analyzing Statement:
Democracy

Class: politics 
Likelihood: 99.99970197677612%
#####################################


#####################################
Analyzing Statement:
nvidia

Class: politics 
Likelihood: 70.87598443031311%
#####################################


#####################################
Analyzing Statement:
video driver

Class: tech 
Likelihood: 99.99996423721313%
#####################################


#####################################
Analyzing Statement:
luxury

Class: business 
Likelihood: 100.0%
#####################################


#####################################
Analyzing Statement:
stocks

Class: business 