In [12]:
import os
import pandas as pd
import numpy as np

topics_init = [x for x in os.listdir("./bbc-fulltext/bbc") if x != "README.TXT"]

In [24]:
article_list = {"heading":[], "topic":[]}

for x in topics_init:
    for article_title in os.listdir(os.path.join("./bbc-fulltext/bbc", x)):
        with open(os.path.join("./bbc-fulltext/bbc", x, article_title), "r") as f:
            heading = f.readline()
            article_list["heading"].append(heading.split("\n")[0])
            article_list["topic"].append(x)
article_df = pd.DataFrame(article_list)
article_df["heading"].head()

0    Ad sales boost Time Warner profit
1     Dollar gains on Greenspan speech
2    Yukos unit buyer faces loan claim
3    High fuel prices hit BA's profits
4    Pernod takeover talk lifts Domecq
Name: heading, dtype: object

In [31]:
from keras.src.utils import pad_sequences
from keras.src.preprocessing.text import Tokenizer

headings_arr = article_df["heading"].values
heading_tokenizer = Tokenizer(oov_token="<OOV>")
heading_tokenizer.fit_on_texts(headings_arr)

sequenced_headings = heading_tokenizer.texts_to_sequences(headings_arr)
padded_heading = pad_sequences(sequenced_headings, padding="post", truncating="post", maxlen = 10)

Ad sales boost Time Warner profit


In [33]:
from sklearn.preprocessing import LabelEncoder

topic_encoder = LabelEncoder()
ys = topic_encoder.fit_transform(article_df["topic"].values)
ys[:5]

array([0, 0, 0, 0, 0])

In [34]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(padded_heading, ys, train_size=0.8, random_state=4)

In [55]:
from keras.src.layers import Embedding, Bidirectional, Dropout, LSTM, Conv1D, GlobalAveragePooling1D, Dense
from keras import Sequential

model = Sequential([
    Embedding(len(heading_tokenizer.word_index)+1, 32, input_length = 10),
    Dropout(0.2),
    Conv1D(64, 4, activation="relu"),
    # Bidirectional(LSTM(32)),
    GlobalAveragePooling1D(),
    Dense(32, activation="relu"),
    Dense(5, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [56]:
model.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1d37580ef90>

In [86]:
padded_input = pad_sequences(heading_tokenizer.texts_to_sequences([""]), maxlen=10, padding="post", truncating="post")
print(padded_input)
prediction = model.predict(padded_input)
print(topic_encoder.inverse_transform([np.argmax(prediction)]))

[[284   1   1   0   0   0   0   0   0   0]]
['sport']


In [88]:
# MODEL 2 Based on pretrained GloVe embeddings

glove_embeddings = {}
with open("./bbc-fulltext/glove.twitter.27B.100d.txt", "r",encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefficients = np.asarray(values[1:], dtype='float32')
        glove_embeddings[word] = coefficients


In [93]:
word_index = heading_tokenizer.word_index
embedding_mtx = np.zeros((len(word_index)+1, 100))
for word, i in word_index.items():
    embedding = glove_embeddings.get(word)
    if embedding is not None:
        embedding_mtx[i] = embedding

In [99]:
model_v2 = Sequential([
    Embedding(len(heading_tokenizer.word_index)+1, 100, input_length = 10, weights = [embedding_mtx], trainable=False),
    Dropout(0.2),
    Conv1D(64, 4, activation="relu"),
    # Bidirectional(LSTM(32)),
    GlobalAveragePooling1D(),
    Dense(32, activation="relu"),
    Dense(5, activation="softmax")
])

model_v2.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [120]:
model_v2.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x1d3a48a10d0>

In [144]:
padded_input_v2 = pad_sequences(heading_tokenizer.texts_to_sequences(["UK economy grows by 0.2%, ONS figures show"]), maxlen=10, padding="post", truncating="post")
prediction_v2 = model_v2.predict(padded_input_v2)
print(prediction_v2)
# print(("Probability: ", np.exp(np.max(prediction_v2)))/np.sum(np.exp(prediction_v2)) * 100 // 1, "%")
print(topic_encoder.inverse_transform([np.argmax(prediction_v2)]))

[[9.9087042e-01 5.5080331e-05 5.4712343e-04 1.3137650e-07 8.5272351e-03]]


TypeError: ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''