## News Classification using NLP

In [1]:
import warnings
warnings.filterwarnings('ignore')

### Reading News Articles Dataset

In [2]:
import os
dataset = 'bbc'

categories = []
text = []
labels = []

for news_group in os.listdir(dataset):
    categories.append(news_group)
    article_path = os.path.join(dataset, news_group)
    try:
        for filename in os.listdir(article_path):
            news_article = os.path.join(article_path, filename)
            with open(news_article, 'r', encoding='utf-8', errors='ignore') as file:
                text.append(file.read())
                labels.append(news_group)
    except:
        continue

### Converting news articles to numerical array

In [3]:
# Tokenizing and Pad Sequencing
from keras.preprocessing.text import Tokenizer
from keras.utils.data_utils import pad_sequences

tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)
data = pad_sequences(sequences, maxlen=1000)
data

array([[    0,     0,     0, ...,     3,     9,  1339],
       [    0,     0,     0, ...,     5,  2045,  2369],
       [    0,     0,     0, ...,     6,     1,   442],
       ...,
       [  605,    21,  2431, ...,   469,   142,   226],
       [    0,     0,     0, ..., 15968,     6,  2788],
       [   44,     4,    57, ...,    13,     1,   373]])

### Converting news categories into numbers

In [4]:
# Converting Lable to Arrays
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
labels

array([0, 0, 0, ..., 4, 4, 4], dtype=int64)

### Splitting Dataset into Train and Test

In [5]:
# Splitting data to train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

### Building LSTM Model to classify news articles

In [6]:
# Building LSTM Model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

model = Sequential()
model.add(Embedding(20000, 128, input_length=1000))
model.add(LSTM(128))
model.add(Dense(20, activation='softmax'))
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Training Model
model.fit(X_train, y_train, validation_split=0.1, epochs=20, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15533556610>

### Check Model Classification Accuracy

In [7]:
# Evaluating Model
scores = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Accuracy: {scores[1]*100:.2f}%')

Test Accuracy: 84.13%


## Sample Prediction Example

In [8]:
# Prediction Example
import numpy as np

test_article_index = np.random.randint(len(X_test))
test_article = X_test[test_article_index]
test_article_text = text[test_article_index]
tokens = tokenizer.texts_to_sequences([test_article_text])
test_article_processed = pad_sequences(tokens, maxlen=1000)

predicted_category_index = model.predict(test_article_processed).argmax(axis=-1)
predicted_category_name = categories[predicted_category_index[0]]

print(f'\nArticle: \n{test_article_text}')
print(f'\nPredicted Category: {predicted_category_name}')


Article: 
S Korean credit card firm rescued

South Korea's largest credit card firm has averted liquidation following a one trillion won ($960m; £499m) bail-out.

LG Card had been threatened with collapse because of its huge debts but the firm's creditors and its former parent have stepped in to rescue it. A consortium of creditors and LG Group, a family owned conglomerate, have each put up $480m to stabilise the firm. LG Card has seven million customers and its collapse would have sent shockwaves through the country's economy.

The firm's creditors - which own 99% of LG Card - have been trying to agree a deal to secure its future for several weeks. They took control of the company in January when it avoided bankruptcy only through a $4.5bn bail-out.

They had threatened to delist the company, a move which would have triggered massive debt redemptions and forced the company into bankruptcy, unless agreement was reached on its future funding. "LG Card will not need any more financial a

### Put your own article to test

In [9]:
# with open('file_path', 'r', encoding='utf-8') as file:
#     test_article = file.read()

# tokenized = tokenizer.texts_to_sequences([test_article])
# processed = pad_sequences(tokenized, maxlen=1000)

# predicted_category_index = model.predict(processed).argmax(axis=-1)
# predicted_category_name = categories[predicted_category_index[0]]

# print(f'\nArticle: \n{test_article}')
# print(f'\nPredicted Category: {predicted_category_name}')