In [29]:
# Loading drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%tensorflow_version 2.x
import tensorflow as tf
print(tf.__version__)

2.7.0


In [3]:
# Setting the working directory 
!ls
%cd drive/My\ Drive/ML_Assignment
!pwd

drive  sample_data
/content/drive/My Drive/ML_Assignment
/content/drive/My Drive/ML_Assignment


In [4]:
# Downloading the data
!wget https://www.dropbox.com/s/tp3l54tnatvbldf/bbc.csv?dl=0 -O 'bbc.csv'

--2021-11-19 15:02:14--  https://www.dropbox.com/s/tp3l54tnatvbldf/bbc.csv?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.80.18, 2620:100:601b:18::a27d:812
Connecting to www.dropbox.com (www.dropbox.com)|162.125.80.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/tp3l54tnatvbldf/bbc.csv [following]
--2021-11-19 15:02:15--  https://www.dropbox.com/s/raw/tp3l54tnatvbldf/bbc.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc011dd9547997ca36b2ec7feb39.dl.dropboxusercontent.com/cd/0/inline/BaRlaSHaoWJ0BGN7_ectOQNhLLE2MnH_e-BZ-WrJYfSQxnkjKliyPMomNXpcyvm3qJWTZ7d0fM8CNAApBYVRehz5IIwXc72H6Y25CPlnJ7WQezMqzQvKQ7bse12cbG7Cv86JaWrU6OfPMPy_UGt1D9DZ/file# [following]
--2021-11-19 15:02:16--  https://uc011dd9547997ca36b2ec7feb39.dl.dropboxusercontent.com/cd/0/inline/BaRlaSHaoWJ0BGN7_ectOQNhLLE2MnH_e-BZ-WrJYfSQxnkjKliyPMomNXpcyvm3qJWTZ7d0fM8CNAApBYVRehz5IIwXc72H6Y

In [21]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from tensorflow.keras.optimizers import Adam
from keras.layers import LSTM, GRU, Conv1D, MaxPool1D, Activation, Add

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import io, os, gc

In [6]:
# loading data
df = pd.read_csv("bbc.csv")
print(df.columns)
le = LabelEncoder()
input_labels = le.fit_transform(df['Class'])
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)
print(df.head())

Index(['Unnamed: 0', 'Article', 'Class'], dtype='object')
{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}
   Unnamed: 0                                            Article     Class
0           0  Ad sales boost Time Warner profit\n\nQuarterly...  business
1           1  Dollar gains on Greenspan speech\n\nThe dollar...  business
2           2  Yukos unit buyer faces loan claim\n\nThe owner...  business
3           3  High fuel prices hit BA's profits\n\nBritish A...  business
4           4  Pernod takeover talk lifts Domecq\n\nShares in...  business


In [7]:
# Pre-processing data
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.porter import PorterStemmer
import re
porter_stemmer = PorterStemmer()
doc_list = df['Article'].tolist()
processed_list = []
for doc in doc_list:
  doc_str = re.sub(r'[^\w\s]', '', remove_stopwords(doc))
  doc_str_stem_lst = [porter_stemmer.stem(word) for word in doc_str.split(" ")]
  doc_str_stem = " ".join(doc_str_stem_lst)
  processed_list.append(doc_str_stem)

In [11]:
# Defining the tokenizer
def get_tokenizer(data):
  print('Training tokenizer...')
  tokenizer = Tokenizer()
  print('Read {} Sentences'.format(len(data)))
  tokenizer.fit_on_texts(data)
  return tokenizer

In [9]:
def get_data(tokenizer, MAX_LENGTH, input_data, input_labels):
  print('Loading data')
  
  assert len(input_data) == len(input_labels)
  sequences = tokenizer.texts_to_sequences(input_data)
  X = pad_sequences(sequences, maxlen=MAX_LENGTH)
  Y_bcc = np.array(input_labels)

  return X, Y_bcc

In [12]:
tokenizer = get_tokenizer(processed_list)

Training tokenizer...
Read 1912 Sentences


In [14]:
MAX_LENGTH = 50
# read ml data
X, Y_bcc = get_data(tokenizer, MAX_LENGTH, processed_list, input_labels)

Loading data


In [18]:
# Creating one-hot encodings
y_bcc_labels = keras.utils.np_utils.to_categorical(Y_bcc)
print(y_bcc_labels)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [24]:
# Splitting data into train, val and test
train1_X, test_X, train1_Y, test_Y = train_test_split(X, y_bcc_labels, test_size=0.2, random_state=43)
train_X, val_X, train_Y, val_Y = train_test_split(train1_X, train1_Y, test_size=0.1, random_state=43)

In [25]:
# Setting hyper-parameters
NUM_CLASSES = 5

MAX_SEQUENCE_LENGTH = MAX_LENGTH

MAX_NUM_WORDS = len(tokenizer.word_index) + 1

NUM_EMBEDDING_DIM = 100

In [63]:
# Defining the model
print('Getting Text FF NN')
input_layer = Input(
    shape=(MAX_SEQUENCE_LENGTH, ), 
    dtype='int32')
embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM, trainable=True)
embedded_input = embedding_layer(input_layer)
flatten_input = Flatten()(embedded_input)
dense_layer_1 = Dense(2048, activation='relu')
drop_1 = Dropout(0.1)
dense_layer_2 = Dense(1024, activation='relu')
drop_2 = Dropout(0.1)
dense_layer_3 = Dense(512, activation='relu')
drop_3 = Dropout(0.1)
dense_layer_4 = Dense(5, activation='softmax')
output_layer = dense_layer_4(drop_3(dense_layer_3(drop_2(dense_layer_2(drop_1(dense_layer_1(flatten_input)))))))
model = Model(
    inputs=input_layer, 
    outputs=output_layer)
print(model.summary())

Getting Text FF NN
Model: "model_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_14 (InputLayer)       [(None, 50)]              0         
                                                                 
 embedding_13 (Embedding)    (None, 50, 100)           2253200   
                                                                 
 flatten_13 (Flatten)        (None, 5000)              0         
                                                                 
 dense_48 (Dense)            (None, 2048)              10242048  
                                                                 
 dropout_28 (Dropout)        (None, 2048)              0         
                                                                 
 dense_49 (Dense)            (None, 1024)              2098176   
                                                                 
 dropout_29 (Dropout)        (None, 102

In [64]:
from tensorflow.keras.optimizers import Adam
lr = 1e-3
opt = Adam(learning_rate=lr, decay=lr/50)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [65]:
BATCH_SIZE = 512
NUM_EPOCHS = 100
stop = [EarlyStopping(monitor='val_loss', patience=1)]
history = model.fit(x=train_X,
                    y=train_Y,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      val_X, 
                      val_Y
                    ),
                    shuffle=True,
                    callbacks=stop,
          )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


In [66]:
# Getting predictions
from sklearn import metrics
from sklearn.metrics import classification_report
predictions = model.predict(test_X)

In [72]:
y_pred = [idx for idx in np.argmax(predictions, axis=1)]
y_true = [idx for idx in np.argmax(test_Y, axis=1)]
print('BBC News Accuracy is')
print(metrics.accuracy_score(y_true, y_pred)*100)
print(classification_report(y_true, y_pred, target_names = list(le_name_mapping.keys())))

BBC News Accuracy is
57.180156657963444
               precision    recall  f1-score   support

     business       0.55      0.80      0.65        79
entertainment       0.59      0.68      0.64        69
     politics       0.75      0.56      0.64        93
        sport       0.28      0.38      0.32        47
         tech       0.71      0.41      0.52        95

     accuracy                           0.57       383
    macro avg       0.58      0.57      0.55       383
 weighted avg       0.61      0.57      0.57       383

