<b>Bidirectional Text Classification</b><br>
--
Coded by: Juna<br>
Dataset: <a href="https://storage.googleapis.com/kaggle-data-sets/997253/1685568/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20201224%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20201224T043147Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=793ca606abe974c1815c1884970698a8e1870974a0fe2b43b142c7c1dd8974337b074c75b09b09c912db5d71195c40dcf1279c848c6acae1dc634cde3980f062afc155a02da0fe475e066e37dc09475d05f9b7fe58dc2ec702b3a7761a85e191d11316d9e74d1fa6efa0973741a3d56af8f569f6b0e06a7fbd191dc7c59cd3169e7cbebadb9d577d4508943bd655e2b37602d3fe6b67bd2160f9b4a7075c20105bff350e145fb15da0707dba69c36c3dfa222735d31c828849721031bea3a496d4ef88d3c8081f1c8fa03bf7e8ccb7f637a42de651d3c7f514b465c5f458bcc3129497ee106df6f26e509ec41f390988b9234bd1275fa4fca350c12a081123a4">link</a> (note: this is a time-limited signed download URL for the Kaggle-hosted archive, generated on 2020-12-24 and valid for roughly three days, so it has expired; download the archive again from Kaggle to obtain <code>20newsgroup_preprocessed.csv</code>).

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
# Apply matplotlib's 'ggplot' style sheet to the figures drawn after this cell.
plt.style.use('ggplot')

In [5]:
# Plotting helper for the training curves.

def plot_history(history):
    """Draw training vs. validation accuracy and loss in two side-by-side panels.

    Parameters
    ----------
    history : object
        Object whose ``history`` attribute is a mapping containing the keys
        ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``.
        The x-axis runs from 1 to the number of ``'accuracy'`` entries.

    Returns
    -------
    None
        The curves are drawn on a new 16x5 pyplot figure.

    Raises
    ------
    KeyError
        If any of the four keys is missing; raised before a figure is created.
    """
    logs = history.history

    # Resolve every series up front so a missing key fails before plotting starts.
    panels = (
        ('Accuracy', (('training accuracy', logs['accuracy']),
                      ('validation accuracy', logs['val_accuracy']))),
        ('Loss', (('training loss', logs['loss']),
                  ('validation loss', logs['val_loss']))),
    )
    epochs = range(1, len(logs['accuracy']) + 1)

    plt.figure(figsize=(16, 5))

    for position, (heading, curves) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for legend_label, values in curves:
            plt.plot(epochs, values, label=legend_label)
        plt.title(heading)
        plt.legend()

In [6]:
# Load only the label and cleaned-text columns of the semicolon-separated CSV,
# exposing the cleaned text under the shorter column name 'text'.
df_newsgroup = (
    pd.read_csv(
        '20newsgroup_preprocessed.csv',
        sep=';',
        usecols=['target', 'text_cleaned'],
    )
    .rename(columns={'text_cleaned': 'text'})
)

In [7]:
# Preview the first rows to check the two selected columns.
df_newsgroup.head()

Unnamed: 0,target,text
0,alt.atheism,atheist resources addresses atheist organizati...
1,alt.atheism,begin pgp signed message introduction atheism ...
2,alt.atheism,article charley wingate writes well john quite...
3,alt.atheism,kings become philosophers philosophers become ...
4,alt.atheism,article bob mcgwier writes however hate econom...


In [8]:
# Fit the label encoder on the distinct values of the 'target' column.
le = LabelEncoder()
# fit() is the cell's last expression, which is why the cell echoes `LabelEncoder()`.
le.fit(df_newsgroup['target'].unique())

LabelEncoder()

In [9]:
# Replace the newsgroup names in 'target' with the integer ids assigned by `le`.
# The column is overwritten in place, so the encoding is applied only while the
# column is not yet integer-typed: re-running this cell would otherwise pass the
# already-encoded integers to an encoder fitted on strings and raise.
if not pd.api.types.is_integer_dtype(df_newsgroup['target']):
    df_newsgroup['target'] = le.transform(df_newsgroup['target'])
df_newsgroup.head()

Unnamed: 0,target,text
0,0,atheist resources addresses atheist organizati...
1,0,begin pgp signed message introduction atheism ...
2,0,article charley wingate writes well john quite...
3,0,kings become philosophers philosophers become ...
4,0,article bob mcgwier writes however hate econom...


In [10]:
# Build features and labels, then split them.
# NOTE(review): astype(str) would turn any missing text into the literal string
# 'nan' -- confirm whether the CSV contains empty texts.
X = df_newsgroup['text'].astype(str)

# One-hot encode the labels; expects 'target' to already hold integer class ids.
y = tf.keras.utils.to_categorical(
    df_newsgroup['target'],
    num_classes=df_newsgroup['target'].nunique(),
)

# 80/20 split with a fixed seed, stratified on the class id.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df_newsgroup['target'],
)

In [11]:
# Tokenize
# The vocabulary is fitted on the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# +1 because the Keras Tokenizer numbers words from 1, leaving index 0 free
# (it is used as the padding value further down).
vocab_size = len(tokenizer.word_index) + 1

In [12]:
# Convert each text into a list of word indices using the fitted tokenizer.
# The tokenizer was created without an `oov_token`, so words it did not see
# during fitting are presumably dropped from the test sequences -- verify.
train_seq = tokenizer.texts_to_sequences(X_train)
test_seq = tokenizer.texts_to_sequences(X_test)

In [15]:
# Padding
# Every sequence is padded (or cut) at its end to the length of the longest
# training sequence.
# NOTE(review): the model summary in this notebook reports that length as 17617,
# so one very long document sets the input width for all samples -- consider
# capping it.
max_length = max(len(sequence) for sequence in train_seq)

train_vector = tf.keras.preprocessing.sequence.pad_sequences(
    train_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)
test_vector = tf.keras.preprocessing.sequence.pad_sequences(
    test_seq,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [16]:
class CallBacks(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy exceeds a threshold.

    Parameters
    ----------
    acc_threshold : float, default 0.9
        Training stops at the end of the first epoch whose logged
        ``'accuracy'`` is strictly greater than this value.
    """

    def __init__(self, acc_threshold=0.9):
        super().__init__()
        self.acc_threshold = acc_threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's logged accuracy and request a stop if it is high enough.

        Parameters
        ----------
        epoch : int
            Index of the epoch that just finished (unused).
        logs : dict or None
            Metrics reported for the epoch. If it is None or has no
            ``'accuracy'`` entry, nothing happens.
        """
        # Avoid a shared mutable default and a `None > float` TypeError when the
        # metric is not being tracked.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is None:
            return

        if accuracy > self.acc_threshold:
            print(f"\nReached {self.acc_threshold} accuracy, cancelling training")
            self.model.stop_training = True

In [17]:
# Model definition

def model(vocab_size, max_length, num_classes=20, mask_zero=False):
    """Build (without compiling) the stacked bidirectional-LSTM classifier.

    Layers, in order: Embedding(64) -> BiLSTM(64, returning sequences)
    -> BiLSTM(32) -> Dense(64, relu) -> Dropout(0.5) -> Dense(softmax).

    Parameters
    ----------
    vocab_size : int
        Size of the embedding's input vocabulary.
    max_length : int
        Length of the padded input sequences.
    num_classes : int, default 20
        Width of the softmax output layer. The default keeps the original
        hard-coded value; pass the number of label classes in the data
        to keep the two in sync.
    mask_zero : bool, default False
        Forwarded to the Embedding layer. When True, index 0 is treated as
        padding and skipped by the recurrent layers; this assumes 0 is never
        a real token id. The default keeps the original behaviour.

    Returns
    -------
    tf.keras.models.Sequential
        The uncompiled network.
    """
    # Use a local name that does not shadow this function.
    network = tf.keras.models.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length, mask_zero=mask_zero),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    return network

In [18]:
# NOTE: this rebinds the name `model` from the builder function to the built
# network, so this cell cannot be re-run unless the cell defining the builder
# is run again first.
model = model(vocab_size, max_length)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 17617, 64)         8337344   
_________________________________________________________________
bidirectional (Bidirectional (None, 17617, 128)        66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 20)                1300      
Total params: 8,450,068
Trainable params: 8,450,068
Non-trainable params: 0
______________________________________________

In [21]:
# The original note here says training should use 40 epochs, but this call
# runs a single epoch.
# NOTE(review): confirm which is intended before relying on the reported scores.
# The test split is used as validation data, and the callback may stop training early.
history = model.fit(
    train_vector,
    y_train,
    epochs=1,
    validation_data=(test_vector, y_test),
    callbacks=[CallBacks()],
)



In [23]:
# Report accuracy on the training split, then on the held-out test split.
loss, accuracy = model.evaluate(train_vector, y_train, verbose=False)
print(f"Training Accuracy: {accuracy:.4f}")

loss, accuracy = model.evaluate(test_vector, y_test, verbose=False)
print(f"Testing Accuracy:  {accuracy:.4f}")

Training Accuracy: 0.7381
Testing Accuracy:  0.5985


In [25]:
# Predicted class id = index of the largest softmax output for each test row.
# `Sequential.predict_classes` was deprecated and later removed from
# TensorFlow; argmax over `predict` is its documented replacement for a
# softmax output layer.
predictions = np.argmax(model.predict(test_vector), axis=-1)
# y_test is one-hot, so argmax recovers the integer class id.
ground_truth = np.argmax(y_test, axis=1)

In [26]:
# Per-class precision, recall and F1 on the test split, one value per label in
# the encoder's class order (average=None returns one score per label).
class_ids = le.transform(le.classes_)

list_precision = [
    {'target': target_class, 'precision': precision}
    for precision, target_class in zip(
        precision_score(ground_truth, predictions, labels=class_ids, average=None),
        le.classes_,
    )
]
list_recall = list(recall_score(ground_truth, predictions, labels=class_ids, average=None))
list_f1 = list(f1_score(ground_truth, predictions, labels=class_ids, average=None))

# One row per class: target name, precision, recall, f1_score.
df_metrics = pd.DataFrame(list_precision).assign(recall=list_recall, f1_score=list_f1)

In [27]:
# Round the scores to two decimals and show the classes ordered by F1, best first.
df_metrics = df_metrics.round(2)
df_metrics.sort_values(by='f1_score', ascending=False)

Unnamed: 0,target,precision,recall,f1_score
10,rec.sport.hockey,0.89,0.78,0.83
11,sci.crypt,0.83,0.8,0.81
17,talk.politics.mideast,0.88,0.69,0.78
5,comp.windows.x,0.7,0.76,0.73
7,rec.autos,0.81,0.64,0.71
9,rec.sport.baseball,0.69,0.71,0.7
14,sci.space,0.59,0.74,0.65
12,sci.electronics,0.61,0.64,0.62
15,soc.religion.christian,0.5,0.82,0.62
6,misc.forsale,0.69,0.55,0.61
