AI - TP1_2

Bastien SAUVAT et Bastien FAISANT

# Exercise 3 : Text classification on the Ohsumed dataset

*Objective : The goal of this exercise is to realize a text classifier using deep neural networks. Your task
is to construct a classifier, using the available training set, and evaluate it using the test set. The classifier
should predict the category for the articles.*

In [121]:
import os
from collections import defaultdict
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
import pandas as pd
import numpy as np
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, Dropout, GlobalAveragePooling1D, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
from tensorflow.keras import regularizers
import re
import string
import matplotlib.pyplot as plt

## Data parsing

In [40]:
def get_info(path: str):
    data = list(os.walk(path))[1:]
    files = []
    for d in data:
        folder_name = d[0]
        for file in d[2]:
            files.append((folder_name.split('/')[-1], os.path.join(folder_name, file)))

    d = defaultdict(int)
    texts = defaultdict(list)
    for (cate, file) in files:
        with open(file, 'r') as outfile:
            text = outfile.read()
            texts[cate].append(text)
            words = text_to_word_sequence(text)
            for word in words:
                d[word] += 1
    words = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return (texts, words)

In [41]:
training_texts, training_words = get_info("./data/ohsumed-first-20000-docs/training/")
test_texts, test_words = get_info("./data/ohsumed-first-20000-docs/test/")

In [42]:
def get_df(dataset: defaultdict[any, list]):
    classes = []
    texts = []
    for classe, liste_texts in dataset.items():
        for text in liste_texts:
            texts.append(text)
            classes.append(classe)

    df = pd.DataFrame({'Classes': classes, 'Texts': texts})
    return df


In [43]:
train_set = get_df(training_texts)
test_set = get_df(test_texts)

## Data exploration

In [63]:
train_set.head()

Unnamed: 0,Classes,Texts
0,C01,Augmentation mentoplasty using Mersilene mesh....
1,C01,Multiple intracranial mucoceles associated wit...
2,C01,Replacement of an aortic valve cusp after neon...
3,C01,The value of indium 111 leukocyte scanning in ...
4,C01,Febrile infants less than eight weeks old. Pre...


In [66]:
train_set.info()
train_set.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10433 entries, 0 to 10432
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Classes  10433 non-null  object
 1   Texts    10433 non-null  object
dtypes: object(2)
memory usage: 163.1+ KB


Unnamed: 0,Classes,Texts
count,10433,10433
unique,23,6286
top,C23,Magnetic resonance imaging of radiation optic ...
freq,1799,6


In [67]:
test_set.info()
test_set.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12733 entries, 0 to 12732
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Classes  12733 non-null  object
 1   Texts    12733 non-null  object
dtypes: object(2)
memory usage: 199.1+ KB


Unnamed: 0,Classes,Texts
count,12733,12733
unique,23,7643
top,C23,The butterfly rash and the malar flush. What d...
freq,2153,7


## Pre-processing

In [45]:
english_stops = set(stopwords.words('english'))

In [111]:
def convert_classes_to_integers(classes):
    unique_classes = classes.unique()
    class_mapping = {cls: int(cls[1:]) for cls in unique_classes}
    return classes.replace(class_mapping)

In [112]:
def load_dataset(texts: defaultdict[any, list]):
    df = get_df(texts)

    x_data = df['Texts']
    y_data = df['Classes']

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    

    # Replace class name by their number
    y_data = convert_classes_to_integers(y_data)

    return x_data, y_data

In [113]:
x_train, y_train = load_dataset(training_texts)
x_test, y_test = load_dataset(test_texts)

In [114]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [115]:
# ENCODE REVIEW
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 5193 14808    77 ...     0     0     0]
 [  211  1075  8608 ...     0     0     0]
 [  553   218   365 ...     0     0     0]
 ...
 [ 4991  4992  2104 ...     0     0     0]
 [   20  5924   763 ...  1854   216  2528]
 [   86   422   553 ...     0     0     0]] 

Encoded X Test
 [[ 3705  2377  1165 ...     0     0     0]
 [  784 12827  1953 ...  1619  1710  2150]
 [  699    96  2161 ...   300   400   160]
 ...
 [  479  1571  4739 ...   194    77 10236]
 [  184   163   279 ...   346   184   279]
 [  835  7339  2064 ...     0     0     0]] 

Maximum review length:  112


## Create the model

In [122]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

# model = Sequential()
# model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
# model.add(LSTM(LSTM_OUT))
# #todo : modify activation and/or optimizer to improve accuracy
# model.add(Dense(1, activation='sigmoid'))
# model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# print(model.summary())

model = tf.keras.Sequential([
  Embedding(total_words, EMBED_DIM, input_length = max_length),
  Dropout(0.2),
  GlobalAveragePooling1D(),
  Dropout(0.2),
  Dense(1)])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 112, 32)           905888    
                                                                 
 dropout (Dropout)           (None, 112, 32)           0         
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 905,921
Trainable params: 905,921


Non-trainable params: 0
_________________________________________________________________


## Train the Model

In [125]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [126]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [127]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5


Epoch 1: accuracy improved from -inf to 0.00000, saving model to models\LSTM.h5
Epoch 2/5
Epoch 2: accuracy improved from 0.00000 to 0.00220, saving model to models\LSTM.h5
Epoch 3/5
Epoch 3: accuracy improved from 0.00220 to 0.03077, saving model to models\LSTM.h5
Epoch 4/5
Epoch 4: accuracy improved from 0.03077 to 0.04054, saving model to models\LSTM.h5
Epoch 5/5
Epoch 5: accuracy did not improve from 0.04054


<keras.callbacks.History at 0x2aa1e24c970>

## Testing

In [119]:
y_pred = model.predict(x_test, batch_size = 128)
y_pred = np.round(y_pred)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 506
Wrong Prediction: 12227
Accuracy: 3.973926019005733
