# Final TP - NLP
## Classification des documents du procès des groupes américains du tabac

In [2]:
import numpy as np
import matplotlib as plt
%matplotlib inline
import os
from sklearn.model_selection import train_test_split

In [3]:
path = "Tobacco3482-OCR/"
classes = os.listdir(path)

nb = []
x = []
y = []
for cls in classes:
    files = os.listdir(path + cls)
    for file in files:
        with open(path + cls + "/" + file, 'r') as f:
            txt = f.read()
        x.append(txt)
        y.append(cls)
    nb.append(str(len(files)))
print(str(nb))
x = np.array(x)
y = np.array(y)

['120', '265', '620', '599', '261', '567', '230', '431', '188', '201']


## 2. Analyse du problème 
Le problème que nous sommes censés résoudre est de fournir une solution permettant de classer les documents en plusieurs classes.

In [4]:
# To replace the \n with space
for i in range(x.shape[0]):
    x[i] = x[i].replace("\n", " ")
# print(x)
x_token = []
for text in x:
    tokens = text.split()
    x_token.append(tokens)
#print(x[0])
#print(x_token[0])

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_token, y, test_size=0.2)

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
# Create document vectors
# YOUR CODE HERE
vectorizer = CountVectorizer(max_features=2000)
vectorizer.fit(x_train)
x_train_counts = vectorizer.transform(x_train)
x_test_counts = vectorizer.transform(x_test)

from sklearn.feature_extraction.text import TfidfTransformer
# With TF-IDF representation
tf_transformer = TfidfTransformer()
tfidf = tf_transformer.fit(x_train_counts)
x_train_tf = tfidf.transform(x_train_counts)
x_test_tf = tfidf.transform(x_test_counts)


In [10]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train_tf, y_train)
res = clf.score(x_test_tf, y_test)
print('Accuray of NB: ' + str(res))

Accuray of NB: 0.6857962697274032


In [11]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
# YOUR CODE HERE
y_pred = clf.predict(x_test_tf)
report = classification_report(y_test, y_pred)
print(report)
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

               precision    recall  f1-score   support

Advertisement       0.59      0.39      0.47        33
        Email       0.85      0.95      0.90       110
         Form       0.65      0.81      0.72        91
       Letter       0.57      0.79      0.66       109
         Memo       0.60      0.77      0.67       141
         News       0.91      0.64      0.75        33
         Note       1.00      0.02      0.04        49
       Report       0.73      0.16      0.26        50
       Resume       0.96      1.00      0.98        23
   Scientific       0.80      0.67      0.73        58

    micro avg       0.69      0.69      0.69       697
    macro avg       0.77      0.62      0.62       697
 weighted avg       0.72      0.69      0.65       697

[[ 13   2   8   4   5   1   0   0   0   0]
 [  0 105   0   3   2   0   0   0   0   0]
 [  2   0  74   4   9   0   0   0   0   2]
 [  0   1   0  86  20   0   0   0   0   2]
 [  1   6   3  23 108   0   0   0   0   0]
 [  1   0   

In [39]:
import ast
import os
from nn_utils import TrainingHistory
from keras.layers import Dense, Embedding, Input
from keras.layers import GRU, Dropout, MaxPooling1D, Conv1D, Flatten, Reshape
from keras.models import Model
import numpy as np
import itertools
from keras.utils import np_utils
from sklearn.metrics import (classification_report, 
                             precision_recall_fscore_support, 
                             accuracy_score)

from keras.preprocessing import text, sequence

In [40]:
# Model parameters
MAX_FEATURES = 10000
MAX_TEXT_LENGTH = 2000
#EMBED_SIZE  = 300
BATCH_SIZE = 16
EPOCHS = 10
VALIDATION_SPLIT = 0.1
NB_CLASS = len(np.unique(y_train))

In [61]:
x_train_tf = x_train_tf.toarray()
x_train_tf = np.reshape(x_train_tf, (x_train_tf.shape[0], x_train_tf.shape[1], 1))
x_test_tf = np.reshape(x_test_tf.toarray(), (x_test_tf.shape[0], x_test_tf.shape[1], 1))


In [63]:
def get_train_test(train_raw_text, test_raw_text):
    
    tokenizer = text.Tokenizer(num_words=MAX_FEATURES)

    tokenizer.fit_on_texts(list(train_raw_text))
    train_tokenized = tokenizer.texts_to_sequences(train_raw_text)
    test_tokenized = tokenizer.texts_to_sequences(test_raw_text)
    return sequence.pad_sequences(train_tokenized, maxlen=MAX_TEXT_LENGTH), \
           sequence.pad_sequences(test_tokenized, maxlen=MAX_TEXT_LENGTH)



def get_model():

    inp = Input(shape=(MAX_TEXT_LENGTH,1))
    #model = Embedding(MAX_FEATURES, EMBED_SIZE)(inp)
    model = Dropout(0.5)(inp)
    model = Conv1D(filters=32, kernel_size=2, padding='same', activation='relu')(model)
    model = MaxPooling1D(pool_size=2)(model)
    model = Flatten()(model)
    model = Dense(1024, activation='relu')(model)
    model = Dropout(0.5)(model)
    model = Dense(NB_CLASS, activation="softmax")(model)
    model = Model(inputs=inp, outputs=model)
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model


def train_fit_predict(model, x_train, x_test, y, history):
    
    model.fit(x_train, y,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS, verbose=1,
              validation_split=VALIDATION_SPLIT)

    return model.predict(x_test)


# Get the list of different classes
CLASSES_LIST = np.unique(y_train)
n_out = len(CLASSES_LIST)
print(CLASSES_LIST)

# Convert clas string to index
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(CLASSES_LIST)
y_train = le.transform(y_train) 
y_test = le.transform(y_test) 
train_y_cat = np_utils.to_categorical(y_train, n_out)

# get the textual data in the correct format for NN
#x_vec_train, x_vec_test = get_train_test(x_train, x_test)
#print(len(x_vec_train), len(x_vec_test))

# define the NN topology
model = get_model()

# Define training procedure
history = TrainingHistory(x_test_tf, y_test, CLASSES_LIST)

# Train and predict
y_predicted = train_fit_predict(model, x_train_tf, x_test_tf, train_y_cat, history).argmax(1)


print("Test Accuracy:", accuracy_score(y_test, y_predicted))

p, r, f1, s = precision_recall_fscore_support(y_test, y_predicted, 
                                              average='micro',
                                              labels=[x for x in 
                                                      np.unique(y_train) 
                                                      if x not in ['CSDECMOTV']])

print('p r f1 %.1f %.2f %.3f' % (np.average(p, weights=s)*100.0, 
                                 np.average(r, weights=s)*100.0, 
                                 np.average(f1, weights=s)*100.0))


print(classification_report(y_test, y_predicted, labels=[x for x in 
                                                       np.unique(y_train) 
                                                       if x not in ['CSDECMOTV']]))

[0 1 2 3 4 5 6 7 8 9]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_15 (InputLayer)        (None, 2000, 1)           0         
_________________________________________________________________
dropout_13 (Dropout)         (None, 2000, 1)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 2000, 32)          96        
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 1000, 32)          0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 32000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 1024)              32769024  
_________________________________________________________________
dropout_14 (Dropout)         (None, 1024)             

In [None]:
def get_train_test(train_raw_text, test_raw_text):
    
    tokenizer = text.Tokenizer(num_words=MAX_FEATURES)

    tokenizer.fit_on_texts(list(train_raw_text))
    train_tokenized = tokenizer.texts_to_sequences(train_raw_text)
    test_tokenized = tokenizer.texts_to_sequences(test_raw_text)
    return sequence.pad_sequences(train_tokenized, maxlen=MAX_TEXT_LENGTH), \
           sequence.pad_sequences(test_tokenized, maxlen=MAX_TEXT_LENGTH)



def get_model_1():

    inp = Input(shape=(MAX_TEXT_LENGTH,))
    model = Embedding(MAX_FEATURES + 1,
                      EMBED_SIZE,
                      weights=[embedding_matrix],
                      input_length=MAX_TEXT_LENGTH,
                      trainable=False)(inp)
    model = Dropout(0.5)(inp)
    model = Conv1D(filters=32, kernel_size=2, padding='same', activation='relu')(model)
    model = MaxPooling1D(pool_size=2)(model)
    model = Flatten()(model)
    model = Dense(1024, activation='relu')(model)
    model = Dropout(0.5)(model)
    model = Dense(NB_CLASS, activation="softmax")(model)
    model = Model(inputs=inp, outputs=model)
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [64]:
EMBED_SIZE = len(x_train)

x_vec_train, x_vec_test = get_train_test(x_train, x_test)
model = get_model()

# Define training procedure
history = TrainingHistory(x_vec_test, y_test, CLASSES_LIST)

# Train and predict
y_predicted = train_fit_predict(model, x_vec_train, x_vec_test, train_y_cat, history).argmax(1)


print("Test Accuracy:", accuracy_score(y_test, y_predicted))

p, r, f1, s = precision_recall_fscore_support(y_test, y_predicted, 
                                              average='micro',
                                              labels=[x for x in 
                                                      np.unique(y_train) 
                                                      if x not in ['CSDECMOTV']])

print('p r f1 %.1f %.2f %.3f' % (np.average(p, weights=s)*100.0, 
                                 np.average(r, weights=s)*100.0, 
                                 np.average(f1, weights=s)*100.0))


print(classification_report(y_test, y_predicted, labels=[x for x in 
                                                       np.unique(y_train) 
                                                       if x not in ['CSDECMOTV']]))