# 1D Convolutional Neural Network Classifier

This kind of model is mostly used for image recognition, but some papers suggest that the one-dimensional version can be very effective for NLP classification.

The main problem with this algorithm is that training the network requires a lot of computation, and my computer is not powerful enough to handle this kind of training...

I think we can get very good results with this algorithm, but it will be very hard to configure (finding good hyperparameters that avoid overfitting), and our other algorithm already works well...


## Import Library

To run this algorithm you will need to install TensorFlow, Theano and Keras on your computer.
You will also need to download the GloVe embedding vectors.

In [None]:
from __future__ import print_function, division
import numpy as np
import pandas as pd
import re
import nltk

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Convolution1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Hyperparameters

In [None]:
# --- Preprocessing hyperparameters ---
# Number of samples (train and test together) kept from the raw data.
Taille_Data = 9600
# Dimension of the GloVe word vectors: 50, 100 or 200.
Word_Dimention = 100
# Documents with more than 'Max_Doc_Lenght' words are truncated.
Max_Doc_Lenght = 1000
# Vocabulary cap: only the most common words are considered.
# NOTE(review): the earlier note here said "10 000" and "110 000 actually";
# presumably the corpus holds about 110 000 distinct words - TODO confirm.
Max_NB_Word = 80000
# Proportion of the samples reserved for the test set.
r_Split = 0.2

# --- CNN hyperparameters ---
Nb_Filter = 128
Last_Hlayer_Size = 128
Batch_Size = 128
Nb_Epochs = 5

## Import Data

In [None]:
# Read data.
# `quoting` is a read_csv option that read_excel does not accept, so it is
# no longer passed here.
dataset_Healthy = pd.read_excel('Funds Articles_Healthy_New.xlsx', dtype={'Name': str})
dataset_Unhealthy = pd.read_excel('Fund Articles_Unhealthy_New.xlsx', dtype={'Name': str})

# Rename columns (this requires each sheet to have exactly two columns).
dataset_Healthy.columns = ['Articles', 'Labels']
dataset_Unhealthy.columns = ['Articles', 'Labels']

# Create labels: 1 for the healthy file, 0 for the unhealthy file.
dataset_Healthy['Labels'] = 1
dataset_Unhealthy['Labels'] = 0

# Shuffle each class before truncating so the kept rows are a random subset.
dataset_Healthy = dataset_Healthy.sample(frac=1).reset_index(drop=True)
dataset_Unhealthy = dataset_Unhealthy.sample(frac=1).reset_index(drop=True)

# Truncate: each class contributes at most half of Taille_Data rows.
# Slicing past the end of a DataFrame simply returns every row.
half_size = int(Taille_Data / 2)
dataset_Healthy = dataset_Healthy[:half_size]
dataset_Unhealthy = dataset_Unhealthy[:half_size]

# Concatenate both classes and shuffle them together.
data = pd.concat([dataset_Healthy, dataset_Unhealthy])
data = data.sample(frac=1).reset_index(drop=True)

# Train/test split.
# The split point is derived from the rows actually available, so the test
# set keeps its r_Split share even when the files hold fewer than
# Taille_Data rows. The results are named data_train / data_test because
# those are the names the preprocessing and evaluation cells read.
n_train = int(len(data) * (1 - r_Split))
data_train = data[:n_train].reset_index(drop=True)
data_test = data[n_train:].reset_index(drop=True)

## Preprocess train Data

In [None]:

# Tokenizer function
wordnet_lemmatizer = WordNetLemmatizer()
# The stop-word set gets its own name: rebinding `stopwords` would shadow the
# imported nltk corpus module and make this cell fail when it is run twice.
stop_words = set(stopwords.words('english'))
# Matches every character that is not an ASCII letter or digit.
# (The previous pattern used 1-9 and therefore also stripped the digit 0.)
_NON_ALNUM = re.compile(r'[^0-9a-zA-Z]')


def my_tokenizer(s):
    """Turn a raw article into a list of cleaned, lemmatized tokens.

    Steps, in order: replace non-alphanumeric characters with spaces,
    lowercase, split into words with nltk, drop tokens of one or two
    characters, lemmatize with WordNet, and drop English stop words.

    Args:
        s: the article text as a string.

    Returns:
        A list of token strings.
    """
    s = _NON_ALNUM.sub(' ', s)
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [t for t in tokens if len(t) > 2]
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    # Stop words are filtered after lemmatization, so the lemma is what is
    # compared against the stop-word set.
    return [t for t in tokens if t not in stop_words]

# Load the GloVe file into a {word: vector} lookup.
# Each line is split on whitespace: the first field is used as the word and
# the remaining fields are parsed as float32 coefficients.
word2vec = {}
with open('glove.6B.' + str(Word_Dimention) + 'd.txt', encoding='utf-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')

from keras.preprocessing.sequence import pad_sequences

# Create the training corpus: one cleaned, space-joined string per article.
# Iterating over the column keeps the loop in step with data_train itself
# (the previous range bound relied on names that are never defined).
corpus_train = [' '.join(my_tokenizer(article)) for article in data_train['Articles']]

# Map the tokens to integer ids and build the input sequences.
tokenizer_train = Tokenizer(num_words=Max_NB_Word)
tokenizer_train.fit_on_texts(corpus_train)
sequences_train = tokenizer_train.texts_to_sequences(corpus_train)
word_index_train = tokenizer_train.word_index

# Create the embedding matrix: row i holds the GloVe vector of the word whose
# id is i; words missing from GloVe keep an all-zero row. The extra row
# (+ 1) leaves index 0 as zeros - presumably the padding id; confirm against
# the Keras Tokenizer documentation.
embedding_matrix = np.zeros((len(word_index_train) + 1, Word_Dimention))
for word, i in word_index_train.items():
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Create the train sets: sequences padded/truncated to Max_Doc_Lenght, and
# the labels taken from the second column of data_train.
X_train = pad_sequences(sequences_train, maxlen=Max_Doc_Lenght)
y_train = data_train.iloc[:, 1].values


## Build the Neural Network

In [None]:
classifier = Sequential()

# Frozen embedding layer initialised with the GloVe matrix. The vocabulary
# size must match the number of rows of embedding_matrix, which is built from
# word_index_train.
classifier.add(Embedding(len(word_index_train) + 1, Word_Dimention, weights=[embedding_matrix], input_length=Max_Doc_Lenght, trainable=False))

# Three convolution + max-pooling stages.
# With Max_Doc_Lenght = 1000 the sequence length goes
# 1000 -> 996 -> 199 -> 195 -> 39 -> 35 (assuming Keras' default 'valid'
# padding and a pooling stride equal to pool_size), so the last pool of 35
# collapses what remains of the sequence to a single step.
# NOTE(review): the pool sizes must be revisited if Max_Doc_Lenght changes.
classifier.add(Convolution1D(Nb_Filter, 5, activation='relu'))
classifier.add(MaxPooling1D(pool_size=5))

classifier.add(Convolution1D(Nb_Filter, 5, activation='relu'))
classifier.add(MaxPooling1D(pool_size=5))

classifier.add(Convolution1D(Nb_Filter, 5, activation='relu'))
classifier.add(MaxPooling1D(pool_size=35))

classifier.add(Flatten())

# `units` is the Keras 2 name of the layer size (the Keras 1 name was
# `output_dim`); it matches the Keras 2 `epochs` argument used by fit below.
classifier.add(Dense(units=Last_Hlayer_Size, activation='relu'))
# Single sigmoid unit: binary healthy / unhealthy output.
classifier.add(Dense(units=1, activation='sigmoid'))

classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


classifier.fit(X_train, y_train, epochs=Nb_Epochs, batch_size=Batch_Size)

## Evaluate

In [None]:
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import confusion_matrix

# Build the test features: one cleaned, space-joined string per test article.
corpus_test = [' '.join(my_tokenizer(article)) for article in data_test['Articles']]

# Encode the test corpus with the tokenizer fitted on the TRAINING corpus.
# Fitting a second tokenizer on the test set would assign different ids to
# the same words, so the ids would no longer match the rows of the embedding
# matrix the network was trained with.
sequences_test = tokenizer_train.texts_to_sequences(corpus_test)

X_test = pad_sequences(sequences_test, maxlen=Max_Doc_Lenght)
y_test = data_test.iloc[:, 1].values


# Predicting the test set results: sigmoid outputs above 0.5 count as class 1.
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the confusion matrix
cm = confusion_matrix(y_test, y_pred)

## Save/Load the model

In [None]:
# Save: the architecture goes to a JSON file, the weights to an HDF5 file.
classifier_json = classifier.to_json()
with open("classifier.json", "w") as json_file:
    json_file.write(classifier_json)
# serialize weights to HDF5 (done once the JSON file has been closed)
classifier.save_weights("classifier.h5")
print("Saved classifier to disk")

# Load: rebuild the architecture from JSON, then restore the weights.
# The context manager closes the file even if reading fails.
with open('classifier.json', 'r') as json_file:
    loaded_name_json = json_file.read()
loaded_name = model_from_json(loaded_name_json)
loaded_name.load_weights("classifier.h5")