In [3]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Keras
import tensorflow as tf
from keras import layers
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import SGD, Adam, Adadelta, RMSprop
from keras.layers import Embedding,Dense,Dropout,GRU, LSTM
from keras.models import Sequential
import keras.backend as K
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
# Train-Test
from sklearn.model_selection import train_test_split
# Scaling data
from sklearn.preprocessing import StandardScaler
# Classification Report
from sklearn.metrics import classification_report

In [5]:
import tensorflow as tf
tf.test.gpu_device_name()

''

In [7]:
combined = pd.read_csv('../../tmp/sentence_based_data_300K.csv')
combined.head()

Unnamed: 0,text,target
0,"Thereby, the return oil is prevented from goin...",1
1,"', 'While the present inventors examined impro...",0
2,"For example, the inner core portion and the ou...",0
3,"In a case where the traffic amount is small, w...",2
4,"', 'In the twelfth aspect of the invention, ac...",0


Prepare input data

In [8]:
y = combined["target"]

In [9]:
y_cat = to_categorical(y)

In [10]:
texts = combined.text.values

In [11]:
tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(texts)

In [12]:
max_len = 512

In [13]:
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=512)

In [14]:
data.shape

(300000, 512)

In [15]:
from sklearn.model_selection import  train_test_split
X_train, X_test, y_train, y_test = train_test_split(data,y_cat,test_size=0.2,random_state=101)
X_val, X_test, y_val, y_test = train_test_split(X_test,y_test,test_size=0.5,random_state=101)

In [16]:
print(len(X_train))
print(len(X_test))
print(len(X_val))

240000
30000
30000


In [17]:
word_index = tokenizer.word_index

In [21]:
GLOVE_FILE = 'C:/Study/glove.6B.100d.txt'
f = open(GLOVE_FILE,'r',encoding="utf8")

In [22]:
embeddings_index = {}
import numpy as np
for line in f:
  values = line.split()
  word = values[0]
  embeddings_index[word] = np.asarray(values[1:], dtype='float32')
f.close()

In [23]:
embedding_matrix = np.random.random((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Normal model

In [24]:
# from keras.optimizers import Adam

model = Sequential()

model.add(Embedding(len(word_index)+1, 100, weights=[embedding_matrix],trainable=False))

model.add(LSTM(units=128,dropout = 0.5,recurrent_dropout = 0.2))

model.add(Dense(units=128,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(units=64,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(units=32,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(units=16,activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(3,activation="softmax"))
# output model skeleton
opt = Adam(learning_rate = 0.005)
model.summary()
model.compile(optimizer= opt , loss ='categorical_crossentropy',metrics=['accuracy'])
plot_model(model, to_file='/content/drive/MyDrive/dataset_patent/figures/dl_model_anno_50.png', show_shapes=True, show_layer_names=True, dpi=50)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         5669600   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 128)               16512     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                2

In [25]:
model.fit(X_train, y_train, verbose=1, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x18b5c3bd8d0>

In [26]:
model.evaluate(X_test, y_test)



[0.7037455439567566, 0.7116666436195374]

In [27]:
y_pred = model.predict(X_test)



In [28]:
print(y_pred.shape)
print(y_test.shape)

(30000, 3)
(30000, 3)


In [29]:
y_pred = (y_pred > 0.5)

In [30]:
y_pred[0]

array([False,  True, False])

In [31]:
print((classification_report(y_test, y_pred)))

              precision    recall  f1-score   support

           0       0.76      0.59      0.67      9993
           1       0.79      0.53      0.63     10051
           2       0.84      0.62      0.71      9956

   micro avg       0.80      0.58      0.67     30000
   macro avg       0.80      0.58      0.67     30000
weighted avg       0.80      0.58      0.67     30000
 samples avg       0.58      0.58      0.58     30000



  _warn_prf(average, modifier, msg_start, len(result))


Sequential model LSTM

Without using glove embeddings

In [32]:
len(word_index)

56695

In [35]:
model1 = Sequential()
# model1.add(Embedding(len(word_index)+1, 100, weights=[embedding_matrix]))
model1.add(layers.Embedding(len(word_index)+1, 100, input_length=max_len))
# model1.add(Embedding(max_words, 20))
model1.add(LSTM(64,dropout=0.5))
model1.add(Dense(3,activation='softmax'))

model1.summary()
model1.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])
#Implementing model checkpoins to save the best metric and do not lose it on training.
# checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_accuracy', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
plot_model(model1, to_file='dl_model_lstm_wo_anno_400.png', show_shapes=True, show_layer_names=True, dpi=400)
plot_model(model1, to_file='dl_model_lstm_wo_anno_50.png', show_shapes=True, show_layer_names=True, dpi=50)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 512, 100)          5669600   
                                                                 
 lstm_3 (LSTM)               (None, 64)                42240     
                                                                 
 dense_7 (Dense)             (None, 3)                 195       
                                                                 
Total params: 5,712,035
Trainable params: 5,712,035
Non-trainable params: 0
_________________________________________________________________
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.
You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [36]:
# history = model1.fit(X_train, y_train, epochs=70,validation_data=(X_test, y_test),callbacks=[checkpoint1])
history_1 = model1.fit(X_train, y_train, validation_data=(X_val, y_val), verbose=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [1]:
a = len(history_1.history["loss"])

NameError: name 'history_1' is not defined