In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten,Dropout
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import io
from tensorflow.keras.layers import LSTM,Bidirectional
import pandas as pd
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.models import load_model
import pickle
from sklearn.utils import shuffle
from keras.optimizers import SGD
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from keras.layers import LeakyReLU

Using TensorFlow backend.


In [2]:
# --- Category task: load the sheet, split, tokenize and one-hot encode ------
# Split boundaries are named once so the X and Y slices cannot drift apart.
TRAIN_SIZE = 12000      # rows [0, TRAIN_SIZE) are used for training
TEST_END = 14814        # rows [TRAIN_SIZE, TEST_END) are held out for validation
NUM_CATEGORIES = 5      # width of the one-hot category target

data = pd.read_excel('dataset.xlsx')

# Iterate over the column values instead of data[col][row]: the row-label
# lookup only works while the frame keeps its default 0..n-1 index.
sentences = [str(subject).lower() for subject in data['Subject']]
labels = [str(category).lower() for category in data['Category']]

X = np.array(sentences)
Y = np.array(labels)
X, Y = shuffle(X, Y, random_state=0)
X_train = X[0:TRAIN_SIZE]
X_test = X[TRAIN_SIZE:TEST_END]

# Y is kept as a column vector, as before, in case other cells read it.
Y = Y.reshape(Y.shape[0], 1)
Y_train = Y[0:TRAIN_SIZE, :]
Y_test = Y[TRAIN_SIZE:TEST_END, :]

# The vocabulary is learned from the training texts only; punctuation and
# digits are stripped and unseen words map to the <OOV> token.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789',oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train, padding='post')
vocab_size = len(word_index) + 1

# Pad the test sequences to the width of the padded training matrix.
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, padding='post', maxlen=X_train.shape[1])

# Fit the label encoder on the training labels ONLY and reuse that mapping for
# the test labels. Re-fitting on the test split can assign different integers
# to the same class whenever the two splits do not contain the same label set.
# transform() raises ValueError if the test split holds a label never seen in
# training, which is preferable to silently mis-scoring the validation set.
label_encoder = LabelEncoder()
Y_train = label_encoder.fit_transform(Y_train.ravel())   # ravel: LabelEncoder expects 1-D input
Y_train = to_categorical(Y_train, num_classes=NUM_CATEGORIES)
Y_test = label_encoder.transform(Y_test.ravel())
Y_test = to_categorical(Y_test, num_classes=NUM_CATEGORIES)

In [3]:
# with open('tokenizer_category.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
def keras_model(X_train, vocab_size=None, num_classes=5):
    """Build and compile the category classifier.

    The network is Embedding -> GlobalAveragePooling1D -> Dense(16, relu)
    -> Dense(num_classes, softmax), compiled with categorical cross-entropy,
    the Adam optimizer and the accuracy metric. The model summary is printed
    as a side effect.

    Args:
        X_train: padded training sequences. Only the length of the first row
            is read, to set the Embedding input length.
        vocab_size: number of rows in the embedding table. When omitted, it
            falls back to ``len(word_index) + 1`` using the notebook-level
            ``word_index``, which is what this function always did before.
        num_classes: number of output units of the softmax layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    max_len = len(X_train[0])
    embedding_dim = 16
    if vocab_size is None:
        # Backward-compatible default: derive the size from the global
        # tokenizer vocabulary (+1 because index 0 is reserved for padding
        # by the Keras Tokenizer).
        vocab_size = len(word_index) + 1
    model = Sequential()
    model.add(Embedding(vocab_size,embedding_dim,input_length=max_len))
    # Average the token embeddings so the classifier sees one fixed-size
    # vector per sentence.
    model.add(GlobalAveragePooling1D())
    model.add(Dense(16,activation = 'relu'))
    model.add(Dense(num_classes,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    model.summary()
    return model

In [6]:
# Fit the category classifier; the held-out split is scored after every epoch.
num_epochs = 15
model = keras_model(X_train)
history = model.fit(
    x=X_train,
    y=Y_train,
    epochs=num_epochs,
    validation_data=(X_test, Y_test),
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 24, 16)            42208     
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 85        
Total params: 42,565
Trainable params: 42,565
Non-trainable params: 0
_________________________________________________________________
Train on 12000 samples, validate on 2814 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [7]:
#model.save('category_model.h5')

In [8]:
# --- Product task: same subjects, different target column -------------------
# Category index -> display name.
# NOTE(review): the order looks like the alphabetical order LabelEncoder would
# assign to the lower-cased category labels - confirm before relying on it.
class_dict = {0:"Active Again",1:"Easy to Bank",2:"Onboarding",3:"Regulatory & Mandatory",4:"Take More"}

# Split boundaries, named so the X and Y slices cannot drift apart.
TRAIN_SIZE = 12000      # rows [0, TRAIN_SIZE) are used for training
TEST_END = 14814        # rows [TRAIN_SIZE, TEST_END) are held out for validation
NUM_PRODUCTS = 8        # width of the one-hot product target

# str() first, like the Subject/Category columns: a bare .lower() raises
# AttributeError on any non-string cell (for example an empty/NaN cell).
products = [str(product).lower() for product in data['Product']]
X_prod = np.array(sentences)
Y_prod = np.array(products)
# Same random_state as the category split, so rows are shuffled identically.
X_prod, Y_prod = shuffle(X_prod, Y_prod, random_state=0)
X_train_prod = X_prod[0:TRAIN_SIZE]
X_test_prod = X_prod[TRAIN_SIZE:TEST_END]

# Y_prod is kept as a column vector, as before, in case other cells read it.
Y_prod = Y_prod.reshape(Y_prod.shape[0], 1)
Y_train_prod = Y_prod[0:TRAIN_SIZE, :]
Y_test_prod = Y_prod[TRAIN_SIZE:TEST_END, :]

# Vocabulary is learned from the training texts only; punctuation and digits
# are stripped and unseen words map to the <OOV> token.
tokenizer_k = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n0123456789',oov_token="<OOV>")
tokenizer_k.fit_on_texts(X_train_prod)
word_index_k = tokenizer_k.word_index
X_train_prod = tokenizer_k.texts_to_sequences(X_train_prod)
X_train_prod = pad_sequences(X_train_prod, padding='post')
vocab_size_k = len(word_index_k) + 1

# Pad the test sequences to the width of the padded training matrix.
X_test_prod = tokenizer_k.texts_to_sequences(X_test_prod)
X_test_prod = pad_sequences(X_test_prod, padding='post', maxlen=X_train_prod.shape[1])

# Fit the encoder on the training labels ONLY and reuse the mapping for the
# test labels; re-fitting on the test split can give the same product a
# different integer when the two splits do not contain the same label set.
# transform() raises ValueError on a product never seen in training.
# Note: this rebinds `label_encoder`, replacing the category encoder.
label_encoder = LabelEncoder()
Y_train_prod = label_encoder.fit_transform(Y_train_prod.ravel())   # ravel: LabelEncoder expects 1-D input
Y_train_prod = to_categorical(Y_train_prod, num_classes=NUM_PRODUCTS)
Y_test_prod = label_encoder.transform(Y_test_prod.ravel())
Y_test_prod = to_categorical(Y_test_prod, num_classes=NUM_PRODUCTS)



In [9]:
# with open('tokenizer_product.pickle', 'wb') as handle:
#     pickle.dump(tokenizer_k, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def keras_prod_model(X_train_prod,vocab_size):
    """Assemble and compile the product classifier.

    Args:
        X_train_prod: padded training sequences; only the length of the first
            row is read, to set the Embedding input length.
        vocab_size: number of rows in the embedding table.

    Returns:
        The compiled Sequential model. Its summary is printed as a side effect.
    """
    sequence_length = len(X_train_prod[0])
    embedding_width = 16
    layer_stack = [
        Embedding(vocab_size, embedding_width, input_length=sequence_length),
        # Collapse the per-token embeddings into one averaged vector.
        GlobalAveragePooling1D(),
        Dense(16, activation='relu'),
        # One softmax unit per product class.
        Dense(8, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    network.summary()
    return network

In [11]:
# Fit the product classifier; the held-out split is scored after every epoch.
num_epochs = 15
prod_model = keras_prod_model(X_train_prod, vocab_size_k)
prod_history = prod_model.fit(
    x=X_train_prod,
    y=Y_train_prod,
    epochs=num_epochs,
    validation_data=(X_test_prod, Y_test_prod),
)


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 24, 16)            42208     
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 136       
Total params: 42,616
Trainable params: 42,616
Non-trainable params: 0
_________________________________________________________________
Train on 12000 samples, validate on 2814 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [12]:
#prod_model.save("product_model.h5")

In [13]:
# e1 = model.layers[0]
# weights1 = e1.get_weights()[0]
# print(weights1.shape)
# e2 = prod_model.layers[0]
# weights2 = e2.get_weights()[0]
# print(weights2.shape)

# reverse_word_index1 = dict([(value,key) for (key,value) in word_index.items()])
# reverse_word_index2 = dict([(value,key) for (key,value) in word_index_k.items()])

(2638, 16)
(2638, 16)


In [15]:
# out_v = io.open('vecs1.tsv','w',encoding='utf-8')
# out_m = io.open('meta1.tsv','w',encoding='utf-8')

# for word_num in range(1,weights1.shape[0]):
#     word = reverse_word_index1[word_num]
#     embeddings = weights1[word_num]
#     out_m.write(word + '\n')
#     out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')
# out_v.close()
# out_m.close()

In [17]:
# out_v2 = io.open('vecs2.tsv','w',encoding='utf-8')
# out_m2 = io.open('meta2.tsv','w',encoding='utf-8')

# for word_num in range(1,weights2.shape[0]):
#     word = reverse_word_index2[word_num]
#     embeddings = weights2[word_num]
#     out_m2.write(word + '\n')
#     out_v2.write('\t'.join([str(x) for x in embeddings]) + '\n')
# out_v2.close()
# out_m2.close()

In [None]:
def plot_graph(history,string):
    """Plot a training metric and its validation counterpart against epochs.

    Args:
        history: object whose ``.history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        string: metric name, e.g. ``"loss"`` or ``"acc"``. ``"acc"`` and
            ``"accuracy"`` are treated as aliases of each other, because
            Keras releases differ in which of the two names they log for
            ``metrics=['accuracy']``.

    Raises:
        KeyError: if neither the requested metric (nor its alias) or its
            ``val_`` counterpart is present in ``history.history``.
    """
    logs = history.history
    aliases = {"acc": "accuracy", "accuracy": "acc"}
    key = string
    # Fall back to the alias only when the requested name is absent, so a
    # history that does contain the requested key is plotted exactly as asked.
    if key not in logs and aliases.get(key) in logs:
        key = aliases[key]
    plt.plot(logs[key])
    plt.plot(logs['val_'+key])
    plt.xlabel("Epochs")
    plt.ylabel(key)
    plt.legend([key, 'val_'+key])
    plt.show()
# Draw accuracy then loss curves, first for the category model and then for
# the product model.
for run in (history, prod_history):
    for metric in ("acc", "loss"):
        plot_graph(run, metric)