# 10 Questions 

In [48]:
import tensorflow as tf
from tensorflow.keras.optimizers import SGD, Adam, Nadam, RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Activation, Dropout, Flatten, LSTM
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import text_to_word_sequence, one_hot, Tokenizer
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.applications import Xception
from tensorflow.keras import regularizers
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing import sequence
import numpy as np
import pandas as pd
import cv2
import os
import glob
import math
import matplotlib.pyplot as plt
import pickle




In [49]:
# Sample questions and intents
train_df = pd.read_excel(f"Questions.xlsx") # Loading a excel file with headers
train_df.head()

Unnamed: 0,Question,scientific contributions,affiliations and locations,awards and recognitions,biography,influences and impact
0,What did Marie Curie discover?,1,0,0,0,0
1,What is radium?,1,0,0,0,0
2,What is polonium?,1,0,0,0,0
3,How did Marie Curie study radioactivity?,1,0,0,0,0
4,What was Marie Curie's role in developing X-ra...,1,0,0,0,0


In [50]:
train_df = train_df.sample(frac=1)
train_df.head()

Unnamed: 0,Question,scientific contributions,affiliations and locations,awards and recognitions,biography,influences and impact
149,What was the significance of Marie Curie’s wor...,0,0,0,0,1
88,What awards did Marie Curie receive for her wo...,0,0,1,0,0
72,What recognitions did Marie Curie receive for ...,0,0,1,0,0
87,How did Marie Curie’s achievements shape her l...,0,0,1,0,0
98,What accolades did Marie Curie receive from ac...,0,0,1,0,0


In [51]:
X_train = train_df["Question"].fillna("fillna").values
Y_train = train_df[[ 
    "scientific contributions",
    "affiliations and locations",
    "awards and recognitions",
    "biography",
    "influences and impact"
    ]].values

In [52]:
print("Shape of X_train:",X_train.shape)
print("Shape of Y_train:",Y_train.shape)

Shape of X_train: (190,)
Shape of Y_train: (190, 5)


In [53]:
Tokenizer = Tokenizer()
texts = X_train
Tokenizer.fit_on_texts(texts)
Tokenizer_vocab_size = len(Tokenizer.word_index) + 1
print("Tokenizer vocabulary size:",Tokenizer_vocab_size)

Tokenizer vocabulary size: 216


In [54]:
len(max(X_train,key=len))

86

In [55]:
maxWordCount= 100
maxDictionary_size=Tokenizer_vocab_size

num_test_samples = 50 # Test samples for validation


# Phase 1: Setting up data for training
X_train = X_train[num_test_samples:] # 50 samples to n ----> Sentence (Input)
Y_train = Y_train[num_test_samples:] # 50 samples to n ----> Labels (Output)

# Phase 2: Setting up data for validation
X_val = X_train[:num_test_samples] # First 51 Samples --> Sentence (Input)
Y_val = Y_train[:num_test_samples] # First 51 Samples --> Labels (Output)

print("(Input->Question) Length of X_train:",X_train.shape) # Input -> Input
print("(output->Labels) Length of Y_train:",Y_train.shape) # output -> Labels

(Input->Question) Length of X_train: (140,)
(output->Labels) Length of Y_train: (140, 5)


In [56]:
X_train_encoded_words = Tokenizer.texts_to_sequences(X_train)
X_val_encoded_words = Tokenizer.texts_to_sequences(X_val)

X_train_encoded_padded_words = sequence.pad_sequences(X_train_encoded_words, maxlen=maxWordCount)
X_val_encoded_padded_words = sequence.pad_sequences(X_val_encoded_words, maxlen=maxWordCount)

# LSTM Model

In [57]:
#model
model = Sequential()

model.add(Embedding(maxDictionary_size, 32, input_length=maxWordCount)) #to change words to ints

#hidden layers
model.add(LSTM(10))

model.add(Dropout(0.5))

model.add(Dense(units=1200, activation='relu', kernel_constraint=max_norm(1)))

model.add(Dropout(0.5))

model.add(Dense(units=500, activation='relu', kernel_constraint=max_norm(1)))

 #output layer
model.add(Dense(5, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 100, 32)           6912      
                                                                 
 lstm_2 (LSTM)               (None, 10)                1720      
                                                                 
 dropout_3 (Dropout)         (None, 10)                0         
                                                                 
 dense_6 (Dense)             (None, 1200)              13200     
                                                                 
 dropout_4 (Dropout)         (None, 1200)              0         
                                                                 
 dense_7 (Dense)             (None, 500)               600500    
                                                                 
 dense_8 (Dense)             (None, 5)                

# Train Model

In [64]:

epochs = 25
batch_size = 32 #32

nadam = Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

# Compile model
model.compile(loss='categorical_crossentropy', optimizer=nadam, metrics=['accuracy'])

history  = model.fit(X_train_encoded_padded_words,Y_train, epochs = epochs, batch_size=batch_size, verbose=1,
                    validation_data=(X_val_encoded_padded_words, Y_val))

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [66]:
# Tokenization and padding process
phrase = "What did Marie Curie discover?"
tokens = Tokenizer.texts_to_sequences([phrase])
tokens = pad_sequences(tokens, maxlen=100)
prediction = model.predict(np.array(tokens))

i,j = np.where(prediction == prediction.max()) #calculates the index of the maximum element of the array across all axis
# i->rows, j->columns
i = int(i)
j = int(j)

print(prediction)
total_possible_outcomes = [    
    "scientific contributions",
    "affiliations and locations",
    "awards and recognitions",
    "biography",
    "influences and impact"
    ]
print("Result:",total_possible_outcomes[j])

[[9.9658465e-01 2.4312774e-03 5.0097728e-06 3.5526366e-06 9.7547309e-04]]
Result: scientific contributions


In [67]:

model.save('Trained Models/intent_classification_model.h5')


In [68]:
model.save_weights('Trained Models/intent_classification_weights.h5')


In [69]:
with open('Trained Models/tokenizer.pkl', 'wb') as f:
    pickle.dump(Tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)