In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import nltk
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [1]:
from warnings import filterwarnings
filterwarnings("ignore")

# Fetch data

In [1]:
a = open('/content/drive/MyDrive/SH1.txt')

In [2]:
w = a.read()

# Preprocessing

In [5]:
import re
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

words = []
for i in w.split(" "):
    upr = i.upper()
    splch = re.sub("[^A-Z0-9]","",upr)
    stemmed = ps.stem(splch)
    lem = lm.lemmatize(stemmed)
    words.append(lem.upper())

# Convert data into X and Y format

In [6]:
str_X = []
str_Y = []

In [7]:
len(words)

3271

In [8]:
for i in range(3,len(words),1):
    fw = words[i-3]
    sw = words[i-2]
    tw = words[i-1]
    fnw = words[i]
    str_X.append(str(fw) + " " + str(sw) + " " + str(tw))
    str_Y.append(str(fnw))

In [9]:
import pandas as pd
Q = pd.DataFrame([str_X,str_Y]).T
Q.columns = ["X","Y"]

In [10]:
Q.head()

Unnamed: 0,X,Y
0,PROJECT GUTENBERG THE,ADVENTUR
1,GUTENBERG THE ADVENTUR,OF
2,THE ADVENTUR OF,SHERLOCK
3,ADVENTUR OF SHERLOCK,HOLM
4,OF SHERLOCK HOLM,BY


# convert data into an array

In [11]:
from numpy import unique
uwords = unique(words)

In [12]:
len(uwords)

1241

In [13]:
len(str_X)

3268

# create blank arrays

In [14]:
import numpy as np
X_arr = np.zeros((len(str_X),len(uwords),3),dtype=bool)
Y_arr = np.zeros((len(str_X),len(uwords)),dtype=bool)

# Create a dictionary containing words and its position

In [15]:
word_position_finder = {}
for i in range(0,len(uwords),1):
    word_position_finder[uwords[i]] = i

In [16]:
for i,j in enumerate(str_X):
    for j,k in enumerate(j.split(" ")):
        pos = word_position_finder[k]
        X_arr[i,pos,j]=1

In [17]:
for i,j in enumerate(str_Y):
    pos = word_position_finder[j]
    Y_arr[i,pos]=1

In [18]:
Y_arr

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

# Create neural network

In [19]:
from keras.layers import Dense,LSTM
from keras.models import Sequential

In [20]:
len(uwords)

1241

In [21]:
nn = Sequential()
nn.add(LSTM(64, input_shape=(len(uwords),3)))
nn.add(Dense(len(uwords), activation='softmax'))

In [22]:
nn.compile(optimizer='adam', loss="categorical_crossentropy",  metrics=['accuracy'])
nn.fit(X_arr, Y_arr, epochs=5, batch_size=100)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f6c702b2fd0>

# Prediction

In [23]:
input_X_arr = np.zeros((1,len(uwords),3),dtype=bool)

In [27]:
sent = input("Enter a sentence [3 words only]: ")

Enter a sentence [3 words only]: PROJECT GUTENBERG THE


In [28]:
sent = sent.upper()

In [29]:
for j,k in enumerate(sent.split(" ")):
    pos = word_position_finder[k]
    input_X_arr[0,pos,j]=1

In [30]:
R = pd.DataFrame([uwords,model.predict(input_X_arr)[0]]).T
R.columns= ["word","prob"]

In [31]:
pred = R.sort_values(by="prob",ascending=False).head(1).word.values[0]

In [32]:
print("Predicted value is --> ",pred)

Predicted value is -->  THE
