In [1]:
import pandas as pd
import numpy as np
import string
import re
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras
from keras.layers import Dense, Embedding, Input, LSTM, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize





In [2]:
df= pd.read_csv(r"D:\Datasets\medium_data_Bidirectional.csv\medium_data.csv")

In [3]:
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [4]:
# import PIL
# import urllib.request
# img= np.array(PIL.Image.open(urllib.request.urlopen(df['url'][0]+'/'+df['image'][0])))

In [5]:
df.shape

(6508, 10)

In [6]:
# out of all these columns we are intrested in only title column 
df['title'][1:10]

1    Hands-on Graph Neural Networks with PyTorch & ...
2                         How to Use ggplot2 in Python
3    Databricks: How to Save Files in CSV on Your L...
4    A Step-by-Step Implementation of Gradient Desc...
5      An Easy Introduction to SQL for Data Scientists
6                        Hypothesis testing visualized
7    Introduction to Latent Matrix Factorization Re...
8         Which 2020 Candidate is the Best at Twitter?
9            What if AI model understanding were easy?
Name: title, dtype: object

## Data Preprocessing

In [7]:
# Remove unwanted character and words in the title

# lower the words
df['clean_data']= [i.lower() for i in df['title']]

# remove punctuations 
df['clean_data']= [i.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for i in df['clean_data']]

# replace space and escape characters
df['clean_data']= df['clean_data'].apply(lambda x: x.replace('\xa0', ' '))
df['clean_data']= df['clean_data'].apply(lambda x: x.replace('\u200a', ' '))



In [8]:
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date,clean_data
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30,a beginner’s guide to word embedding with gens...
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30,hands on graph neural networks with pytorch ...
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30,how to use ggplot2 in python
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30,databricks how to save files in csv on your l...
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30,a step by step implementation of gradient desc...


In [9]:
df['title'][124]

'Learn From My Mistake. NEVER Delete Your\xa0Website!'

In [10]:
df['clean_data'][124]

'learn from my mistake  never delete your website '

### Tokenize

In [11]:
tokenizer = Tokenizer() # For those words which are not found in word_index
tokenizer.fit_on_texts(df['clean_data'])

total_words= len(tokenizer.word_index) + 1

print('Total words: ',total_words)
print('Word: ID')
print('---------------------------------')
# print('<oov>: ', tokenizer.word_index['<oov>'])
print('strong: ', tokenizer.word_index['strong'])
print('And: ', tokenizer.word_index['and'])


Total words:  8233
Word: ID
---------------------------------
strong:  3
And:  7


In [12]:
tokenizer.word_index

{'to': 1,
 'the': 2,
 'strong': 3,
 'a': 4,
 'of': 5,
 'how': 6,
 'and': 7,
 'in': 8,
 'your': 9,
 'markup': 10,
 'for': 11,
 'you': 12,
 'with': 13,
 'is': 14,
 '—': 15,
 'data': 16,
 'why': 17,
 'class': 18,
 'h3': 19,
 'i': 20,
 'what': 21,
 'on': 22,
 'learning': 23,
 'from': 24,
 'an': 25,
 'be': 26,
 'my': 27,
 'writing': 28,
 'are': 29,
 'it': 30,
 'can': 31,
 'using': 32,
 'design': 33,
 'machine': 34,
 'ux': 35,
 'about': 36,
 'do': 37,
 'not': 38,
 'python': 39,
 'ai': 40,
 'life': 41,
 'that': 42,
 'when': 43,
 'should': 44,
 'we': 45,
 '5': 46,
 'science': 47,
 'make': 48,
 'time': 49,
 'need': 50,
 'as': 51,
 '3': 52,
 'more': 53,
 'at': 54,
 'business': 55,
 'or': 56,
 'part': 57,
 'have': 58,
 'work': 59,
 'new': 60,
 'don’t': 61,
 'up': 62,
 'by': 63,
 'write': 64,
 'get': 65,
 'use': 66,
 'guide': 67,
 'will': 68,
 'marketing': 69,
 '1': 70,
 'ways': 71,
 'deep': 72,
 'best': 73,
 'analysis': 74,
 'first': 75,
 '2019': 76,
 'product': 77,
 'better': 78,
 'things': 79,


## Convert Text To Sequences


Convert Titles to Sequences: Use a tokenizer to turn each title into a string of tokens or manually separate each slip into its constituent words. Assign each word in the lexicon a distinct number index.

Generate n-grams: From the sequences, make n-grams. A continuous run of n-title tokens is called an n-gram.

Count the Frequency: Determine the frequency at which each n-gram appears in the dataset.

Build the n-gram Model: Create the n-gram model using the n-gram frequencies. The model keeps track of each token probability given the previous n-1 tokens. This can be displayed as a lookup table or a dictionary.

Predict the Next Word: The expected next token in an n-1-token sequence may be identified using the n-gram model. To do this, it is necessary to find the probability in the algorithm and select a token with the greatest likelihood.


In [13]:
input_seq= []
# for line in df['title']:
input_seq.append(tokenizer.texts_to_sequences(df['clean_data']))

In [14]:
len(input_seq)

1

In [15]:
# input_seq[0]

In [16]:
len(input_seq[0])

6508

## Make all the Sequences as same length By Padding



Find the longest title in your dataset by comparing all the other titles.

Repeat this process for each title, comparing each one’s length to the overall limit.

When a title is too short, it should be extended using a specific padding token or character.

For each title in your dataset, carry out the padding procedure again.


In [17]:
# pad sequences
max_seq_len= max([len(x) for x in input_seq[0]])
print("max_seq_len:", max_seq_len)

input_seq= pad_sequences(input_seq[0], maxlen= max_seq_len, padding= 'pre')
print("Total input sequences: ", len(input_seq))

max_seq_len: 40
Total input sequences:  6508


In [26]:
input_seq.shape

(6508, 40)

### Prepare Features and Labels

In [22]:
# create a timestep to predict a next word
xs, labels= input_seq[:,:-1], input_seq[:,-1]
xs_= np.array(xs)
labels_= np.array(labels)

print(f"\n shape of xs: {xs_.shape,len(xs)} " )
print(f"\n shape of xs[1]: {len(xs[1])} \n\n current encoded words-> xs[1]:",xs[1], "\n")
print(f"\n shape of labels: {labels_.shape, len(labels)} \n\nnext time-step-> labels[1]:",labels[1], '\n')


 shape of xs: ((6508, 39), 6508) 

 shape of xs[1]: 39 

 current encoded words-> xs[1]: [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0 1858   22  740   80  102   13  344  344] 


 shape of labels: ((6508,), 6508) 

next time-step-> labels[1]: 1859 



### Bi-Directional LSTM

In [31]:
model = Sequential()
model.add(Embedding(total_words, 100, input_length= max_seq_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation= 'softmax'))
adam= Adam(learning_rate= 0.01)

model.compile(loss= 'sparse_categorical_crossentropy', optimizer= adam, metrics= ['accuracy'])
history= model.fit(xs, labels, epochs= 5, verbose= 1)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Plot the Model Accuracy and Loss

In [None]:
def graph_plot(history, string):
    plt.plot(history.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()
    
graph_plot(history, 'accuracy')

In [None]:
graph_plot(history, 'loss')

### Predicting Next Word


In [None]:
seed_text='implementation of'
next_words= 2

for _ in range(next_words):
    token_list= tokenizer.texts_to_sequences([seed_text])[0]
    print(token_list)
    token_list= pad_sequences([token_list], maxlen= max_seq_len-1, padding= 'pre')
    print(len(token_list))
    predicted= model.predict(token_list, verbose= 0)
    pred_class= np.argmax(predicted)
    print(pred_class)
    output_word= ''
    
    for word, index in tokenizer.word_index.items():
#         print(word)
        if index == pred_class:
            output_word= word
            break
            
    seed_text+= " "+output_word
    
print(seed_text)
        
    