In [2]:
import pandas as pd
import numpy as np

In [3]:
df=pd.read_csv('medium_data.csv')

In [4]:
df.head()

Unnamed: 0,id,url,title,subtitle,image,claps,responses,reading_time,publication,date
0,1,https://towardsdatascience.com/a-beginners-gui...,A Beginner’s Guide to Word Embedding with Gens...,,1.png,850,8,8,Towards Data Science,2019-05-30
1,2,https://towardsdatascience.com/hands-on-graph-...,Hands-on Graph Neural Networks with PyTorch & ...,,2.png,1100,11,9,Towards Data Science,2019-05-30
2,3,https://towardsdatascience.com/how-to-use-ggpl...,How to Use ggplot2 in Python,A Grammar of Graphics for Python,3.png,767,1,5,Towards Data Science,2019-05-30
3,4,https://towardsdatascience.com/databricks-how-...,Databricks: How to Save Files in CSV on Your L...,When I work on Python projects dealing…,4.jpeg,354,0,4,Towards Data Science,2019-05-30
4,5,https://towardsdatascience.com/a-step-by-step-...,A Step-by-Step Implementation of Gradient Desc...,One example of building neural…,5.jpeg,211,3,4,Towards Data Science,2019-05-30


In [5]:
df.shape

(6508, 10)

In [6]:
df['title'].sample(10)

3542                           Your Writing Voice Matters
4188               All them A/B tests that never happened
4230    Here Are the Best Lessons I Learned from My Th...
5002    Top influential women in blockchain and crypto...
4548                Making mayonnaise and design thinking
828     Digital Routines & Automating the User Experie...
4122    What I learned by earning my first $6.58 on Me...
894     Can AI Robots Rebel Against Us, as Shown in Fi...
6043    What I wish someone had told me before startin...
781             The Key When Life Gets Rough — Keep Doing
Name: title, dtype: object

In [7]:
# We will use inly title column for the training purpose
# There are some unwanted characters in the data 

df['title'] = df['title'].apply(lambda x: x.replace(u'\xa0',u' '))
df['title'] = df['title'].apply(lambda x: x.replace('\u200a',' '))

In [8]:
# We are basically trying to convert this problem into a simple supervised classification task



In [9]:
# Builiding the dataset for the same

In [10]:
import tensorflow as tf

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [12]:
tokenizer=Tokenizer()

In [13]:
tokenizer.fit_on_texts(df['title'])

In [14]:
vocab_len=len(tokenizer.word_index)


In [15]:
corpus=df['title']

In [16]:
#  Tokenizing the data based upon the vocab created
X=[]
for i in corpus:
    sent=tokenizer.texts_to_sequences([i])[0]

    for x in range(1,len(sent)):
        n_grams=sent[:x+1]
        X.append(n_grams)

In [17]:
tokenizer.index_word[675]
corpus[0]

'A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model'

In [18]:
#  Applying padding 
# Checking max len

max_len=max([len(i) for i in X])


In [19]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [20]:
padded_X=pad_sequences(X,maxlen=max_len,padding='pre')

In [21]:
X=padded_X[:,:-1]
Y=padded_X[:,-1]


In [22]:
Y.shape

(48461,)

In [23]:
X.shape

(48461, 39)

In [24]:
# Converting the output into one hot encoded form

from tensorflow.keras.utils import to_categorical
Y = to_categorical(Y,num_classes=vocab_len+1)

In [25]:
Y.shape

(48461, 8237)

In [26]:
Y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [27]:
#  Making the Model
from tensorflow.keras.layers import Dense,LSTM,Bidirectional,Embedding,Input
from tensorflow.keras.models import Sequential



In [28]:
max_len

40

In [31]:
model = Sequential()
model.add(Input(shape=[39]))
model.add(Embedding(vocab_len+1, 100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(vocab_len+1, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
model.summary()

In [37]:
model.fit(X,Y,epochs=50,verbose=1,validation_split=0.33,batch_size=200)

Epoch 1/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 263ms/step - accuracy: 0.4745 - loss: 2.8102 - val_accuracy: 0.1492 - val_loss: 7.6543
Epoch 2/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 226ms/step - accuracy: 0.4963 - loss: 2.6722 - val_accuracy: 0.1509 - val_loss: 7.7230
Epoch 3/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 223ms/step - accuracy: 0.5222 - loss: 2.5587 - val_accuracy: 0.1480 - val_loss: 7.7715
Epoch 4/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 225ms/step - accuracy: 0.5444 - loss: 2.4496 - val_accuracy: 0.1494 - val_loss: 7.8234
Epoch 5/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 225ms/step - accuracy: 0.5635 - loss: 2.3463 - val_accuracy: 0.1486 - val_loss: 7.8825
Epoch 6/50
[1m163/163[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 227ms/step - accuracy: 0.5800 - loss: 2.2493 - val_accuracy: 0.1456 - val_loss: 7.9299
Epoch 7/50

KeyboardInterrupt: 

In [64]:
corpus.sample()

6114    The slow rise of the political video meme
Name: title, dtype: object

In [48]:
import time

In [65]:
text="slow rise of"
for i in range(12):
    token_text=tokenizer.texts_to_sequences([text])[0]
    padded_token=pad_sequences([token_text],maxlen=max_len,padding='pre')
    pos=np.argmax(model.predict(padded_token))
    word=tokenizer.index_word[pos]
    text=text+" "+word
    print (text)
    time.sleep(1)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
slow rise of 5g
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
slow rise of 5g personas
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
slow rise of 5g personas and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
slow rise of 5g personas and make
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
slow rise of 5g personas and make learn
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
slow rise of 5g personas and make learn from
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
slow rise of 5g personas and make learn from 3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
slow rise of 5g personas and make learn from 3 three
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
slow rise of 5g personas and make learn from 3 three steps
[1m

In [69]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.show()