In [1]:
!unzip Data.zip -d Data

Archive:  Data.zip
   creating: Data/Data/
  inflating: Data/Data/GBvideos.csv  
  inflating: Data/Data/CA_category_id.json  
  inflating: Data/Data/US_category_id.json  
  inflating: Data/Data/USvideos.csv  
  inflating: Data/Data/CAvideos.csv  
  inflating: Data/Data/GB_category_id.json  


#Extracting Data  

In [2]:
import pandas as pd
import numpy as np
import json

In [3]:

#load all the datasets 
df1 = pd.read_csv('/content/Data/Data/USvideos.csv')
df2 = pd.read_csv('/content/Data/Data/CAvideos.csv')
df3 = pd.read_csv('/content/Data/Data/GBvideos.csv')

#load the datasets containing the category names
data1 = json.load(open('/content/Data/Data/US_category_id.json'))
data2 = json.load(open('/content/Data/Data/CA_category_id.json'))
data3 = json.load(open('/content/Data/Data/GB_category_id.json'))

In [4]:
def category_extractor(data):
    i_d = [data['items'][i]['id'] for i in range(len(data['items']))]
    title = [data['items'][i]['snippet']["title"] for i in range(len(data['items']))]
    i_d = list(map(int, i_d))
    category = zip(i_d, title)
    category = dict(category)
    return category


In [5]:
df1['category_title'] = df1['category_id'].map(category_extractor(data1))
df2['category_title'] = df2['category_id'].map(category_extractor(data2))
df3['category_title'] = df3['category_id'].map(category_extractor(data3))

In [6]:
df = pd.concat([df1, df2, df3], ignore_index=True)
df = df.drop_duplicates('video_id')

In [8]:
entertainment = df[(df['category_title'] == 'Entertainment') ]['title']#| (df['category_title']=='Science & Technology')
entertainment = entertainment.tolist()

# Data Preprocessing

In [9]:
entertainment[:5]

['The Trump Presidency: Last Week Tonight with John Oliver (HBO)',
 'Nickelback Lyrics: Real or Fake?',
 'I Dare You: GOING BALD!?',
 'Roy Moore & Jeff Sessions Cold Open - SNL',
 "(SPOILERS) 'Shiva Saves the Day' Talked About Scene Ep. 804 | The Walking Dead"]

In [13]:
import string
def clean_text(text):
    text = ''.join(e for e in text if e not in string.punctuation).lower()
    
    text = text.encode('utf8').decode('ascii', 'ignore')
    return text

corpus = [clean_text(e) for e in entertainment]

In [14]:
corpus[:10]

['the trump presidency last week tonight with john oliver hbo',
 'nickelback lyrics real or fake',
 'i dare you going bald',
 'roy moore  jeff sessions cold open  snl',
 'spoilers shiva saves the day talked about scene ep 804  the walking dead',
 'spaghetti burrito vs spaghetti burrito',
 'amazon christmas advert 2017  toys  games',
 'whats inside a detectives car',
 'people are awesome  the pet collective present pets are awesome',
 'elders react to iphone x facial recognition animojis']

In [15]:
from keras.preprocessing.text import Tokenizer
token = Tokenizer()
def train_label(data):
  
  words = token.fit_on_texts(data)
  total_words = len(token.word_index)+1
  input_data = []
  for line in data:
    l = token.texts_to_sequences([line])[0]
    for i in range(1,len(l)):
      d = l[:i+1]
      input_data.append(d)
  return input_data,total_words



In [16]:
inp,totword = train_label(corpus)

In [17]:
inp[:15]

[[1, 88],
 [1, 88, 4064],
 [1, 88, 4064, 70],
 [1, 88, 4064, 70, 368],
 [1, 88, 4064, 70, 368, 1313],
 [1, 88, 4064, 70, 368, 1313, 11],
 [1, 88, 4064, 70, 368, 1313, 11, 137],
 [1, 88, 4064, 70, 368, 1313, 11, 137, 1087],
 [1, 88, 4064, 70, 368, 1313, 11, 137, 1087, 1603],
 [6138, 1432],
 [6138, 1432, 139],
 [6138, 1432, 139, 94],
 [6138, 1432, 139, 94, 399],
 [26, 1828],
 [26, 1828, 27]]

In [18]:
maxlen = max([len(x) for x in inp])

In [19]:
maxlen

27

In [20]:
from keras.preprocessing.sequence import pad_sequences
paddata = pad_sequences(inp, maxlen=maxlen,  padding='pre')

In [21]:
paddata

array([[   0,    0,    0, ...,    0,    1,   88],
       [   0,    0,    0, ...,    1,   88, 4064],
       [   0,    0,    0, ...,   88, 4064,   70],
       ...,
       [   0,    0,    0, ...,   13, 2305,   23],
       [   0,    0,    0, ..., 2305,   23,   17],
       [   0,    0,    0, ...,   23,   17,  102]], dtype=int32)

In [22]:
X = paddata[:,:-1]
Y = paddata[:,-1]

In [23]:
X

array([[   0,    0,    0, ...,    0,    0,    1],
       [   0,    0,    0, ...,    0,    1,   88],
       [   0,    0,    0, ...,    1,   88, 4064],
       ...,
       [   0,    0,    0, ...,   19,   13, 2305],
       [   0,    0,    0, ...,   13, 2305,   23],
       [   0,    0,    0, ..., 2305,   23,   17]], dtype=int32)

In [24]:
Y

array([  88, 4064,   70, ...,   23,   17,  102], dtype=int32)

In [25]:
# Converting the OutPut into one hot encoding for language model

In [26]:
from keras.utils import np_utils

In [27]:
Y2 = np_utils.to_categorical(Y, num_classes = totword)

In [28]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,LSTM,Dropout

In [29]:
model=Sequential()
model.add(Embedding(input_dim=totword+1,output_dim=10,input_length=maxlen-1))
model.add(LSTM(100))
model.add(Dropout(0.1))
model.add(Dense(totword, activation='softmax'))

In [30]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
             )

In [59]:
mf=model.fit(X, Y2,verbose=5,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [60]:
def generate_text(seed_text, next_words, model, max_sequence_len):
  for _ in range(next_words):
    token_list = token.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1,  padding='pre')
    predicted = model.predict(token_list, verbose=0)[0]
    predicted = np.argmax(predicted)
    print('Pred : ',predicted)
    output_word = ''
    for word,index in token.word_index.items():

      if index == predicted:
        output_word = word
        break
    seed_text += ' '+output_word
  return seed_text.title()

In [2]:
generate_text('how are you?',4,model,maxlen)

NameError: ignored

In [1]:
sent = 'how are you?'
tok = token.texts_to_sequences([sent])[0]
pad = pad_sequences([tok], maxlen=maxlen-1,  padding='pre')

NameError: ignored

In [44]:
val = model.predict(pad)

array([[1.9516223e-07, 3.5129525e-02, 1.0384531e-03, ..., 8.5130232e-06,
        2.4927067e-05, 1.4446091e-05]], dtype=float32)

In [50]:
from math import log
# beam search
def beam_search_decoder(data, k):
	sequences = [[list(), 0.0]]
	# walk over each step in sequence
	for row in data:
		all_candidates = list()
		# expand each current candidate
		for i in range(len(sequences)):
			seq, score = sequences[i]
			for j in range(len(row)):
				candidate = [seq + [j], score - log(row[j])]
				all_candidates.append(candidate)
		# order all candidates by score
		ordered = sorted(all_candidates, key=lambda tup:tup[1])
		# select k best
		sequences = ordered[:k]
	return sequences

In [51]:
beam_search_decoder(val,3)

[[[1], 3.3487133406240037],
 [[5], 4.119537244960633],
 [[4], 4.1719152533553805]]

In [52]:
token.word_index.items()



In [None]:
sent2 = 'hello'

In [None]:
def form_sent(sent):
  pass
  tok = token.texts_to_sequences([sent])[0]
  pad = pad_sequences([tok], maxlen=maxlen-1,  padding='pre')

  sent = 
