In [3]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from nltk import sent_tokenize
from tensorflow import keras 
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Mount GDrive 
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [363]:
data = pd.read_csv("/gdrive/MyDrive/Summarizer/structured_df.csv", index_col = 0)
data.head()

Unnamed: 0,course_title,section_topic,subsection_topic,lecture_topic,content
0,advanced-methods-reinforcement-learning-finance,other-applications-of-reinforcement-learning-p...,lesson-1,trades-quotes-and-order-flow,All right. Let's first talk about what type of...
1,advanced-methods-reinforcement-learning-finance,other-applications-of-reinforcement-learning-p...,lesson-1,electronic-markets-and-lob,"So, so far in this specialization our examples..."
2,advanced-methods-reinforcement-learning-finance,other-applications-of-reinforcement-learning-p...,lesson-1,welcome,Welcome to week four of our course on events t...
3,advanced-methods-reinforcement-learning-finance,other-applications-of-reinforcement-learning-p...,lesson-1,limit-order-book,"Now, let's talk a bit more about the Limit Ord..."
4,advanced-methods-reinforcement-learning-finance,other-applications-of-reinforcement-learning-p...,lesson-1,lob-modeling,"Now, after we talked about what the LOB is, le..."


In [126]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

texts = data.content.values
all_sentences = [sent_tokenize(t) for t in texts]
#sentences = sent_tokenize(text)

In [145]:
flat_list = [item for sublist in all_sentences for item in sublist]

### Creating the Tokenizer and Tokenizing all the sentences

In [229]:
# Creating the Tokenizer and Tokenizing all the sentences
tokenizer = Tokenizer( oov_token="[UNK]")
tokenizer.fit_on_texts(flat_list)
word_index = tokenizer.word_index
inv_word_index = {v: k for k, v in word_index.items()}
sequences = tokenizer.texts_to_sequences(flat_list)

In [230]:
word_index[""]=0
inv_word_index[0]=""

In [238]:
vocab = len(word_index)

### Creating the word tokenized form of all of the sentences so that we can form the word embedding matrix of each of the texts 

In [202]:
# Creating the word tokenization of all of the sentences and storing it 
all_sent_word_list = []
for j in sequences: 
  word_list = [inv_word_index[i] for i in j]
  all_sent_word_list.append(word_list)

all_sent_word_list[:2]


[['all', 'right'],
 ["let's",
  'first',
  'talk',
  'about',
  'what',
  'type',
  'of',
  'data',
  'is',
  'available',
  'within',
  'order',
  'driven',
  'market']]

### Finding the distribution of sentence lengths so that we can Pad Sequences 

In [203]:
# Removing Erroneous Entries where the entire paragraph is a sentence due to error in data format
sent_lengths = []
for i in all_sent_word_list:
  current_len = len(i)
  if current_len <300:
    sent_lengths.append(current_len)
  else:
    print("Removing entry {1} with contents {0}".format(i, all_sent_word_list.index(i)))
    all_sent_word_list.remove(i)


Removing entry 134379 with contents ['hi', 'there', "i'm", 'david', 'dye', 'and', "i'll", 'be', 'taking', 'through', 'the', 'last', 'few', 'modules', 'of', 'this', 'course', 'in', 'this', 'module', "we'll", 'start', 'to', 'use', 'the', 'calculus', "we've", 'done', 'and', 'put', 'it', 'together', 'with', 'vectors', 'in', 'order', 'to', 'start', 'solving', 'equations', 'in', 'this', 'first', 'video', "we'll", 'look', 'at', 'a', 'nice', 'simple', 'case', 'where', 'we', 'just', 'need', 'to', 'find', 'the', 'gradient', 'the', 'derivative', 'in', 'order', 'to', 'solve', 'an', 'equation', 'using', "what's", 'called', 'the', 'newton', 'raphson', 'method', 'now', 'say', "we've", 'got', 'that', 'distribution', 'of', 'heights', 'again', 'with', 'a', 'mean', 'an', 'average', 'mu', 'and', 'a', 'width', 'sigma', 'and', 'we', 'want', 'to', 'fit', 'an', 'equation', 'to', 'that', 'distribution', 'that', 'so', 'we', "don't", 'have', 'to', 'if', 'we', 'fitted', 'it', 'bother', 'about', 'carrying', 'aroun

In [223]:
# Checking how man sentence lengths greater than 50
large = [i for i in sent_lengths if i>50]

# We see that most of the lengths are rather small so we will pad sequences to 100 and keep 100 tokens as max sequence length 

precentage_loss= len(large)/len(sent_lengths)
print(precentage_loss)

0.01366009414736064


## Forming Document Dictionary

In [240]:
# Forming a Dictionary where each entry in the dict is a list of lists that consists of the words in each sentence for all sentences in the transcript
i = 0 
doc_dict = {}
for test_seq in all_sentences:
  i +=1
  doc_seq = tokenizer.texts_to_sequences(test_seq)
  padded_seq = pad_sequences(doc_seq, maxlen=50, dtype="long", value=0, truncating="post", padding="post")
  doc_sent_words = []
  for j in padded_seq:
    #print(j) 
    sent_words = [inv_word_index[i] for i in j]
    doc_sent_words.append(sent_words)
    doc_dict[i]= doc_sent_words

In [354]:
doc_dict[79][1]

['ai',
 'is',
 'changing',
 'the',
 'way',
 'we',
 'work',
 'and',
 'live',
 'and',
 'this',
 'nontechnical',
 'course',
 'will',
 'teach',
 'you',
 'how',
 'to',
 'navigate',
 'the',
 'rise',
 'of',
 'ai',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

# Downloading the GloVe Embeddings 

In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2020-12-22 01:57:57--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-12-22 01:57:57--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-12-22 01:57:58--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-1

## Forming the Glove Embeddings Index

In [333]:
import os
import pathlib
import numpy as np

path_to_glove_file = "/content/glove.6B.200d.txt"

embeddings_index = {}
with open(path_to_glove_file) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [235]:
# 
num_tokens = len(word_index) + 2
embedding_dim = 200
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 22437 words (5449 misses)


# Now we will go document by document and form the average document embeddings

In [335]:
doc_matrix_dict = {}
for doc_id, doc in doc_dict.items():
  sent_matrix = np.empty((0,200), float)
  sentence_count = 0
  doc_matrix_dict[doc_id] ={}  
  #print(doc_id, doc_matrix_dict[doc_id])
  for sent in doc:
    sentence_count += 1
    embedding_matrix = []
    for word in sent:
        word_vect = embeddings_index.get(word)
        if word_vect is not None:
          embedding_matrix.append(word_vect)                                           # Get the word embedding for each word 
        
    embedding_array = np.array(embedding_matrix)                                       # Store all the word embedding in a sentence in an array
    sent_embedding = np.mean(embedding_array, axis = 0 )                               # Get the sentence embedding as average of all word embedding in the sentence 
    try:
      sent_matrix = np.append(sent_matrix, np.array([sent_embedding]),axis = 0)        # Store all the sentence embedding in sent_matrix                           
    except:
      print("Skipping concatenation at doc",doc_id,"sentence", sentence_count,"due to [nan]/missing value error") 

    doc_matrix_dict[doc_id][sentence_count] = sent_embedding                           # Store the sentence embedding of each sentence for a document in as an item in a dictionary

  doc_embedding = np.mean(sent_matrix, axis = 0)                                       # Compute doc embedding as mean of all the sentence embeddings 
  doc_matrix_dict[doc_id]["document_embedding"]= doc_embedding                         # Store the overall document embedding as well 

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Skipping concatenation at doc 206 sentence 92 due to [nan] error aka missing value
Skipping concatenation at doc 255 sentence 98 due to [nan] error aka missing value
Skipping concatenation at doc 255 sentence 107 due to [nan] error aka missing value
Skipping concatenation at doc 528 sentence 44 due to [nan] error aka missing value
Skipping concatenation at doc 658 sentence 225 due to [nan] error aka missing value
Skipping concatenation at doc 666 sentence 4 due to [nan] error aka missing value
Skipping concatenation at doc 888 sentence 14 due to [nan] error aka missing value
Skipping concatenation at doc 913 sentence 28 due to [nan] error aka missing value
Skipping concatenation at doc 913 sentence 149 due to [nan] error aka missing value
Skipping concatenation at doc 914 sentence 79 due to [nan] error aka missing value
Skipping concatenation at doc 1066 sentence 75 due to [nan] error aka missing value
Skipping concatenation at doc 1084 sentence 65 due to [nan] error aka missing value


In [362]:
# We now have the sentence embeddings as well as the document embedding in this dictionary 

print("Embedding for Overall Document 80 ")
print(doc_matrix_dict[80]["document_embedding"])

# Embedding for sentence 12
print("\n\nEmbedding for Sentence 1 in Document 80 ")
print(doc_matrix_dict[80][1])

Embedding for Overall Document 80 
[ 2.50772094e-01  2.07434424e-01 -2.39391313e-02 -1.85828246e-01
  1.05222334e-01  5.16955882e-02 -4.18029620e-01  8.96055067e-02
  5.30375720e-02  1.24387031e-01 -2.27594664e-02  2.69181277e-01
  1.43676549e-01  4.87895079e-02  1.91051073e-01  9.46232169e-02
 -1.02217908e-01  3.07664605e-01 -3.37000431e-02 -1.67255046e-01
  1.48605987e-01  2.62409929e+00 -7.22534710e-02 -8.03505262e-02
  1.75544009e-01 -5.67431986e-02 -9.00074481e-02  5.52811609e-02
  4.43384706e-02  2.94253222e-03 -6.15333321e-02 -7.43854059e-02
 -1.02617347e-02 -6.76434284e-03 -8.66740307e-04 -3.08234126e-01
 -5.25914338e-01 -3.25696104e-01 -1.21299262e-02  3.92275077e-02
 -1.10708819e-02 -1.14774081e-01  1.17031462e-02  3.10713242e-01
 -1.35455775e-01  1.55831467e-01  4.60764528e-01 -1.17356452e-01
  3.14563344e-02  1.91563342e-01 -5.36601985e-02 -5.68585401e-02
 -8.91179594e-02  3.39624549e-01  1.93689677e-01 -2.27715848e-02
 -7.64871847e-02 -5.65174749e-02 -2.96261575e-02  9.394

# Now we will try and find the closest sentences to the overall document embedding and use that to show the summary of the sentence 

In [478]:
from scipy import spatial 

# We choose A lecture that talks about What makes an AI Company ? 
test_doc = doc_dict[81] 
test_dict = doc_matrix_dict[81]
doc_contents = all_sentences[80]
print("Lecture Title:\n",data.loc[80,"lecture_topic"])
print("\n\nLecture Contents\n",doc_contents)
print(test_doc)

Lecture Title:
 what-makes-an-ai-company


Lecture Contents
 ['What makes a company good at AI?', 'Perhaps even more importantly, what will it take for your company to become great at using AI?', "I had previously led the Google brain team, and Baidu's AI group, which I respectively helped Google and Baidu become great AI companies.", 'So, what can you do for your company?', 'This is the lesson I had learned to washing the rise of the Internet that I think will be relevant to how all of us navigate the rise of AI.', "Let's take a look.", 'A lesson we learned from the rise of the Internet was that, if you take your favorite shopping mall.', 'So, my wife and I sometimes shop at Stanford shopping center and you build a website for the Shopping mall.', 'Maybe sell things on the website, that by itself does not turn the shopping mall into an internet company.', 'In fact, a few years ago I was speaking with the CEO of a large retail company who said to me, "Hey Andrew, I have a website, I se

### We compute cosine similarity of each sentence embedding with the overall document embedding and return the top 3 sentences as the summary 

In [479]:
demo_sent_list = []
for k,v in test_dict.items():
  if k == "document_embedding":
    demo_doc_embed = v
  else: 
    demo_sent_list.append(v)

In [480]:
demo_sent_similarity = []
for sent in demo_sent_list:
  result = 1 - spatial.distance.cosine(demo_doc_embed, sent)
  demo_sent_similarity.append(result)

In [481]:
demo_sent_index = [i for i in range(len(demo_sent_list))]

In [482]:
demo = pd.DataFrame()
demo["Sentence_Number"] = demo_sent_index
demo["Similarity"] = demo_sent_similarity


In [483]:
demo

Unnamed: 0,Sentence_Number,Similarity
0,0,0.933812
1,1,0.981189
2,2,0.928872
3,3,0.945198
4,4,0.977994
...,...,...
59,59,0.983464
60,60,0.978879
61,61,0.971466
62,62,0.979675


In [484]:
demo_sorted = demo.sort_values("Similarity", ascending=False)
demo_sorted

Unnamed: 0,Sentence_Number,Similarity
54,54,0.988777
42,42,0.984524
59,59,0.983464
1,1,0.981189
51,51,0.980456
...,...,...
38,38,0.877669
26,26,0.871399
14,14,0.857027
39,39,0.851464


In [485]:
len(doc_dict[81])

64

In [486]:
import re
sent_idx_1 = demo_sorted.loc[1,"Sentence_Number"]
sent_idx_2 = demo_sorted.loc[2,"Sentence_Number"]
sent_idx_3 = demo_sorted.loc[3,"Sentence_Number"]

final_sentences = test_doc[sent_idx_1] + test_doc[sent_idx_2] + test_doc[sent_idx_3]
final_summary = " ".join(final_sentences)
final_summary_1 = re.sub(' {2,}', '. ', final_summary)
print("Final Summary for the Document:\n")
final_summary_1

Final Summary for the Document:



"perhaps even more importantly what will it take for your company to become great at using ai. i had previously led the google brain team and baidu's ai group which i respectively helped google and baidu become great ai companies. so what can you do for your company. "

# Hence we have our final summary ready and it seems to do an okay job

### Lets have a look at a couple of more examples 

In [464]:
# We choose Another lecture
test_doc = doc_dict[122] 
test_dict = doc_matrix_dict[122]
doc_contents = all_sentences[121]
print("Lecture Title:\n",data.loc[121,"lecture_topic"])
print("\n\nLecture Contents\n",doc_contents)
print(test_doc)

Lecture Title:
 training-neural-networks


Lecture Contents
 ['Now that you have dived a little deeper into neural networks.', "Let's sort of how we can train them, some common pitfalls and something techniques to help speed up train and provide better journalism.", 'In TensorFlow using the Estimator API, using a DNNRegressor, is very similar to using a LinearRegressor, with only a few parameters for the code that need to be added.', 'We can use momentum based optimizers such as the default Adagrad, or we can try many others such as Adam.', 'Also we now have to add a parameter named hidden units, which is a list.', 'The number of items in this list is the number of hidden layers and the values of each list item is a number of neurons for that particular hidden layer.', 'You will also know there is a new parameter named dropout.', "We'll cover this and more in a few minutes.", 'But for now this is used to turn individual neurons on and off for each example in hopes of having better gene

In [465]:
demo_sent_list = []
for k,v in test_dict.items():
  if k == "document_embedding":
    demo_doc_embed = v
  else: 
    demo_sent_list.append(v)

In [466]:
demo_sent_similarity = []
for sent in demo_sent_list:
  result = 1 - spatial.distance.cosine(demo_doc_embed, sent)
  demo_sent_similarity.append(result)

In [467]:
demo_sent_index = [i for i in range(len(demo_sent_list))]

In [468]:
demo = pd.DataFrame()
demo["Sentence_Number"] = demo_sent_index
demo["Similarity"] = demo_sent_similarity


In [469]:
demo

Unnamed: 0,Sentence_Number,Similarity
0,0,0.917691
1,1,0.951502
2,2,0.966761
3,3,0.955667
4,4,0.959113
...,...,...
123,123,0.971756
124,124,0.954405
125,125,0.955035
126,126,0.923079


In [470]:
demo_sorted = demo.sort_values("Similarity", ascending=False)
demo_sorted

Unnamed: 0,Sentence_Number,Similarity
89,89,0.986942
54,54,0.985095
46,46,0.983757
24,24,0.980674
96,96,0.980638
...,...,...
36,36,0.861196
65,65,0.858809
120,120,0.820556
118,118,0.813847


In [473]:
len(doc_dict[122])

128

In [476]:
import re
sent_idx_1 = demo_sorted.loc[1,"Sentence_Number"]
sent_idx_2 = demo_sorted.loc[2,"Sentence_Number"]
sent_idx_3 = demo_sorted.loc[3,"Sentence_Number"]

final_sentences = test_doc[sent_idx_1] + test_doc[sent_idx_2] + test_doc[sent_idx_3]
final_summary = " ".join(final_sentences)
final_summary_1 = re.sub(' {2,}', '. ', final_summary)
print("Final Summary for the Document:\n")
final_summary_1

Final Summary for the Document:



"let's sort of how we can train them some common pitfalls and something techniques to help speed up train and provide better journalism. in tensorflow using the estimator api using a dnnregressor is very similar to using a linearregressor with only a few parameters for the code that need to be added. we can use momentum based optimizers such as the default adagrad or we can try many others such as adam. "