In [17]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find_all('p')]
    print(url)
    return text

# URLs of transcripts in scope
urls = ["https://www.coursera.org/learn/classification-vector-spaces-in-nlp/lecture/gNXI3/vocabulary-feature-extraction"]

In [22]:
transcripts = [url_to_transcript(u) for u in urls]

https://www.coursera.org/learn/classification-vector-spaces-in-nlp/lecture/gNXI3/vocabulary-feature-extraction


In [47]:
!mkdir transcripts

with open("Transcripts.txt", "wb") as file:
    pickle.dump(transcripts[0], file)
    paragraph = transcripts[0]
str1 = " "
str1 = str1.join(paragraph)

A subdirectory or file transcripts already exists.


In [53]:
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# with open("Transcripts.txt", "rb") as file:
#     paragraph = pickle.load(file)
    
sentences = nltk.sent_tokenize(str1)
lemmatizer = WordNetLemmatizer()

# Lemmatization
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    
print(sentences)

['4.6 ( 3,340 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .', 'By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !', 'This Specialization designed taught two expert NLP , machine learning , deep learning .', 'Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .', 'Å\x81ukasz Kaiser Staff Research Scientist Google Brain co-author

In [54]:
data = {}
for i in range(len(sentences)):
    data[i] = sentences[i]
data

{0: '4.6 ( 3,340 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .',
 1: 'By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !',
 2: 'This Specialization designed taught two expert NLP , machine learning , deep learning .',
 3: 'Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .',
 4: 'Å\x81ukasz Kaiser Staff Research Scientist Goo

In [58]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ''.join(list_of_text)
    return combined_text

In [59]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}
data_combined

{0: ['4.6 ( 3,340 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .'],
 1: ['By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , even built chatbot !'],
 2: ['This Specialization designed taught two expert NLP , machine learning , deep learning .'],
 3: ['Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .'],
 4: ['Å\x81ukasz Kaiser Staff Research Scie

In [60]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
0,"4.6 ( 3,340 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Nat..."
1,"By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , ..."
2,"This Specialization designed taught two expert NLP , machine learning , deep learning ."
3,Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .
4,"Åukasz Kaiser Staff Research Scientist Google Brain co-author Tensorflow , Tensor2Tensor Trax library , Transformer paper ."
5,"Machine Translation , Word Embeddings , Locality-Sensitive Hashing , Sentiment Analysis , Vector Space Models 4.6 ( 3,340 rating ) HA Aug 9 , 2020..."
6,"The lecture exciting detailed , though little hard straight forward sometimes , Youtube helped Regression model ."
7,"Other , I informative fun ."
8,"From lesson Sentiment Analysis Logistic Regression Learn extract feature text numerical vector , build binary classifier tweet using logistic regr..."
9,Instructor Instructor Senior Curriculum Developer


In [62]:
# Let's take a look at the transcript for Ali Wong
data_df.transcript.loc[0]

'4.6 ( 3,340 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Natural Language Processing Specialization , : ) Perform sentiment analysis tweet using logistic regression naÃ¯ve Bayes , b ) Use vector space model discover relationship word use PCA reduce dimensionality vector space visualize relationship , c ) Write simple English French translation algorithm using pre-computed word embeddings locality-sensitive hashing relate word via approximate k-nearest neighbor search .'

In [63]:
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [64]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
0,rating â â students enrolled course natural language processing specialization this course video transcript in course natural language pro...
1,by end specialization designed nlp application perform questionanswering sentiment analysis created tool translate language summarize text even...
2,this specialization designed taught two expert nlp machine learning deep learning
3,younes bensouda mourri instructor ai stanford university also helped build deep learning specialization
4,åukasz kaiser staff research scientist google brain coauthor tensorflow trax library transformer paper
5,machine translation word embeddings localitysensitive hashing sentiment analysis vector space models rating ha aug one best course atte...
6,the lecture exciting detailed though little hard straight forward sometimes youtube helped regression model
7,other i informative fun
8,from lesson sentiment analysis logistic regression learn extract feature text numerical vector build binary classifier tweet using logistic regre...
9,instructor instructor senior curriculum developer


In [65]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [66]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
0,rating â â students enrolled course natural language processing specialization this course video transcript in course natural language pro...
1,by end specialization designed nlp application perform questionanswering sentiment analysis created tool translate language summarize text even...
2,this specialization designed taught two expert nlp machine learning deep learning
3,younes bensouda mourri instructor ai stanford university also helped build deep learning specialization
4,åukasz kaiser staff research scientist google brain coauthor tensorflow trax library transformer paper
5,machine translation word embeddings localitysensitive hashing sentiment analysis vector space models rating ha aug one best course atte...
6,the lecture exciting detailed though little hard straight forward sometimes youtube helped regression model
7,other i informative fun
8,from lesson sentiment analysis logistic regression learn extract feature text numerical vector build binary classifier tweet using logistic regre...
9,instructor instructor senior curriculum developer


In [67]:
# Let's take a look at our dataframe
data_df

Unnamed: 0,transcript
0,"4.6 ( 3,340 rating ) Â |Â 99K Students Enrolled Course 1 4 Natural Language Processing Specialization This Course Video Transcript In Course 1 Nat..."
1,"By end Specialization , designed NLP application perform question-answering sentiment analysis , created tool translate language summarize text , ..."
2,"This Specialization designed taught two expert NLP , machine learning , deep learning ."
3,Younes Bensouda Mourri Instructor AI Stanford University also helped build Deep Learning Specialization .
4,"Åukasz Kaiser Staff Research Scientist Google Brain co-author Tensorflow , Tensor2Tensor Trax library , Transformer paper ."
5,"Machine Translation , Word Embeddings , Locality-Sensitive Hashing , Sentiment Analysis , Vector Space Models 4.6 ( 3,340 rating ) HA Aug 9 , 2020..."
6,"The lecture exciting detailed , though little hard straight forward sometimes , Youtube helped Regression model ."
7,"Other , I informative fun ."
8,"From lesson Sentiment Analysis Logistic Regression Learn extract feature text numerical vector , build binary classifier tweet using logistic regr..."
9,Instructor Instructor Senior Curriculum Developer


In [69]:
# pickle it for later use
data_df.to_pickle("corpus.pkl")

In [70]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,ai,algorithm,analysis,application,approximate,assignment,attented,aug,awesome,bayes,...,vector,video,visualize,waiting,wasnnto,week,word,write,younes,youtube
0,0,1,1,0,1,0,0,0,0,1,...,2,1,1,0,0,0,3,1,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,1,1,2,1,0,...,1,0,0,1,1,1,1,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
# pickle it for later use
data_dtm.to_pickle("dtm.pkl")

In [73]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("count_vectorizer.pkl", "wb"))