In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import random
import nltk
import re
import string
from bs4 import BeautifulSoup

### LOAD DATASET

In [9]:
data = pd.read_csv('../data/transcripts.csv')
data.columns

Index(['transcript', 'url'], dtype='object')

In [10]:
data.head()

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2467 entries, 0 to 2466
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   transcript  2467 non-null   object
 1   url         2467 non-null   object
dtypes: object(2)
memory usage: 38.7+ KB


#### Data Preprocessing

In [13]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### CREATING A VOCUBLARY FROM THE DATA USING COUNT VECTORIZER

In [14]:
count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
term_matrix =  count_vect.fit_transform(data['transcript'].values.astype('U')) #including words that occur less than 80% of the time in the document
'''stop words have also been removed since they barely contribute significantly to the vocabulary'''
term_matrix

<2467x34029 sparse matrix of type '<class 'numpy.int64'>'
	with 1108062 stored elements in Compressed Sparse Row format>

From the results above, every word in the document is represented by a 34029 dimensional vector. i.e we have a vocabulary of 34029 words

### LDA

In [15]:
#we now use LDA to create topics based on the probability of each word in the document
lda = LatentDirichletAllocation(n_components=5, random_state=42) #we set n = 5 as our initial guess of topics in the data
lda.fit(term_matrix)

In [16]:
#top 50 words in the vocubulary
for i in range(51):
    random_word = random.randint(0, len(count_vect.get_feature_names()))
    print(count_vect.get_feature_names()[random_word])



morphs
prosecuted
rosie
berkeley
dues
promising
450
arising
multimillion
myspace
tablet
authoritarianism
textile
mountainsides
pneumonia
union
collectible
fissures
herders
hypocrisy
tremendous
converter
emits
apocalyptic
researcher
sounds
obsidian
claps
stunts
degrades
recommended
prisoner
metabolic
vitality
l1
shanghai
magnifying
mesmerizing
materialize
opinions
taxonomy
blameless
steeper
lick
emerge
gigabits
topos
knowhow
login
colic
urging


In [17]:
#displaying the first topic
first_topic = lda.components_[0]
first_topic# the output is a vector. from the vector we can then obtain the words from the count_vectorizer featu

array([1.07009199e+01, 8.75603006e+02, 3.20657303e-01, ...,
       2.19612451e+00, 2.21755631e-01, 2.01565888e-01])

In [18]:
#obtaining the top words in the first topic
top_topic_words = first_topic.argsort()[-10:]
for i in top_topic_words:
    print(count_vect.get_feature_names()[i])

thing
different
laughter
kind
new
right
look
little
things
actually


In [19]:
#displaying the top 20 words in each of the topics
for i,topic in enumerate(lda.components_):
    print(f'Top 10 words for topic #{i}:')
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 10 words for topic #0:
['work', 'lot', 'light', 'got', 'earth', 'space', 'want', 'll', 'life', 'use', 'thing', 'different', 'laughter', 'kind', 'new', 'right', 'look', 'little', 'things', 'actually']


Top 10 words for topic #1:
['look', 'women', 'tell', 'come', 'didn', 'love', 'work', 'little', 'good', 'day', 'thing', 'did', 'got', 'things', 'right', 'life', 'say', 'want', 'said', 'laughter']


Top 10 words for topic #2:
['internet', 'said', 'got', 'public', 'did', 'say', 'power', 'thing', 'lot', 'today', 'country', 'government', 'need', 'right', 'work', 'things', 'actually', 'city', 'want', 'new']


Top 10 words for topic #3:
['want', 'actually', 'today', '000', 'use', 'carbon', 'look', 'got', 'lot', 'climate', 'change', 'fish', 'women', 'energy', 'oil', 'year', 'need', 'percent', 'water', 'food']


Top 10 words for topic #4:
['today', 'cells', 'good', 'let', 'right', 'work', 'problem', 'human', 'data', 'new', 'cancer', 'want', 'different', 'things', 'look', 'health', 'brain', 'n

### TOPIC TRACKING USING NON-NEGATIVE MATRIX FACTORIZATION

In [21]:
tfidf_vect = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
term_matrix_tfidf = tfidf_vect.fit_transform(data['transcript'].values.astype('U'))

In [22]:
term_matrix_tfidf


<2467x34029 sparse matrix of type '<class 'numpy.float64'>'
	with 1108062 stored elements in Compressed Sparse Row format>

In [23]:
nmf = NMF(n_components=5, random_state=42)
nmf.fit(term_matrix_tfidf)



In [24]:
#obtaining random words from our new vocab
import random

for i in range(10):
    random_id = random.randint(0,len(tfidf_vect.get_feature_names()))
    print(tfidf_vect.get_feature_names()[random_id])



uppercase
gsm
baptism
kenyon
untouchable
matrices
pans
geometric
subcontinent
cures


In [25]:
#top words in the first topic
first_topic = nmf.components_[0]
top_topic_words = first_topic.argsort()[-20:]
for i in top_topic_words:
    print(tfidf_vect.get_feature_names()[i])

ll
sort
use
technology
computer
want
light
new
earth
water
look
thing
space
right
design
kind
little
laughter
things
actually


In [26]:
#obtaining top 20 words for each of the topics
for i,topic in enumerate(nmf.components_):
    print(f'Top 20 words for topic #{i}:')
    print([tfidf_vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 20 words for topic #0:
['ll', 'sort', 'use', 'technology', 'computer', 'want', 'light', 'new', 'earth', 'water', 'look', 'thing', 'space', 'right', 'design', 'kind', 'little', 'laughter', 'things', 'actually']


Top 20 words for topic #1:
['went', 'family', 'girls', 'didn', 'man', 'woman', 'did', 'got', 'want', 'day', 'children', 'love', 'say', 'kids', 'life', 'school', 'men', 'laughter', 'said', 'women']


Top 20 words for topic #2:
['piano', 'instrument', 'laughter', 'sounds', 'singing', 'playing', 'violin', 'musicians', 'video', 'classical', 'piece', 'hear', 'orchestra', 'ends', 'musical', 'song', 'play', 'guitar', 'sound', 'music']


Top 20 words for topic #3:
['today', 'business', 'economy', 'health', 'change', 'economic', 'growth', 'social', 'city', 'cities', 'money', 'china', 'dollars', 'need', 'government', 'global', 'country', 'africa', 'percent', 'countries']


Top 20 words for topic #4:
['medicine', 'genes', 'actually', 'dna', 'human', 'drugs', 'tumor', 'health', 'stem',

The purpose of this notebook was to attempt to find a means of identifying the main topics under a given TED TALKS transcripts. We decided to use text clustering using SKlearn's Latent DirichletAllocation and Non-negative matrix factorization. The results  show various topics discussed based on the most appearing words within them. For instance, the first topic using LDA talks about life in particular and how the little things matter.