In [1]:
#https://www.kaggle.com/rounakbanik/ted-talks#transcripts.csv

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import re
from nltk.corpus import stopwords
from nltk.stem import porter, WordNetLemmatizer
import pandas as pd

#For  New York -> New_York 
from nltk.tokenize  import MWETokenizer  # multi - word expression 
from nltk.tokenize  import word_tokenize

import pickle
import numpy as np

In [2]:
import json
from pymongo import MongoClient


# Open a MongoDB client with pymongo's default connection settings.
client = MongoClient()
# Handle to the "TedTalkdb" database on that client.
db = client["TedTalkdb"]
# Handle to the "transcripts" collection that the rest of the notebook reads from.
transcripts_collection = db["transcripts"]

# Defining NLP Preprocessor and Cleaning Function

In [3]:
# Multi-word expressions that MWETokenizer merges into a single "_"-joined token
# (e.g. "New York" -> "New_York").
_MULTIWORD_EXPRESSIONS = [
    ('United', 'States'), ('New', 'York'), ('High', 'School'), ('high', 'school'),
    ('New', 'York', 'City'),
    # Was ('New ', 'York', 'Times'): the trailing space in 'New ' meant the
    # expression could never match a whitespace-free token.
    ('New', 'York', 'Times'),
]

# Corpus-specific stop words added on top of NLTK's list.
# NOTE(review): entries padded with spaces (' told ', ' oh ', ...) can never equal
# a single token, so they are currently inert — confirm whether the bare words
# were intended.
_EXTRA_STOPWORDS = ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';',
             '.\'"','[',']','—',".\'", 'ok','okay', 'felt', 'little','leave',' told ', 'sort',' told', 'yes','yeah','ya','stuff', ' 000 ',' em ','get','got',
             ' oh ', 'oh',' oh', 'oh ','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", ' t ','ve','guy', ' guy ', 're', 'every', 'single', 'old',
            'year', 'ago', 'let', 'take' ] #told, went, came, able, example, hand, maybe, try, looking

# Patterns are compiled once; all are raw strings so "\w" and "\d" are regex
# escapes rather than (invalid) Python string escapes.
_PARENTHESISED = re.compile(r'\(.+?\)')   # "(Laughter)", "(Applause)", ...
_BRACKETED = re.compile(r'\[.+?\]')       # "[...]" annotations
_PUNCTUATION = re.compile(r'[^\w\s]')     # anything that is not a word char or whitespace
_HAS_DIGIT = re.compile(r'\w*\d\w*')      # any token containing a digit

# Lazily-built tokenizer / lemmatizer / stop-word set shared by every call.
_CLEANING_RESOURCES = {}


def _cleaning_resources():
    """Build the tokenizer, lemmatizer and stop-word set on first use, then reuse them."""
    if not _CLEANING_RESOURCES:
        # NOTE(review): stopwords.words() is called without a language, exactly as
        # before; confirm that this (rather than stopwords.words('english')) is intended.
        # A frozenset makes each membership test O(1) instead of a list scan.
        _CLEANING_RESOURCES['stopwords'] = frozenset(stopwords.words()) | frozenset(_EXTRA_STOPWORDS)
        _CLEANING_RESOURCES['tokenizer'] = MWETokenizer(_MULTIWORD_EXPRESSIONS)
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANING_RESOURCES


def clean_document(document):
    """Return the cleaned tokens of one transcript.

    Steps, in order: drop parenthesised and bracketed spans, replace
    punctuation and digit-bearing tokens with spaces, tokenize, merge the
    configured multi-word expressions, lower-case and lemmatize every token,
    and discard stop words.

    Parameters
    ----------
    document : str
        Raw transcript text. An empty or missing (None) transcript yields [].

    Returns
    -------
    list of str
        The surviving lower-cased, lemmatized tokens in their original order.
        (Callers join them with spaces when a single string is needed.)
    """
    if not document:
        return []

    resources = _cleaning_resources()
    stopword_set = resources['stopwords']
    lemmatize = resources['lemmatizer'].lemmatize

    text = _PARENTHESISED.sub(' ', document)
    text = _BRACKETED.sub(' ', text)
    text = _PUNCTUATION.sub(' ', text)
    text = _HAS_DIGIT.sub(' ', text)

    tokens = resources['tokenizer'].tokenize(word_tokenize(text))

    cleaned_words = []
    for token in tokens:
        low_word = lemmatize(token.lower())
        if low_word not in stopword_set:
            cleaned_words.append(low_word)
    return cleaned_words

In [4]:
class nlp_preprocessor:
    """Thin wrapper that pairs a vectorizer with optional tokenizer/cleaner hooks.

    Parameters
    ----------
    vectorizer : object, optional
        Any object exposing ``fit(docs)`` and ``transform(docs)``
        (e.g. a CountVectorizer). Required before calling ``fit``.
    tokenizer : callable, optional
        Stored on ``self.tokenizer``; defaults to ``self.splitter``.
    cleaning_function : callable, optional
        Stored on ``self.cleaning_function``; defaults to the pass-through
        ``self.clean_document``. ``fit`` and ``transform`` expect text that
        has ALREADY been cleaned and do not call this hook themselves.
    """

    def __init__(self, vectorizer=None, tokenizer=None, cleaning_function=None): #vectorizer=CountVectorizer()
        if not tokenizer:
            tokenizer = self.splitter
        if not cleaning_function:
            # Previously this referenced a method that did not exist on the
            # class, so constructing with the default raised AttributeError.
            cleaning_function = self.clean_document
        self.tokenizer = tokenizer
        self.cleaning_function = cleaning_function
        self.vectorizer = vectorizer
        self._is_fit = False

    def splitter(self, text):
        """
        Default tokenizer that splits on spaces naively
        """
        return text.split(' ')

    def clean_document(self, text):
        """
        Default cleaning function: returns the text unchanged.
        """
        return text

    def fit(self, clean_text):
        """
        Fits the vectorizer on already-cleaned documents.

        Raises ValueError if no vectorizer was supplied at construction.
        """
        if self.vectorizer is None:
            raise ValueError("A vectorizer must be provided before fitting!")
        self.vectorizer.fit(clean_text)
        self._is_fit = True

    def transform(self, clean_text):
        """
        Transforms already-cleaned documents into the vectorized format
        learned by ``fit`` and returns the vectorizer's output.

        Raises ValueError if ``fit`` has not been called yet.
        """
        if not self._is_fit:
            raise ValueError("Must fit the models before transforming!")
        return self.vectorizer.transform(clean_text)
    

# Application on Data

In [5]:
from pprint import pprint

#(x['transcript'] for x in transcripts_collection.find())
# Fetch every stored talk, run its transcript through clean_document, and keep
# each result as one space-joined string of cleaned tokens.
cursor = transcripts_collection.find()
cleaned_document = [
    ' '.join(clean_document(talk['transcript']))
    for talk in cursor
]

In [None]:
# Save the cleaned transcripts so the recommendation notebook can load them.
with open('cleaned_talks.pkl', 'wb') as cleaned_talks_file:
    pickle.dump(cleaned_document, cleaned_talks_file)


In [6]:
# This configuration gave the most decent results with LDA.
#
# The token regex is now passed as `token_pattern=`. CountVectorizer's first
# parameter is `input`, so the original positional string never reached the
# tokenizer (and is a TypeError on current scikit-learn, where these arguments
# are keyword-only). "_" is allowed inside a token so the multi-word tokens
# produced by clean_document (e.g. "united_states") are still kept.
# min_df=1 keeps every term, exactly like the original min_df=0, but is
# accepted by scikit-learn's parameter validation.
nlp = nlp_preprocessor(
    CountVectorizer(
        token_pattern=r"\b[a-z][a-z_]+\b",
        ngram_range=(1, 2),
        max_df=0.4,
        min_df=1,
        stop_words='english',
        max_features=3500,
    ),
    tokenizer=None,
    cleaning_function=clean_document,
)

In [7]:
# Variant with a smaller vocabulary (max_features=3000).
# The regex is passed as `token_pattern=` because CountVectorizer's first
# parameter is `input`; passed positionally it never reached the tokenizer.
# "_" is allowed inside a token so multi-word tokens such as "united_states"
# survive. min_df=1 is equivalent to the original min_df=0 but passes
# scikit-learn's parameter validation.
nlp1 = nlp_preprocessor(
    CountVectorizer(
        token_pattern=r"\b[a-z][a-z_]+\b",
        ngram_range=(1, 2),
        max_df=0.4,
        min_df=1,
        stop_words='english',
        max_features=3000,
    ),
    tokenizer=None,
    cleaning_function=clean_document,
)

In [34]:
# Variant restricted to terms found in 10%-40% of documents (max_features=2000).
# The regex is passed as `token_pattern=` because CountVectorizer's first
# parameter is `input`; passed positionally it never reached the tokenizer.
# "_" is allowed inside a token so multi-word tokens such as "united_states"
# survive.
nlp2 = nlp_preprocessor(
    CountVectorizer(
        token_pattern=r"\b[a-z][a-z_]+\b",
        ngram_range=(1, 2),
        max_df=0.4,
        min_df=.1,
        stop_words='english',
        max_features=2000,
    ),
    tokenizer=None,
    cleaning_function=clean_document,
)

In [9]:
# Learn the vocabulary from the cleaned transcripts.
nlp.fit(cleaned_document)

# Vectorise the corpus once; this is the matrix passed to the topic models below.
vectorized_docs = nlp.transform(cleaned_document)

# Dense copy derived from the same result (previously the whole corpus was
# vectorised a second time just to call .toarray()). Handy for building a
# DataFrame, e.g.:
#   pd.DataFrame(vectorized_docs_dense, columns=nlp.vectorizer.get_feature_names())
vectorized_docs_dense = vectorized_docs.toarray()

## Decomposition and Clustering

In [10]:
# Run settings: ngrams (1,2), max_df = .4, min_df = 0, max_feats = 3,500, n_topics = 15, rand = 42
# NOTE(review): the original note recorded n_iters=70, but the code below uses 90 —
# confirm which value produced the saved results.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import Normalizer
n_topics = 15
n_iter = 90
# `n_components` is the current name of the topic-count parameter; the old
# `n_topics=` keyword was deprecated and then removed from scikit-learn.
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=n_iter,
                                random_state=42,
                                learning_method='online')

# Fit LDA on the count matrix and keep the transformed output for each document.
data = lda.fit_transform(vectorized_docs)



In [None]:
# from sklearn.decomposition import NMF, TruncatedSVD
# n_comp = 17
# lsa_cv = TruncatedSVD(n_components=n_comp)
# nmf_cv = NMF(n_components=n_comp)
# lsa_cv_data = lsa_cv.fit_transform(vectorized_docs) #The computerized classification
# nmf_cv_data = nmf_cv.fit_transform(vectorized_docs) #The computerized classification

In [None]:
# X = nlp.transform(cleaned_document) #Transform data through count vectorizer
# type(X) #<--- pass this to LDA

## Display Topics

In [11]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every row in ``model.components_``.

    Parameters
    ----------
    model : object
        Anything with a ``components_`` attribute whose rows support ``argsort()``.
    feature_names : sequence of str
        Feature label for each column index of ``components_``.
    no_top_words : int
        How many of the largest-weight features to print per topic.
    topic_names : sequence of str, optional
        Label per topic; a topic with a missing/empty label is shown by index.
    """
    for topic_idx, weights in enumerate(model.components_):
        has_label = bool(topic_names) and bool(topic_names[topic_idx])
        if has_label:
            print("\nTopic: '", topic_names[topic_idx], "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort is ascending, so reverse it to visit the largest weights first.
        descending = weights.argsort()[::-1]
        top_features = [feature_names[col] for col in descending[:no_top_words]]
        print(", ".join(top_features))

In [12]:
# Show the 20 highest-weighted vocabulary terms for each LDA topic.
display_topics(
    model=lda,
    feature_names=nlp.vectorizer.get_feature_names(),
    no_top_words=20,
)


Topic  0
woman, love, girl, family, child, mother, friend, told, father, boy, night, young, knew, moment, saw, remember, later, asked, month, happened

Topic  1
dollar, money, africa, company, business, market, billion, economy, cost, global, economic, china, india, government, product, growth, job, term, poor, oil

Topic  2
brain, child, baby, study, neuron, social, data, behavior, sex, animal, area, health, memory, age, rate, difference, level, mental, sleep, activity

Topic  3
city, car, energy, water, air, foot, mile, hour, fly, power, space, road, street, wind, light, half, map, solar, bee, building

Topic  4
government, american, political, community, state, society, power, group, law, democracy, social, violence, city, black, public, police, nation, america, movement, issue

Topic  5
cell, cancer, patient, disease, body, drug, health, doctor, blood, gene, dna, medical, medicine, heart, treatment, virus, hospital, genome, genetic, surgery

Topic  6
technology, computer, robot, m

In [None]:
# Pickle the fitted vectorizer, the LDA model and the LDA fit_transform output
# for the recommendation notebook.
for pickle_path, artifact in (
    ('vectorizer.pkl', nlp.vectorizer),
    ('lda.pkl', lda),
    ('lda_data.pkl', data),
):
    with open(pickle_path, 'wb') as picklefile:
        pickle.dump(artifact, picklefile)

In [None]:
# For each row of `data`, take the column index holding the largest value.
topic_ind = data.argmax(axis=1)
y = topic_ind

# Wrap the indices in a DataFrame so they can be used as plot labels.
tsne_labels = pd.DataFrame(y)

# Keep a CSV copy of the numeric labels.
tsne_labels.to_csv('tsne_labels.csv')

In [None]:
# Human-readable name for each LDA topic index.
# NOTE(review): "Techonology" looks like a typo for "Technology"; it is left
# unchanged here because saved files / other notebooks may match on it.
TOPIC_LABELS = {
    0: "Family",
    1: "Global Economy",
    2: "Neurology",
    3: "Transportation",
    4: "Politics",
    5: "Diseases",
    6: "Techonology",
    7: "Nature",
    8: "Social Media",
    9: "Language",
    10: "Self-Help",
    11: "Education",
    12: "Architecture",
    13: "Multimedia",
    14: "Space",
}

# replace() returns a NEW frame, so the numeric tsne_labels stays intact.
# (The previous `topic_names = tsne_labels` only aliased the frame, so every
# assignment below it silently overwrote tsne_labels as well.) Indices without
# an entry in TOPIC_LABELS are left as they are.
topic_names = tsne_labels.replace(TOPIC_LABELS)


In [None]:
topic_names
#df['a'].value_counts()

In [None]:
# Store the text labels for plotting in two formats: CSV and pickle.
topic_names.to_csv('topic_names.csv')

with open('topic_names.pkl', 'wb') as topic_names_file:
    pickle.dump(topic_names, topic_names_file)


In [17]:
import pyLDAvis, pyLDAvis.sklearn
from IPython.display import display

# Setup to run in Jupyter notebook
# Enable pyLDAvis's notebook mode.
pyLDAvis.enable_notebook()

# Arguments, in order: the fitted LDA model, the vectorised corpus, and the
# fitted vectorizer.
vis = pyLDAvis.sklearn.prepare(
    lda,
    nlp.transform(cleaned_document),
    nlp.vectorizer,
)

# To export a standalone HTML page instead:
# pyLDAvis.save_html(vis, 'lda.html')

# Show the visualisation in the cell output.
display(vis)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# Second-Best LDA Run
#### Settings: ngrams (1, 2), max_df = 0.4, min_df = 0, max_features = 3,500, n_topics = 17, n_iter = 70, random_state = 42
Topic  0
brain, patient, health, disease, cancer, doctor, drug, medical, body, treatment, hospital, heart, study, baby, child, blood, medicine, surgery, research, neuron

Topic  1
government, law, american, police, case, security, violence, state, military, prison, group, attack, united_states, weapon, soldier, crime, killed, conflict, gun, afghanistan

Topic  2
africa, india, global, china, government, economy, economic, child, african, growth, state, aid, poor, society, poverty, chinese, billion, population, community, family

Topic  3
computer, technology, game, machine, robot, video, play, sound, language, data, learning, student, brain, information, device, learn, music, algorithm, tool, pretty

Topic  4
water, ocean, fish, animal, coral, shark, boat, whale, plastic, specie, reef, mosquito, marine, dolphin, underwater, area, island, swim, deep, malaria

Topic  5
love, kid, friend, book, moment, told, night, child, mother, remember, week, later, family, head, minute, month, hour, room, knew, saw

Topic  6
data, internet, information, medium, online, network, social, phone, technology, web, open, book, google, facebook, page, government, friend, message, power, digital

Topic  7
design, building, art, project, space, create, image, artist, material, designer, piece, architecture, object, process, form, wall, museum, painting, built, light

Topic  8
science, brain, theory, rule, pattern, nature, model, physic, mind, reality, simple, scientist, force, law, consciousness, self, answer, sense, structure, line

Topic  9
planet, earth, light, universe, space, star, mar, galaxy, sun, billion, black, image, solar, dark, telescope, hole, energy, picture, fly, moon

Topic  10
city, car, energy, power, oil, street, building, air, fuel, technology, nuclear, road, mile, electricity, build, hour, solar, vehicle, built, half

Topic  11
child, school, kid, family, community, social, student, parent, self, experience, education, teacher, culture, young, society, group, black, american, love, learn

Topic  12
food, plant, climate, carbon, eat, farmer, water, climate change, bee, energy, global, vaccine, natural, grow, waste, billion, planet, feed, crop, solution

Topic  13
tree, water, foot, forest, earth, ice, animal, river, air, specie, cloud, mountain, bird, nature, planet, land, area, surface, body, picture

Topic  14
woman, girl, sex, boy, film, female, love, male, young, gender, sexual, baby, daughter, movie, marriage, partner, worker, village, age, feminist

Topic  15
cell, gene, dna, body, animal, molecule, genome, cancer, virus, genetic, specie, bacteria, technology, organism, biology, protein, tissue, environment, lab, evolution

Topic  16
company, money, dollar, business, market, cost, value, product, buy, pay, job, industry, decision, price, organization, financial, choice, spend, billion, billion dollar


# Other Subpar Results

In [18]:
from sklearn.decomposition import NMF, TruncatedSVD
n_comp = 20 # number of NMF components; the original inline note read "17", presumably an earlier value
nmf_cv = NMF(n_components=n_comp)
nmf_cv_data = nmf_cv.fit_transform(vectorized_docs) # fit NMF on the count matrix and keep its transformed output

In [19]:
# Show the 20 highest-weighted vocabulary terms for each NMF component.
display_topics(
    model=nmf_cv,
    feature_names=nlp.vectorizer.get_feature_names(),
    no_top_words=20,
)


Topic  0
love, book, experience, friend, moment, god, mind, music, told, art, somebody, self, asked, feeling, talking, sound, head, night, remember, happened

Topic  1
brain, neuron, body, memory, area, animal, mind, sleep, ability, region, control, fly, study, behavior, activity, light, cortex, consciousness, human brain, arm

Topic  2
woman, girl, sex, gender, boy, female, young, mother, pm, male, violence, heart, black, told, feminist, vagina, daughter, job, sexual, issue

Topic  3
cancer, tumor, body, blood, disease, drug, breast, cell, vessel, patient, muscle, treatment, blood vessel, protein, breast cancer, lung, tissue, field, stage, doctor

Topic  4
water, ocean, fish, animal, specie, shark, area, tree, coral, forest, plant, ice, foot, surface, river, whale, island, nature, light, land

Topic  5
government, money, africa, company, global, dollar, social, business, china, market, society, state, economic, political, economy, growth, india, democracy, job, term

Topic  6
kid, sc

In [35]:
# LSA via truncated SVD on the same count matrix, for comparison with LDA.
n_comp2 = 11
# TruncatedSVD's default solver is randomized, so without a seed the components
# differ from run to run; 42 matches the seed used for the LDA model.
lsa_cv = TruncatedSVD(n_components=n_comp2, random_state=42)
lsa_cv_data = lsa_cv.fit_transform(vectorized_docs)

In [46]:
# Show the 10 highest-weighted vocabulary terms for each LSA component.
display_topics(
    model=lsa_cv,
    feature_names=nlp.vectorizer.get_feature_names(),
    no_top_words=10,
)


Topic  0
woman, child, brain, technology, kid, data, school, city, love, water

Topic  1
brain, cell, body, cancer, light, neuron, technology, animal, computer, planet

Topic  2
brain, woman, cell, cancer, child, patient, disease, body, girl, neuron

Topic  3
cancer, cell, woman, disease, patient, drug, water, health, tumor, blood

Topic  4
woman, planet, earth, water, universe, space, light, black, star, galaxy

Topic  5
brain, woman, city, government, power, china, global, energy, neuron, market

Topic  6
child, food, water, brain, city, kid, school, family, animal, ocean

Topic  7
city, building, design, car, cell, woman, space, street, cancer, project

Topic  8
data, child, universe, city, galaxy, black, space, hole, black hole, star

Topic  9
data, water, robot, woman, car, food, computer, technology, child, machine

Topic  10
data, water, city, cancer, patient, ocean, love, health, doctor, fish


In [None]:
# Query the collection for talks whose "transcript" field matches the regex "love".
transcripts_collection.find({"transcript": {"$regex": "love"}})

In [None]:
# Count the talks whose "transcript" field matches the regex "love".
# Cursor.count() is deprecated (PyMongo 3.7) and removed in PyMongo 4;
# Collection.count_documents is the supported replacement.
transcripts_collection.count_documents({"transcript": {"$regex": "love"}})