In [None]:
from __future__ import print_function
import nltk, re, pickle, os
import pandas as pd
import numpy as np
from time import time

#from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, MWETokenizer
from nltk.stem import porter, WordNetLemmatizer

from nltk.corpus import stopwords
from nltk.util import ngrams

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load the talk metadata and the transcripts, then join the two tables on the
# talk URL (the column both files share).
ted_main = pd.read_csv('data/ted_main.csv')
ted_trans = pd.read_csv('data/transcripts.csv')
ted_all = ted_trans.merge(ted_main, on='url')

In [None]:
# Store each row's index as an explicit 'id' column, then pull out the
# transcript column that the rest of the notebook works on.
ted_all['id'] = ted_all.index
talks = ted_all['transcript']

# Sanity check: both counts should be the same number of talks.
print(len(ted_all))
print(len(talks))

In [None]:
def clean_text(text):
    
    """ 
    Takes in a corpus of documents and cleans each one.
    
    1. remove parentheticals
    2. tokenize into words using wordpunct
    3. drop tokens whose lowercase form is a stop word
    4. lemmatize 
    5. lowercase and drop stop words again (the lemma itself may be one)
    
    IN: text = an iterable of documents (strings); a single string is
        treated as a corpus containing one document
    
    OUT: cleaned text = a list with one string per document, holding that
         document's cleaned words joined by single spaces
    """

    # A bare string would otherwise be iterated character by character.
    if isinstance(text, str):
        text = [text]

    lemmizer = WordNetLemmatizer()

    # A set gives constant-time membership tests; every token is looked up
    # here up to twice, so a list makes the whole pass needlessly slow.
    stop = set(stopwords.words('english'))
    stop.update([
        '.', ',', ':', '...', '!"', '?"', "'", '"', ' - ', ' — ', ',"', '."', '!', ';', '♫♫', '♫',
        '.\'"', '[', ']', '—', ".\'", 'ok', 'okay', 'yeah', 'ya', 'stuff', ' 000 ', ' em ', "ll", "didn",
        ' oh ', 'thank', 'thanks', 'la', 'was', 'wa', '?', 'like', 'go', ' le ', ' ca ', ' I ', " ? ", "s", " t ", "ve", "re",
    ])
    # NOTE(review): the entries padded with spaces (' oh ', ' em ', ' 000 ', ...)
    # presumably never match, since the tokens compared against them carry no
    # surrounding whitespace -- confirm whether the unpadded words were meant.

    # Compile once instead of re-parsing the pattern for every document.
    parens = re.compile(r'\([^)]*\)')

    cleaned_text = []

    for post in text:
        cleaned_words = []

        # remove parentheticals
        clean_parens = parens.sub(' ', post)

        # tokenize into words
        for word in wordpunct_tokenize(clean_parens):

            # throw out any token whose lowercase form is a stop word
            if word.lower() in stop:
                continue

            # lemmatize to roots, then lowercase
            # NOTE(review): the token is lemmatized in its original case and
            # only lowercased afterwards -- confirm that is the intended order.
            low_word = lemmizer.lemmatize(word).lower()

            # keep if not in stopwords (yes, again)
            if low_word not in stop:
                cleaned_words.append(low_word)

        # one space-joined string of cleaned words per document
        cleaned_text.append(' '.join(cleaned_words))

    return cleaned_text

In [None]:
# Clean every transcript and report how long the pass took.
t0 = time()
cleaned_talks = clean_text(talks)
elapsed = time() - t0
print("Cleaned data in %0.3fs." % elapsed)

In [None]:
def get_recommendations(first_article,num_of_recs,topics,ted_data, model, vectorizer, training_vectors):
    """ 
    Print and return the talks closest to `first_article` in topic space.

    The article is vectorized with `vectorizer`, projected with `model`, and
    compared against `training_vectors` by brute-force cosine distance.

    IN:
      first_article    = one cleaned document (a string)
      num_of_recs      = number of neighbours to look up
      topics           = DataFrame whose first column holds the topic name of each row
      ted_data         = DataFrame of talk metadata, indexed by the same row positions
      model            = fitted topic model exposing .transform
      vectorizer       = fitted vectorizer exposing .transform
      training_vectors = topic-space matrix the neighbours are searched in

    OUT: (recommend_list, ss) = row positions of the neighbours and the
         matching cosine distances, as two flat arrays of equal length
    """

    new_vec = model.transform(
        vectorizer.transform([first_article]))

    nn = NearestNeighbors(n_neighbors=num_of_recs, metric='cosine', algorithm='brute')
    nn.fit(training_vectors)

    distances, indices = nn.kneighbors(new_vec)

    # A single query was passed in, so both results have exactly one row.
    recommend_list = indices[0]
    ss = np.asarray(distances).ravel()

    # Look the columns up by name when they exist: a positional index silently
    # points at a different column once another column is added to the frame
    # (the notebook appends an 'id' column after the merge).
    # NOTE(review): assumes the tags column is called 'tags'; falls back to
    # the original position when it is not -- confirm against the CSV header.
    if 'url' in ted_data.columns:
        url_col = ted_data['url']
    else:
        url_col = ted_data.iloc[:, 1]
    if 'tags' in ted_data.columns:
        tag_col = ted_data['tags']
    else:
        tag_col = ted_data.iloc[:, -3]

    for i, resp in enumerate(recommend_list):
        print('\nID: ', resp)
        print('Cosine Distance: ', ss[i])
        # str() keeps the concatenation safe when a value is not a string
        print('Topics: ' + str(topics.iloc[resp, 0]))
        print('URL: ' + str(url_col.iloc[resp]))
        print("TED's original tags: ")
        print(tag_col.iloc[resp])
        print("\n------------------------")

    return recommend_list, ss

In [None]:
def topic_mod_lda(data,topics=5,iters=10,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    """ vectorizer - turn words into numbers for each document(rows)
    then use Latent Dirichlet Allocation to get topics

    IN:
      data      = iterable of documents (strings)
      topics    = number of LDA components
      iters     = maximum number of LDA iterations
      ngram_min, ngram_max = n-gram range handed to the vectorizer
      max_df    = vectorizer max_df setting
      max_feats = vectorizer max_features setting

    OUT: (vectorizer, vect_data, lda, lda_dat) = the fitted CountVectorizer,
         the document-term matrix, the fitted LDA model and the
         document-topic matrix
    
    Also prints the 20 highest-weighted terms of every topic.
    """

    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)

    #  `fit (train), then transform` to convert text to a bag of words
    vect_data = vectorizer.fit_transform(data)

    lda = LatentDirichletAllocation(n_components=topics,
                                    max_iter=iters,
                                    random_state=42,
                                    learning_method='online',
                                    n_jobs=-1)

    lda_dat = lda.fit_transform(vect_data)

    # to display the top words of each topic, highest weight first
    def display_topics(model, feature_names, no_top_words):
        for ix, topic in enumerate(model.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement and fall back to the old name on older installs.
    feature_names_fn = (getattr(vectorizer, 'get_feature_names_out', None)
                        or vectorizer.get_feature_names)
    display_topics(lda, feature_names_fn(), 20)

    return vectorizer, vect_data, lda, lda_dat

In [None]:
# Fit a 20-topic LDA model on unigrams and bigrams, and time the run.
t0 = time()
vect_mod, vect_data, lda_mod, lda_data = topic_mod_lda(
    cleaned_talks,
    topics=20,
    iters=10,
    ngram_min=1,
    ngram_max=2,
    max_df=0.5,
    max_feats=2000,
)
elapsed = time() - t0
print("LDA done in %0.3fs." % elapsed)

In [None]:
# Assign every talk to its highest-weighted LDA topic.
topic_ind = np.argmax(lda_data, axis=1)
topic_labels = pd.DataFrame(topic_ind)

# Hand-assigned topic names, keyed by topic index.
topic_name_by_index = dict(enumerate([
    "family",
    "agriculture",
    "space",
    "environment",
    "global economy",
    "writing",
    "sounds",
    "belief, mortality",
    "transportation",
    "gaming",
    "architecture",
    "education",
    "neuroscience",
    "climate, energy",
    "politics",
    "robotics",
    "disease biology",
    "medicine",
    "technology, privacy",
    "war",
]))

# Build the names in a new one-column frame so topic_labels keeps the integer
# indices. Writing the strings into topic_labels in place would overwrite it
# and relies on the integer column being silently converted to objects.
topic_names = topic_labels[0].map(topic_name_by_index).to_frame()

In [None]:
# Recommend the 10 talks closest to talk 804 in LDA topic space.
rec_list, scores = get_recommendations(
    first_article=cleaned_talks[804],
    num_of_recs=10,
    topics=topic_names,
    ted_data=ted_all,
    model=lda_mod,
    vectorizer=vect_mod,
    training_vectors=lda_data,
)

In [None]:
def topic_mod_nmf(data, topics=5,iters=10,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    """ vectorizer - turn words into numbers for each document(rows)
    then use Non-negative Matrix Factorization (NMF) to get topics

    IN:
      data      = iterable of documents (strings)
      topics    = number of NMF components
      iters     = maximum number of NMF iterations
      ngram_min, ngram_max = n-gram range handed to the vectorizer
      max_df    = vectorizer max_df setting
      max_feats = vectorizer max_features setting

    OUT: (vectorizer, vect_data, nmf, nmf_dat) = the fitted CountVectorizer,
         the document-term matrix, the fitted NMF model and the
         document-topic matrix
    
    Also prints the 20 highest-weighted terms of every topic.
    """

    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)

    # build the vocabulary and convert the text to a bag of words in one step
    vect_data = vectorizer.fit_transform(data)

    nmf = NMF(n_components=topics,
                max_iter=iters,
                random_state=42)

    nmf_dat = nmf.fit_transform(vect_data)

    # to display the top words of each topic, highest weight first
    def display_topics(model_, feature_names, no_top_words):
        for ix, topic in enumerate(model_.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement and fall back to the old name on older installs.
    feature_names_fn = (getattr(vectorizer, 'get_feature_names_out', None)
                        or vectorizer.get_feature_names)
    display_topics(nmf, feature_names_fn(), 20)

    return vectorizer, vect_data, nmf, nmf_dat

In [None]:
# Fit a 20-topic NMF model on unigrams and bigrams, and time the run.
t0 = time()
vect_mod, vect_data, nmf_mod, nmf_data = topic_mod_nmf(
    cleaned_talks,
    topics=20,
    iters=100,
    ngram_min=1,
    ngram_max=2,
    max_df=0.6,
    max_feats=2000,
)
elapsed = time() - t0
print("NMF done in %0.3fs." % elapsed)

In [None]:
# Assign every talk to its highest-weighted NMF topic.
topic_ind = np.argmax(nmf_data, axis=1)
topic_labels = pd.DataFrame(topic_ind)

# Hand-assigned topic names, keyed by topic index.
# NOTE(review): this is the same name list the notebook applies to the LDA
# topics; confirm that topic k of the NMF model really matches these names.
topic_name_by_index = dict(enumerate([
    "family",
    "agriculture",
    "space",
    "environment",
    "global economy",
    "writing",
    "sounds",
    "belief, mortality",
    "transportation",
    "gaming",
    "architecture",
    "education",
    "neuroscience",
    "climate, energy",
    "politics",
    "robotics",
    "disease biology",
    "medicine",
    "technology, privacy",
    "war",
]))

# Build the names in a new one-column frame so topic_labels keeps the integer
# indices. Writing the strings into topic_labels in place would overwrite it
# and relies on the integer column being silently converted to objects.
topic_names = topic_labels[0].map(topic_name_by_index).to_frame()

In [None]:
def get_recommendations(first_article,num_of_recs,topics,ted_data, model, vectorizer, training_vectors):
    """ 
    Print and return the talks closest to `first_article` in topic space.

    The article is vectorized with `vectorizer`, projected with `model`, and
    compared against `training_vectors` by brute-force cosine distance.
    This re-definition replaces any earlier get_recommendations in the
    notebook; it differs only in how each result is printed.

    IN:
      first_article    = one cleaned document (a string)
      num_of_recs      = number of neighbours to look up
      topics           = DataFrame whose first column holds the topic name of each row
      ted_data         = DataFrame of talk metadata, indexed by the same row positions
      model            = fitted topic model exposing .transform
      vectorizer       = fitted vectorizer exposing .transform
      training_vectors = topic-space matrix the neighbours are searched in

    OUT: (recommend_list, ss) = row positions of the neighbours and the
         matching cosine distances, as two flat arrays of equal length
    """

    new_vec = model.transform(
        vectorizer.transform([first_article]))

    nn = NearestNeighbors(n_neighbors=num_of_recs, metric='cosine', algorithm='brute')
    nn.fit(training_vectors)

    distances, indices = nn.kneighbors(new_vec)

    # A single query was passed in, so both results have exactly one row.
    recommend_list = indices[0]
    ss = np.asarray(distances).ravel()

    # Look the columns up by name when they exist: a positional index silently
    # points at a different column once another column is added to the frame
    # (the notebook appends an 'id' column after the merge).
    # NOTE(review): assumes the tags column is called 'tags'; falls back to
    # the original position when it is not -- confirm against the CSV header.
    if 'url' in ted_data.columns:
        url_col = ted_data['url']
    else:
        url_col = ted_data.iloc[:, 1]
    if 'tags' in ted_data.columns:
        tag_col = ted_data['tags']
    else:
        tag_col = ted_data.iloc[:, -3]

    for i, resp in enumerate(recommend_list):
        print('\n--- ID ---\n', resp)
        print('--- distance ---\n', ss[i])
        print('--- topic ---')
        print(topics.iloc[resp, 0])
        print(url_col.iloc[resp])
        print('--- teds tags ---')
        print(tag_col.iloc[resp])

    return recommend_list, ss

In [None]:
# Recommend the 10 talks closest to talk 804 in NMF topic space.
rec_list, scores = get_recommendations(
    first_article=cleaned_talks[804],
    num_of_recs=10,
    topics=topic_names,
    ted_data=ted_all,
    model=nmf_mod,
    vectorizer=vect_mod,
    training_vectors=nmf_data,
)

In [None]:
def topic_mod_lsa(data, topics=5,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    """ vectorizer - turn words into numbers for each document(rows)
    then use Latent Semantic Analysis (truncated SVD) to get topics

    IN:
      data      = iterable of documents (strings)
      topics    = number of SVD components
      ngram_min, ngram_max = n-gram range handed to the vectorizer
      max_df    = vectorizer max_df setting
      max_feats = vectorizer max_features setting

    OUT: (vectorizer, vect_data, lsa, lsa_dat) = the fitted CountVectorizer,
         the document-term matrix, the fitted TruncatedSVD model and the
         document-topic matrix
    
    Also prints the 20 highest-weighted terms of every topic.
    """

    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)

    # build the vocabulary and convert the text to a bag of words in one step
    vect_data = vectorizer.fit_transform(data)

    # the raw counts go into the SVD as they are; no scaling is applied here
    lsa = TruncatedSVD(n_components=topics,random_state=42)

    lsa_dat = lsa.fit_transform(vect_data)

    # to display the top words of each topic, highest weight first
    def display_topics(model_, feature_names, no_top_words):
        for ix, topic in enumerate(model_.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement and fall back to the old name on older installs.
    feature_names_fn = (getattr(vectorizer, 'get_feature_names_out', None)
                        or vectorizer.get_feature_names)
    display_topics(lsa, feature_names_fn(), 20)

    return vectorizer, vect_data, lsa, lsa_dat

In [None]:
# Fit a 20-component LSA model on unigrams and bigrams, and time the run.
t0 = time()
vect_mod, vect_data, lsa_mod, lsa_data = topic_mod_lsa(
    cleaned_talks,
    topics=20,
    ngram_min=1,
    ngram_max=2,
    max_df=0.6,
    max_feats=2000,
)
elapsed = time() - t0
print("LSA done in %0.3fs." % elapsed)

In [None]:
# Assign every talk to its highest-scoring LSA component.
topic_ind = np.argmax(lsa_data, axis=1)
topic_labels = pd.DataFrame(topic_ind)

# Hand-assigned topic names, keyed by topic index.
# NOTE(review): this is the same name list the notebook applies to the LDA
# topics; confirm that component k of the LSA model really matches these names.
topic_name_by_index = dict(enumerate([
    "family",
    "agriculture",
    "space",
    "environment",
    "global economy",
    "writing",
    "sounds",
    "belief, mortality",
    "transportation",
    "gaming",
    "architecture",
    "education",
    "neuroscience",
    "climate, energy",
    "politics",
    "robotics",
    "disease biology",
    "medicine",
    "technology, privacy",
    "war",
]))

# Build the names in a new one-column frame so topic_labels keeps the integer
# indices. Writing the strings into topic_labels in place would overwrite it
# and relies on the integer column being silently converted to objects.
topic_names = topic_labels[0].map(topic_name_by_index).to_frame()

In [None]:
# Recommend the 10 talks closest to talk 804 in LSA topic space.
rec_list, scores = get_recommendations(
    first_article=cleaned_talks[804],
    num_of_recs=10,
    topics=topic_names,
    ted_data=ted_all,
    model=lsa_mod,
    vectorizer=vect_mod,
    training_vectors=lsa_data,
)

In [None]:
def topic_mod_lsa_t(data, topics=5,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    """ vectorizer - turn words into TF-IDF weights for each document(rows)
    then use Latent Semantic Analysis (truncated SVD) to get topics

    IN:
      data      = iterable of documents (strings)
      topics    = number of SVD components
      ngram_min, ngram_max = n-gram range handed to the vectorizer
      max_df    = vectorizer max_df setting
      max_feats = vectorizer max_features setting

    OUT: (vectorizer, vect_data, lsa_t, lsa_t_dat) = the fitted
         TfidfVectorizer, the TF-IDF matrix as produced by the vectorizer
         (before the Normalizer step), the fitted TruncatedSVD model and the
         document-topic matrix
    
    Also prints the 20 highest-weighted terms of every topic.
    """

    vectorizer = TfidfVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)

    vect_data = vectorizer.fit_transform(data)

    # The SVD is fitted on the Normalizer output, not on vect_data directly.
    # NOTE(review): get_recommendations feeds vectorizer output straight to
    # the model without this step -- confirm the two are equivalent here.
    normalizer = Normalizer()
    vect_scale = normalizer.fit_transform(vect_data)

    lsa_t = TruncatedSVD(n_components=topics,random_state=42)

    lsa_t_dat = lsa_t.fit_transform(vect_scale)

    # to display the top words of each topic, highest weight first
    def display_topics(model_, feature_names, no_top_words):
        for ix, topic in enumerate(model_.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

    # get_feature_names() no longer exists in current scikit-learn; prefer its
    # replacement and fall back to the old name on older installs.
    feature_names_fn = (getattr(vectorizer, 'get_feature_names_out', None)
                        or vectorizer.get_feature_names)
    display_topics(lsa_t, feature_names_fn(), 20)

    return vectorizer, vect_data, lsa_t, lsa_t_dat

In [None]:
# Fit a 20-component LSA model on TF-IDF unigrams and bigrams, and time the run.
t0 = time()
vect_mod, vect_data, lsa_t_mod, lsa_t_data = topic_mod_lsa_t(
    cleaned_talks,
    topics=20,
    ngram_min=1,
    ngram_max=2,
    max_df=0.6,
    max_feats=2000,
)
elapsed = time() - t0
print("LSA_T done in %0.3fs." % elapsed)

In [None]:
# Assign every talk to its highest-scoring TF-IDF LSA component.
topic_ind = np.argmax(lsa_t_data, axis=1)
topic_labels = pd.DataFrame(topic_ind)

# Hand-assigned topic names, keyed by topic index.
# NOTE(review): this is the same name list the notebook applies to the LDA
# topics; confirm that component k of this model really matches these names.
topic_name_by_index = dict(enumerate([
    "family",
    "agriculture",
    "space",
    "environment",
    "global economy",
    "writing",
    "sounds",
    "belief, mortality",
    "transportation",
    "gaming",
    "architecture",
    "education",
    "neuroscience",
    "climate, energy",
    "politics",
    "robotics",
    "disease biology",
    "medicine",
    "technology, privacy",
    "war",
]))

# Build the names in a new one-column frame so topic_labels keeps the integer
# indices. Writing the strings into topic_labels in place would overwrite it
# and relies on the integer column being silently converted to objects.
topic_names = topic_labels[0].map(topic_name_by_index).to_frame()

In [None]:
def get_recommendations(first_article,num_of_recs,topics,ted_data, model, vectorizer, training_vectors):
    """ 
    Print and return the talks closest to `first_article` in topic space.

    The article is vectorized with `vectorizer`, projected with `model`, and
    compared against `training_vectors` by brute-force cosine distance.
    This re-definition replaces any earlier get_recommendations in the
    notebook.

    IN:
      first_article    = one cleaned document (a string)
      num_of_recs      = number of neighbours to look up
      topics           = DataFrame whose first column holds the topic name of each row
      ted_data         = DataFrame of talk metadata, indexed by the same row positions
      model            = fitted topic model exposing .transform
      vectorizer       = fitted vectorizer exposing .transform
      training_vectors = topic-space matrix the neighbours are searched in

    OUT: (recommend_list, ss) = row positions of the neighbours and the
         matching cosine distances, as two flat arrays of equal length
    """

    new_vec = model.transform(
        vectorizer.transform([first_article]))

    nn = NearestNeighbors(n_neighbors=num_of_recs, metric='cosine', algorithm='brute')
    nn.fit(training_vectors)

    distances, indices = nn.kneighbors(new_vec)

    # A single query was passed in, so both results have exactly one row.
    recommend_list = indices[0]
    ss = np.asarray(distances).ravel()

    # Look the columns up by name when they exist: a positional index silently
    # points at a different column once another column is added to the frame
    # (the notebook appends an 'id' column after the merge).
    # NOTE(review): assumes the tags column is called 'tags'; falls back to
    # the original position when it is not -- confirm against the CSV header.
    if 'url' in ted_data.columns:
        url_col = ted_data['url']
    else:
        url_col = ted_data.iloc[:, 1]
    if 'tags' in ted_data.columns:
        tag_col = ted_data['tags']
    else:
        tag_col = ted_data.iloc[:, -3]

    for i, resp in enumerate(recommend_list):
        print('\n--- ID ---\n', resp)
        print('--- distance ---\n', ss[i])
        print('--- topic ---')
        print(topics.iloc[resp, 0])
        print(url_col.iloc[resp])
        print('--- teds tags ---')
        print(tag_col.iloc[resp])

    return recommend_list, ss

In [None]:
# Recommend the 10 talks closest to talk 804 in TF-IDF LSA topic space.
rec_list, scores = get_recommendations(
    first_article=cleaned_talks[804],
    num_of_recs=10,
    topics=topic_names,
    ted_data=ted_all,
    model=lsa_t_mod,
    vectorizer=vect_mod,
    training_vectors=lsa_t_data,
)