In [1]:
from __future__ import print_function
import nltk, re, pickle, os
import pandas as pd
import numpy as np

#from textblob import TextBlob
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, MWETokenizer
from nltk.stem import porter, WordNetLemmatizer

from nltk.corpus import stopwords
from nltk.util import ngrams

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation,  TruncatedSVD, NMF
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing  import  StandardScaler

import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

In [2]:
ted_main = pd.read_csv('/Users/Rithika/ted_main.csv')
ted_trans = pd.read_csv('/Users/Rithika/transcripts.csv')    
ted_all = pd.merge(ted_trans,right=ted_main,on='url')

In [4]:
with open('ted_all.pkl', 'wb') as picklefile:
    pickle.dump(ted_all, picklefile)

In [3]:
ted_all['id'] = ted_all.index

In [4]:
len(ted_all)

2467

In [5]:
talks = ted_all['transcript']

In [6]:
len(talks)

2467

In [7]:
# a function to clean one document only

def clean_text_onedoc(text):

    lemmizer = WordNetLemmatizer()
    stop = stopwords.words('english')
    stop += ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';',\
             '.\'"','[',']','—',".\'", 'ok','okay','yeah','ya','stuff', ' 000 ',' em ','get','got',\
             ' oh ','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", " t ","ve","re"]
    
    for word in wordpunct_tokenize(text): 
        cleaned = []
        if word.lower() not in stop:
            keepw = lemmizer.lemmatize(word)
            if keepw.lower not in stop:
                cleaned.append(keepw.lower())
                
    return cleaned

In [8]:
def clean_text(text):
    
    """ 
    Takes in a corpus of documents and cleans. ONly works with multiple docs for now
    
    1. remove parentheticals
    2. tokenize into words using wordpunct
    3. lowercase and remove stop words
    4. lemmatize 
    5. lowercase and remove stop words
    
    
    OUT: cleaned text = a list (documents) of lists (cleaned word in each doc)
    """

    lemmizer = WordNetLemmatizer()
    #stemmer = porter.PorterStemmer()

    stop = stopwords.words('english')
    stop += ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';','♫♫','♫',\
             '.\'"','[',']','—',".\'", 'ok','okay','yeah','ya','stuff', ' 000 ',' em ',\
             ' oh ','thank','thanks','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", " t ","ve","re"]
    #stop = set(stop)

    cleaned_text = []
    
    for post in text:
        cleaned_words = []
        
        # remove parentheticals
        clean_parens = re.sub(r'\([^)]*\)', ' ', post)
        
        # tokenize into words
        for word  in wordpunct_tokenize(clean_parens):  
            
            # lowercase and throw out any words in stop words
            if word.lower() not in stop:
            
                # lemmatize  to roots
                low_word = lemmizer.lemmatize(word)  

                # stem and lowercase ( an alternative to lemmatize)
                #low_word = stemmer.stem(root.lower())  
            
                # keep if not in stopwords (yes, again)
                if low_word.lower() not in stop: 
                    
                    # put into a list of words for each document
                    cleaned_words.append(low_word.lower())
        
        # keep corpus of cleaned words for each document    
        cleaned_text.append(' '.join(cleaned_words))
    
    return cleaned_text

In [9]:
cleaned_talks = clean_text(talks)

In [10]:
with open('cleaned_talks.pkl', 'wb') as picklefile:
    pickle.dump(cleaned_talks, picklefile)

In [11]:
# CountVectorizer is a class; so `vectorizer` below represents an instance of that object.
c_vectorizer = CountVectorizer(ngram_range=(1,3), 
                             stop_words='english', 
                             max_df = 0.6, 
                             max_features=10000)

t_vectorizer = TfidfVectorizer(ngram_range=(1, 3),  
                                   stop_words='english', 
                                   token_pattern="\\b[a-z][a-z]+\\b",
                                   lowercase=True,
                                   max_df = 0.6)


# call `fit` to build the vocabulary
c_vectorizer.fit(cleaned_talks)
# finally, call `transform` to convert text to a bag of words
c_x = c_vectorizer.transform(cleaned_talks)


# call `fit` to build the vocabulary
t_vectorizer.fit(cleaned_talks)
# finally, call `transform` to convert text to a bag of words
t_x = t_vectorizer.transform(cleaned_talks)

In [6]:
#open metadata and cleaned talks

with open('ted_all.pkl', 'rb') as picklefile:
    ted_all = pickle.load(picklefile)

    
with open('cleaned_talks.pkl', 'rb') as picklefile:
    cleaned_talks = pickle.load(picklefile) 

In [7]:
def topic_mod_lda(data,topics=5,iters=10,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    """ vectorizer - turn words into numbers for each document(rows)
    then use Latent Dirichlet Allocation to get topics"""
    
    
    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)
    
      
    #  `fit (train), then transform` to convert text to a bag of words

    vect_data = vectorizer.fit_transform(data)
    
    
    
    lda = LatentDirichletAllocation(n_components=topics,
                                    max_iter=iters,
                                    random_state=42,
                                    learning_method='online',
                                    n_jobs=-1)
    
    lda_dat = lda.fit_transform(vect_data)
    
    
    # to display a list of topic words and their scores 
    
    def display_topics(model, feature_names, no_top_words):
        for ix, topic in enumerate(model.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    
    display_topics(lda, vectorizer.get_feature_names(),20)
    
    
    return vectorizer, vect_data, lda, lda_dat

In [8]:
vect_mod, vect_data, lda_mod, lda_data = topic_mod_lda(cleaned_talks,
                                                       topics=20,
                                     iters=100,
                                     ngram_min=1, 
                                     ngram_max=2, 
                                     max_df=0.5, 
                                     max_feats=2000)

Topic  0
brain cell body patient blood neuron heart tissue organ surgery stem disease skin muscle bone sugar stem cell arm mouse animal
Topic  1
school child kid student teacher education girl parent learning learn class high old teach family young year old college high school percent
Topic  2
animal specie tree fish forest plant water nature bird coral 000 bee creature insect ant river living area natural dinosaur
Topic  3
country percent global million china united billion 000 war economy states united states india dollar growth economic climate africa population number
Topic  4
earth planet water ocean sea solar ice air mars space sun energy 000 mile foot cloud surface atmosphere temperature wind
Topic  5
government company country political social society power community business group law organization leader africa public value money state issue democracy
Topic  6
rule compassion moral flag bread zero stage non sum bad understand news good news transformation nature enemy win try 

In [9]:
topic_ind = np.argmax(lda_data, axis=1)
topic_ind.shape

(2467,)

In [12]:
y=topic_ind
topic_labels = pd.DataFrame(y)
topic_names = topic_labels
topic_names[topic_names==0] = "family"
topic_names[topic_names==1] = "agriculture"
topic_names[topic_names==2] = "space"
topic_names[topic_names==3] = "environment"
topic_names[topic_names==4] = "global economy"
topic_names[topic_names==5] = "writing"
topic_names[topic_names==6] = "sounds"
topic_names[topic_names==7] = "belief, mortality"
topic_names[topic_names==8] = "transportation"

topic_names[topic_names==9] = "gaming"
topic_names[topic_names==10] = "architecture"
topic_names[topic_names==11] = "education"

topic_names[topic_names==12] = "neuroscience"
topic_names[topic_names==13] = "climate, energy"

topic_names[topic_names==14] = "politics"
topic_names[topic_names==15] = "robotics"  
topic_names[topic_names==16] = "disease biology"
topic_names[topic_names==17] = "medicine"
topic_names[topic_names==18] = "technology, privacy"
topic_names[topic_names==19] = "war"

In [14]:
#save text labels to csv and pkl for plotting

topic_names.to_csv('topic_names.csv')

with open('topic_names.pkl', 'wb') as picklefile:
    pickle.dump(topic_names, picklefile)

In [15]:
def get_recommendations(first_article,num_of_recs,topics,ted_data, model, vectorizer, training_vectors):
    
    new_vec = model.transform(
        vectorizer.transform([first_article]))
    
    nn = NearestNeighbors(n_neighbors=num_of_recs, metric='cosine', algorithm='brute')
    nn.fit(training_vectors)
    
    results = nn.kneighbors(new_vec)
    
    recommend_list = results[1][0]
    scores = results[0]
                       
    ss = np.array(scores).flat
    for i, resp in enumerate(recommend_list):
        print('\n--- ID ---\n', + resp)
        print('--- distance ---\n', + ss[i])  
        print('--- topic ---')
        print(topics.iloc[resp,0])
        print(ted_data.iloc[resp,1])
        print('--- teds tags ---')
        print(ted_data.iloc[resp,-3])
        
    return recommend_list, ss  

In [16]:
rec_list, scores = get_recommendations(cleaned_talks[804],10, topic_names, ted_all,
                                       lda_mod, vect_mod, lda_data)


--- ID ---
 804
--- distance ---
 0.0
--- topic ---
politics
https://www.ted.com/talks/charles_limb_your_brain_on_improv

--- teds tags ---
['TEDx', 'brain', 'creativity', 'entertainment', 'music', 'science', 'technology']

--- ID ---
 1752
--- distance ---
 0.08891830375260179
--- topic ---
technology, privacy
https://www.ted.com/talks/nancy_kanwisher_the_brain_is_a_swiss_army_knife

--- teds tags ---
['brain', 'neuroscience', 'visualizations']

--- ID ---
 2122
--- distance ---
 0.09103100669662334
--- topic ---
politics
https://www.ted.com/talks/uri_hasson_this_is_your_brain_on_communication

--- teds tags ---
['brain', 'cognitive science', 'collaboration', 'communication', 'language', 'mind', 'neuroscience', 'science', 'speech']

--- ID ---
 1255
--- distance ---
 0.11806924992412071
--- topic ---
politics
https://www.ted.com/talks/sarah_jayne_blakemore_the_mysterious_workings_of_the_adolescent_brain

--- teds tags ---
['aging', 'biology', 'brain', 'children', 'cognitive science',