## Code to search Google and use Watson NLU to read information from web URLs
### First, we create functions that call the Watson API and build dataframes from the supplied URLs
A Python example for Watson can be found at:
https://github.com/watson-developer-cloud/python-sdk/blob/master/examples/natural_language_understanding_v1.py

Watson credentials from:
https://console.bluemix.net/services/natural-language-understanding/9db7a144-0f4b-4b21-beda-59440f9fcee3?paneId=credentials&env_id=ibm:yp:eu-gb&region=eu-gb

Helpful:
https://www.ibm.com/watson/developercloud/natural-language-understanding/api/v1/?python#post-analyze

https://natural-language-understanding-demo.ng.bluemix.net/

In [1]:
from __future__ import print_function
import json
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, EntitiesOptions, ConceptsOptions, KeywordsOptions, SentimentOptions,MetadataOptions
import pandas as pd

def website_info(link, limit=3, full_text=False, username='x', password='x'):
    """Analyze the page behind *link* with Watson Natural Language Understanding.

    Args:
        link: URL of the page the service should fetch and analyze.
        limit: Maximum number of entities, keywords and concepts requested.
        full_text: When true, ask the service to include the analyzed text
            in the response.
        username: Service credential. The default ``'x'`` is only a
            placeholder and must be replaced with a real value.
        password: Service credential, same remark as ``username``.

    Returns:
        The object returned by ``analyze``. The other helpers in this file
        index it like a nested dict ('metadata', 'concepts', 'entities',
        'keywords', 'sentiment' and, with ``full_text``, 'analyzed_text').
    """
    # Client configuration: API version date plus credentials.
    client = NaturalLanguageUnderstandingV1(
        version='2017-02-27',
        username=username,
        password=password)

    # One request covering every feature the notebook uses afterwards.
    requested = Features(
        entities=EntitiesOptions(limit=limit),
        keywords=KeywordsOptions(limit=limit),
        concepts=ConceptsOptions(limit=limit),
        sentiment=SentimentOptions(),
        metadata=MetadataOptions())

    return client.analyze(
        url=link,
        features=requested,
        return_analyzed_text=full_text)

def make_series(response):
    """Flatten one Watson NLU response into a ``pd.Series``.

    The series holds the page title and publication date, the text and
    relevance of every returned concept, entity and keyword (as parallel
    lists), and the document-level sentiment label and score.
    """
    meta = response['metadata']
    title = meta['title']
    pub_date = meta['publication_date']

    def _texts_and_scores(section):
        # Split a list of {'text': ..., 'relevance': ...} items into two lists.
        items = response[section]
        texts = [item['text'] for item in items]
        scores = [item['relevance'] for item in items]
        return texts, scores

    concepts, concept_scores = _texts_and_scores('concepts')
    entities, entity_scores = _texts_and_scores('entities')
    keywords, keyword_scores = _texts_and_scores('keywords')

    doc_sentiment = response['sentiment']['document']

    return pd.Series({
        'title': title,
        "pub_date": pub_date,
        "concepts": concepts,
        "entities": entities,
        "keywords": keywords,
        "sentiment": doc_sentiment['label'],
        "concept_scores": concept_scores,
        "entity_scores": entity_scores,
        "keyword_scores": keyword_scores,
        "sentiment_score": doc_sentiment['score'],
    })


def create_df(website_list, depth=3, full_text=False):
    """Build a dataframe with one column of Watson results per website.

    Args:
        website_list: Iterable of URLs to analyze.
        depth: Maximum number of concepts/entities/keywords per URL
            (passed to ``website_info`` as its ``limit``).
        full_text: Forwarded to ``website_info``. The resulting frame does
            not contain the text itself, since ``make_series`` does not
            read it.

    Returns:
        A ``pd.DataFrame`` whose columns are numbered in input order. A URL
        that could not be processed gets an all-NaN column, so column
        positions keep matching positions in ``website_list``.
    """
    columns = []
    for number, website in enumerate(website_list):
        # Some URLs make the service fail (HTTP 400 etc.) or return a
        # response that lacks a field; report them and carry on.
        try:
            response = website_info(website, depth, full_text=full_text)
            column = make_series(response)
        except Exception:
            # ``Exception`` rather than a bare except, so Ctrl-C still stops
            # a long crawl. ``number`` comes from enumerate, so duplicate
            # URLs report their real position.
            print("Error: index ", number, " & ", website, "\n")
            column = pd.Series(dtype=object)
        columns.append(column)

    if not columns:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat at the end avoids re-copying the frame for every URL.
    return pd.concat(columns, axis=1, ignore_index=True)

### For testing:

In [135]:
# Sample pages about 3D printing used to try out create_df.
urls = [
    'https://en.wikipedia.org/wiki/3D_printing',
    'https://3dprinting.com/what-is-3d-printing/',
    'https://www.3dhubs.com/what-is-3d-printing',
    'https://newatlas.com/3d-printing/',
    'http://explainingthefuture.com/3dprinting.html',
]
# Keep the individual names available as well.
url1, url2, url3, url4, url5 = urls

In [136]:
create_df(urls,3)

Unnamed: 0,0,1,2,3,4
concept_scores,"[0.948942, 0.911126, 0.838771]","[0.985215, 0.911869, 0.903174]","[0.980936, 0.960941, 0.928611]","[0.954263, 0.815574, 0.668667]","[0.982469, 0.952669, 0.938025]"
concepts,"[Rapid prototyping, Solid freeform fabrication...","[Fused deposition modeling, Selective laser si...","[Inkjet printer, Printing, 3D printing]","[Printing, Inkjet printer, Printmaking]","[Fused deposition modeling, Selective laser si..."
entities,"[Additive Manufacturing, 3D Systems Corporatio...","[Additive Manufacturing, 3D Systems, 3D Printi...","[SLS, 3D Systems]","[conductive hearing loss, WinSun Construction,...","[3D Systems, Stratasys]"
entity_scores,"[0.809033, 0.327032, 0.304848]","[0.931813, 0.659196, 0.635247]","[0.775535, 0.640176]","[0.775766, 0.709787, 0.704553]","[0.81396, 0.62321]"
keyword_scores,"[0.993946, 0.825792, 0.702506]","[0.943492, 0.823687, 0.734882]","[0.958601, 0.678466, 0.638627]","[0.966438, 0.676264, 0.662354]","[0.926713, 0.55906, 0.474407]"
keywords,"[3d printing, additive manufacturing, 3d printer]","[3d printing, 3d printer, additive manufacturing]","[3d printing, 3D printing technologies, 3d pri...","[3D printing, additive manufacturing, dominant...","[3d printing, 3D printer, 3D printing technolo..."
pub_date,2013-10-30T00:00:00,,,2018-01-22T00:00:00,
sentiment,positive,positive,positive,negative,positive
sentiment_score,0.182976,0.52198,0.557877,-0.0318873,0.465127
title,3D printing - Wikipedia,What is 3D printing? How does 3D printing work...,What is 3D Printing? The definitive guide,New Atlas,ExplainingTheFuture.com : 3D Printing


### Now use the functions above to compute Watson statistics for Google search results
#### Information about the Google search package can be found at:
https://github.com/abenassi/Google-Search-API

In [28]:
import os
os.chdir("C:/Users/Ruben.Sikkes/Desktop/Google-Search-API-master")
from google2 import google

In [44]:
search_results = google.search("winter olympics", 2)

In [45]:
search_results[1]

GoogleResult(name=Olympische Winterspelen 2018 - Wikipedia
             description=... Josh Martin, South Korea likely to lose more ..

In [46]:
# Keep only the link of every search hit, in result order.
url_list = [hit.link for hit in search_results]

In [47]:
url_list

['https://en.wikipedia.org/wiki/Winter_Olympic_Games',
 'https://nl.wikipedia.org/wiki/Olympische_Winterspelen_2018',
 'https://en.wikipedia.org/wiki/2018_Winter_Olympics',
 'https://www.olympic.org/winter-games',
 'https://www.pyeongchang2018.com/',
 'https://www.pyeongchang2018.com/en/schedule',
 'http://www.bbc.co.uk/sport/winter-olympics',
 'https://www.theguardian.com/sport/live/2018/feb/20/winter-olympics-2018-day-11-from-pyeongchang-live',
 'https://www.theguardian.com/sport/winter-olympics-2018',
 'https://www.cnn.com/2018/02/18/sport/olympics-2018-live-results/index.html',
 'http://www.bbc.com/sport/live/winter-olympics/42496749',
 'http://www.independent.co.uk/sport/olympics/winter-olympics/winter-olympics-2018-pyeongchang-elizabeth-swaney-ski-halfpipe-video-average-a8218991.html',
 'https://www.nytimes.com/news-event/winter-olympics-2018',
 'http://www.telegraph.co.uk/winter-olympics/2018/02/20/elise-christie-winter-olympics-1000m-heats-live-updates/',
 'https://www.cbssport

In [48]:
result_frame = create_df(url_list,3) # 3 concepts etc 

Error: index  1  &  https://nl.wikipedia.org/wiki/Olympische_Winterspelen_2018 



In [49]:
result_frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
concept_scores,"[0.912758, 0.663091, 0.560763]",,"[0.934213, 0.783677, 0.738847]","[0.885315, 0.521781, 0.519242]",[],[],"[0.867021, 0.818339, 0.753886]","[0.96275, 0.82593, 0.620334]","[0.882585, 0.853388, 0.79672]","[0.91136, 0.8987, 0.82944]","[0.939911, 0.765394, 0.693791]","[0.966634, 0.872101, 0.776912]","[0.977188, 0.893517, 0.887105]","[0.980961, 0.969235, 0.892837]","[0.964283, 0.945784, 0.767023]","[0.96352, 0.910208, 0.739116]","[0.961522, 0.841134, 0.708976]",[],"[0.944678, 0.817737, 0.779662]","[0.976509, 0.80136, 0.632459]"
concepts,"[Winter Olympic Games, Olympic Games, Summer O...",,"[Winter Olympic Games, Olympic Games, Summer O...","[Winter Olympic Games, Olympic Games, Summer O...",[],[],"[Ice skating, Skating, Olympic sports]","[Figure skating, Ice dancing, Gold medal]","[Canada, Snowboarding, Monday]","[World, Instant messaging, Earth]","[Olympic sports, Ancient Olympic Games, Olympi...","[Winter Olympic Games, Skiing, 2014 Winter Oly...","[Olympic Games, United States, Winter Olympic ...","[Summer Olympic Games, United States, Olympic ...","[Ice hockey, United States, Developed country]","[Olympic Games, Summer Olympic Games, Winter O...","[Igor Shpilband, 2008 World Figure Skating Cha...",[],"[Winter Olympic Games, Olympic Games, 2014 Win...","[Figure skating, World Figure Skating Champion..."
entities,"[International Olympic Committee, Olympics]",,"[Olympics, International Olympic Committee]",[Innsbruck],[],[],"[Pyeongchang, Elise Christie, 1,000m]","[Elise Christie, France, Germany]","[Elizabeth Swaney, USA]",[Facebook],"[Mica McNeill, Sochi, Elana Meyers Taylor]","[Elizabeth Swaney, Olympics]","[gold medal, United States, Olympics]","[Elise Christie, Olympics, Beijing]","[United States, Brita Sigourney, Tessa Virtue]","[Elise Christie, BBC Sport, Olympics]","[Scott Moir, United States, Gangeung Ice Arena]",[],"[Australia, gold medal]","[Gabriella Papadakis, Olympics, Scott Moir]"
entity_scores,"[0.840624, 0.67898]",,"[0.890542, 0.651313]",[0.511656],[],[],"[0.33, 0.33, 0.33]","[0.753063, 0.548814, 0.485924]","[0.981051, 0.431312]",[0.92192],"[0.948166, 0.903067, 0.777128]","[0.839485, 0.364885]","[0.799254, 0.461324, 0.45749]","[0.889031, 0.795949, 0.55237]","[0.953844, 0.704798, 0.63382]","[0.876983, 0.290827, 0.286331]","[0.741883, 0.632102, 0.548932]",[],"[0.931816, 0.748618]","[0.845735, 0.476215, 0.378737]"
keyword_scores,"[0.9689, 0.930526, 0.658011]",,"[0.951441, 0.798144, 0.795828]","[0.982951, 0.783055, 0.748441]","[0.985785, 0.760906]","[0.985785, 0.760906]","[0.96546, 0.864712, 0.817763]","[0.903716, 0.776348, 0.755389]","[0.95309, 0.840775, 0.769759]","[0.97042, 0.510723]","[0.905057, 0.887805, 0.875952]","[0.904368, 0.826306, 0.654188]","[0.982564, 0.861283, 0.679217]","[0.991329, 0.808939, 0.785873]","[0.96812, 0.95588, 0.928995]","[0.92099, 0.920157, 0.914478]","[0.936027, 0.926233, 0.900533]","[0.953954, 0.89182, 0.697687]","[0.996129, 0.941461, 0.873105]","[0.998134, 0.839016]"
keywords,"[International Olympic Committee, winter olymp...",,"[Winter Olympics, Olympic Games, South Korea]","[olympic games, Summer Olympic Games, salt lak...","[permission, server]","[permission, server]","[Elise Christie, live coverage, final event]","[short track, Elise Christie, East German cross]","[brilliant final performance, Elizabeth Swaney...","[Facebook Messenger, world]","[Elise Christie crash, Elana Meyers Taylor, du...","[World Cup, Swaney, Winter Olympics]","[gold medal, United States, Marcel Hirscher]","[yellow card, Elise Christie, short track rules]","[Canadians Tessa Virtue, United States men, Mo...","[yellow card, speed skating competition, Polis...","[partners Tessa Virtue, Olympic ice dancing, G...","[permission, Access, Reference]","[Winter Olympic medal, medal tally, snowboard ...","[French ice dancer, wardrobe malfunction]"
pub_date,2014-02-17T00:00:00,,2018-02-09T00:00:00,2018-01-11T00:00:00,,,2018-02-01T00:00:00,2018-02-20T00:00:00,2018-02-20T00:00:00,2018-02-18T00:00:00,2018-02-20T00:00:00,2018-02-20T00:00:00,1970-08-22T13:35:18,2018-02-20T00:00:00,,2018-02-20T00:00:00,2018-02-20T00:00:00,,2018-02-20T00:00:00,2018-02-19T00:00:00
sentiment,negative,,negative,positive,negative,negative,neutral,negative,positive,positive,positive,positive,positive,negative,negative,negative,positive,negative,positive,positive
sentiment_score,-0.135278,,-0.302385,0.448928,-0.617792,-0.617792,0,-0.332743,0.625964,0.591439,0.255634,0.0252721,5.62693e-05,-0.28363,-0.564838,-0.205281,0.187426,-0.686658,0.251813,0.133842
title,Winter Olympic Games - Wikipedia,,2018 Winter Olympics - Wikipedia,Winter Olympics - Past and Future Summer Olymp...,Access Denied,Access Denied,Winter Olympics - BBC Sport,Winter Olympics 2018: all the latest from day ...,Winter Olympics 2018 | Sport | The Guardian,Winter Olympics day 10: Results and live updates,"Watch Winter Olympics - Christie reaction, bob...",Elizabeth Swaney becomes overnight hero by bei...,Winter Olympics 2018,Elise Christie suffers another Winter Olympics...,Winter Olympics 2018 Monday highlights: USA ho...,Fresh Winter Olympics heartbreak for Elise Chr...,Winter Olympics 2018: Who Took Home a Pairs' I...,Access Denied,Young team of Aussies on the rise at Winter Ol...,French ice dance team rebounds from wardrobe m...


In [50]:
result_frame.loc['entities'][4]

[]

In [51]:
from collections import Counter
from itertools import chain

# One cell per analysed URL; URLs that failed hold NaN instead of a list.
word_list = list(result_frame.loc['entities'])
cleanedList = [x for x in word_list if str(x) != 'nan']
# Flatten the per-URL lists into one list of entity names.
cleanedList = list(chain.from_iterable(cleanedList))
# Counter tallies in a single pass instead of one list.count call per
# distinct name; pairs come out in order of first appearance.
[[word, count] for word, count in Counter(cleanedList).items()]

[['Olympics', 7],
 ['Elizabeth Swaney', 2],
 ['Elise Christie', 3],
 ['gold medal', 2],
 ['Facebook', 1],
 ['Scott Moir', 2],
 ['Gangeung Ice Arena', 1],
 ['Brita Sigourney', 1],
 ['International Olympic Committee', 2],
 ['Sochi', 1],
 ['USA', 1],
 ['Elana Meyers Taylor', 1],
 ['France', 1],
 ['United States', 3],
 ['Tessa Virtue', 1],
 ['Mica McNeill', 1],
 ['1,000m', 1],
 ['BBC Sport', 1],
 ['Australia', 1],
 ['Germany', 1],
 ['Pyeongchang', 1],
 ['Beijing', 1],
 ['Gabriella Papadakis', 1],
 ['Innsbruck', 1],
 ['Elise\xa0Christie', 1]]

### Now we can sort entities/concepts/keywords by occurrence count and summed relevance

In [52]:
# Create a list of lists for names and scores and remove 'nan' columns /values
def get_importance(result_frame, name, value):
    """Rank the terms of one feature by occurrence count and summed relevance.

    Args:
        result_frame: Frame with one column per URL and one row per field,
            where the ``name`` and ``value`` rows hold parallel lists.
        name: Row label of the terms: 'entities', 'keywords' or 'concepts'.
        value: Row label of the matching scores: 'entity_scores',
            'keyword_scores' or 'concept_scores'.

    Returns:
        A ``pd.DataFrame`` indexed by term with the columns 'occurances'
        (number of times the term was returned) and 'relevance' (sum of its
        scores), sorted descending on both.
    """
    from collections import Counter

    occurances = Counter()
    relevance = {}
    # Walk the two rows column by column so every term is paired with the
    # score from the same URL.
    for terms, scores in zip(result_frame.loc[name], result_frame.loc[value]):
        # URLs that failed hold NaN instead of a list; nothing to count.
        if str(terms) == 'nan' or str(scores) == 'nan':
            continue
        occurances.update(terms)
        for term, score in zip(terms, scores):
            relevance[term] = relevance.get(term, 0) + score

    result = pd.DataFrame({'occurances': pd.Series(dict(occurances)),
                           'relevance': pd.Series(relevance)})
    return result.sort_values(['occurances', 'relevance'],
                              ascending=[False, False])
def get_url_from_word(result_frame, url_list, feature, word):
    """Return the URLs whose ``feature`` list contains ``word``.

    ``result_frame`` is the frame holding all results, ``url_list`` the list
    of URL strings its columns were built from, ``feature`` the row to look
    in ('concepts', 'keywords' or 'entities') and ``word`` the term to find,
    e.g. "3d printing".
    """
    def _mentions(cell):
        # NaN cells belong to URLs that could not be analysed.
        if str(cell) == 'nan':
            return 0
        return 1 if word in cell else 0

    # Flag every column whose feature cell contains the word ...
    flags = result_frame.loc[feature].apply(_mentions)
    msk = (flags == 1).values
    # ... and pick the URLs sitting at the flagged positions.
    return [url for position, url in enumerate(url_list) if msk[position]]

In [53]:
get_importance(result_frame, "concepts", "concept_scores")[1:10]

Unnamed: 0,occurances,relevance
Olympic Games,8,6.313622
Summer Olympic Games,5,3.710021
United States,3,2.808536
Figure skating,2,1.939259
Olympic sports,2,1.693797
2014 Winter Olympics,2,1.556574
Ice hockey,1,0.964283
Igor Shpilband,1,0.961522
World,1,0.91136


#### Find the url connected to the concept that we are interested in

In [54]:
get_url_from_word(result_frame,url_list, 'concepts', 'Igor Shpilband')

['http://tvline.com/2018/02/19/olympics-2018-pairs-ice-dancing-medal-results/']

### Use some standard language processing
#### What if the language on the site is Dutch?

In [55]:
search_results = google.search("toekomstplannen haven rotterdam", 2)
search_results[1]

GoogleResult(name=Havenvisie 2030 | Haven van Rotterdam
             description=De Havenvisie 2030 geeft de ambitie aan voor de t..

In [56]:
# Keep only the link of every search hit, in result order, and show the list.
url_list = [hit.link for hit in search_results]
url_list

['https://www.portofrotterdam.com/nl/havenkrant/havenkrant-editie-29/haven-van-de-toekomst-5-voorbeelden-en-uitdagingen',
 'https://www.portofrotterdam.com/nl/de-haven/havenvisie-2030',
 'https://www.maasvlakte2.com/nl/index/show/id/686/havenvisie-2030',
 'https://tps-levert.nl/blog/de-toekomst-van-de-rotterdamse-haven-trilogie-deel-3',
 'https://www.rijnmond.nl/nieuws/149514/Toekomst-Rotterdamse-haven-niet-de-grootste-maar-de-slimste',
 'https://www.ad.nl/economie/haven-rotterdam-is-binnen-dertig-jaar-uitgegroeid~a6c704a5/',
 'https://www.ad.nl/rotterdam/rotterdamse-haven-krijgt-fabriek-van-de-toekomst~af7c006f/',
 'https://www.rotterdam.nl/wonen-leven/stadsvisie/',
 'https://www.nrc.nl/nieuws/2017/04/26/afbraak-van-de-fossiele-haven-gaat-razendsnel-8434160-a1556221',
 'http://www.rotterdamviertdestad.nl/manifestatie/de-diepte-in/rotterdam-2040-een-blik-in-de-toekomst/',
 'https://www.scholieren.com/profielwerkstuk/20721',
 'https://bouwenuitvoering.nl/politiek_en_economie/grote-toeko

#### Now we will need to work with the raw url text
For this, we can again use the Watson helper defined at the beginning, this time with a slight twist: we also request the analyzed page text (`full_text=True`).

In [58]:
text_frame = website_info(url_list[1],full_text=True)

In [74]:
text_frame.keys()



In [82]:
text_frame.get('metadata', None)

{'authors': [{'name': 'Port Health Authority'}],
 'feeds': [{'link': 'https://www.portofrotterdam.com/nl/de-haven/nieuws-evenementen/rss'}],
 'image': 'https://www.portofrotterdam.com/sites/default/files/styles/por_is_column_visual/public/horizon-ships.jpg?itok=kg2QLnvO',
 'publication_date': '2015-06-09T00:00:00',
 'title': 'Havenvisie 2030'}

In [101]:
def make_dutch_series(response):
    """Reduce one Watson NLU response to title, publication date and raw text.

    The response has to carry 'analyzed_text', so it must come from a call
    that asked the service to return the analyzed text.
    """
    meta = response['metadata']
    return pd.Series({
        'title': meta['title'],
        "pub_date": meta['publication_date'],
        "text": response['analyzed_text'],
    })

def dutch_df(website_list, depth=3, full_text=False):
    """Build a dataframe with title, publication date and text per website.

    Args:
        website_list: Iterable of URLs to fetch.
        depth: Passed to ``website_info`` as its ``limit``.
        full_text: Kept so existing calls keep working, but ignored: the
            text is always requested because ``make_dutch_series`` reads
            'analyzed_text' from the response.

    Returns:
        A ``pd.DataFrame`` whose columns are numbered in input order. A URL
        that could not be processed gets an all-NaN column, so column
        positions keep matching positions in ``website_list``.
    """
    columns = []
    for number, website in enumerate(website_list):
        # Some URLs make the service fail (HTTP 400 etc.) or return a
        # response that lacks a field; report them and carry on.
        try:
            response = website_info(website, depth, full_text=True)
            column = make_dutch_series(response)
        except Exception:
            # ``Exception`` rather than a bare except, so Ctrl-C still stops
            # a long crawl. ``number`` comes from enumerate, so duplicate
            # URLs report their real position.
            print("Error: index ", number, " & ", website, "\n")
            column = pd.Series(dtype=object)
        columns.append(column)

    if not columns:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat at the end avoids re-copying the frame for every URL.
    return pd.concat(columns, axis=1, ignore_index=True)


        

In [103]:
dutch_frame = dutch_df(url_list)

Error: index  12  &  https://www.rotterdammakersdistrict.com/stadshavens/ 



In [104]:
dutch_frame

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
pub_date,2016-06-21T00:00:00,2015-06-09T00:00:00,2011-11-19T00:00:00,,2016-12-08T22:07:00,,,2018-01-02T00:00:00,2017-04-26T00:00:00,,,2016-07-06T00:00:00,,2017-04-12T00:00:00,,2017-03-16T00:00:00,,2008-09-01T00:00:00,2011-04-22T00:00:00
text,Haven van de Toekom...,Havenvisie 2030 | H...,Havenvisie 2030\n ...,De toekomst van de Rotterdamse haven (trilogie...,\t \t \t\t \t\t \t\tToekomst Rotterdamse...,\tCookies op AD.nl | AD.nl\n \t \t \t \t \t...,\tCookies op AD.nl | AD.nl\n \t \t \t \t \t...,Rotterdam.nl | Stadsvisie\n ...,\t \t\t \t\t \t\t \t\t‘Afbraak van de fossi...,\tRotterdam 2040: een blik in de toekomst ...,\tProfielwerkstuk Economie De Rotterdams...,\t \tGrote toekomstplannen Rot...,,Containeroversla...,'Het einde nadert voor Rotterd...,\t \t \tAmbitie haven Amsterdam: kolenvrij i...,\t \t \t \tWAAROM ROTTERDAM VRIENDE...,Tweede Maasvlakte - Wikipedia\n ...,Vroege Vogels...
title,Haven van de Toekomst,Havenvisie 2030,Havenvisie 2030,De toekomst van de Rotterdamse haven (trilogie...,Toekomst Rotterdamse haven: niet de grootste m...,Cookies op AD.nl | AD.nl,Cookies op AD.nl | AD.nl,Stadsvisie | Rotterdam.nl,‘Afbraak van de fossiele haven gaat razendsnel’,Rotterdam 2040: een blik in de toekomst,Profielwerkstuk Economie De Rotterdamse haven,Grote toekomstplannen Rotterdam en Den Haag - ...,,Containeroverslag Rotterdam groeit sterk | Sch...,'Het einde nadert voor Rotterdam Mainport',Ambitie haven Amsterdam: kolenvrij in 2030 en ...,WAAROM ROTTERDAM VRIENDEN WIL BLIJVEN MET RUSL...,Tweede Maasvlakte - Wikipedia,Vroege Vogels - Natuur en Milieu in Nederland


#### So now we have a dataframe with text, titles and publication dates.
How can we get topic information from retrieved documents? 

In [132]:
stopwoorden = ""

ModuleNotFoundError: No module named 'pytextrank'

In [None]:
# Whitespace-separated Dutch stop words. A string that spans several lines
# needs triple quotes; single double-quotes cannot contain a line break.
stopwoorden = """ ‘t a aan aangaande aangezien
achter
achterna
aen
af"""

In [111]:
# Text of the page in column 1. `.ix` was removed in pandas 1.0; `.loc`
# addresses the same cell by row label and column label.
text1 = dutch_frame.loc['text', 1]

In [112]:
import gensim
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import pickle
from nltk.corpus import wordnet as wn
nltk.download('punkt')

# Bag-of-words settings: a term must occur in at least 20 documents and in
# at most 20% of them; three Dutch articles are dropped as stop words and
# only tokens of three or more word characters are kept.
vect = CountVectorizer(
    min_df=20,
    max_df=0.2,
    stop_words=['de', 'het', 'een'],
    token_pattern='(?u)\\b\\w\\w\\w+\\b',
)
# Fit and transform
tokens = nltk.word_tokenize(text1)

# NOTE(review): fit_transform receives the list of single tokens, so every
# token appears to be treated as its own document. The 758x2 matrix and the
# two-word topics shown further down look consistent with that. Confirm
# whether whole texts or sentences were meant to be the documents.
X = vect.fit_transform(tokens)

# Convert sparse matrix to gensim corpus (rows of X are the documents).
corpus = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# gensim needs id -> term, the inverse of the vectorizer's term -> id map.
id_map = {index: term for term, index in vect.vocabulary_.items()}




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ruben.Sikkes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [129]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, id2word=id_map, num_topics=5, passes=25, random_state=34)


In [130]:
ldamodel.show_topics()

[(0, '0.982*"2030" + 0.018*"havenvisie"'),
 (1, '0.505*"havenvisie" + 0.495*"2030"'),
 (2, '0.510*"havenvisie" + 0.490*"2030"'),
 (3, '0.980*"2030" + 0.020*"havenvisie"'),
 (4, '0.992*"havenvisie" + 0.008*"2030"')]

In [128]:
X

<758x2 sparse matrix of type '<class 'numpy.int64'>'
	with 46 stored elements in Compressed Sparse Row format>