#### Load json files: 

In [2]:
import json
#from pprint import pprint

data_actor = json.load(open('actor.json'))
data_city = json.load(open('city.json'))
data_celestialbody = json.load(open('celestialbody.json'))
data_education = json.load(open('educationalInstitution.json'))
data_lake = json.load(open('lake.json'))

## Create Document:

### 1. Create list with abstracts

In [3]:
# Person(Actor)

person_names = []
person_abstracts = []

length_p = len(data_actor['results']['bindings'])

for i in range(length_p):
    name = data_actor['results']['bindings'][i]['name']['value']
    abstract = data_actor['results']['bindings'][i]['abstract']['value']
    person_names.append(name.split())
    person_abstracts.append(abstract)


# City
city_name = []
city_abstract = []

length_c = len(data_city['results']['bindings'])

for j in range(length_c):
    name = data_city['results']['bindings'][j]['name']['value']
    abstract =  data_city['results']['bindings'][j]['abstract']['value']
    city_name.append(name)
    city_abstract.append(abstract)
    
    
# CelestialBody
cb_name = []
cb_abstract = []

length_cb = len(data_celestialbody['results']['bindings'])

for k in range(length_cb):
    name = data_celestialbody['results']['bindings'][k]['name']['value']
    abstract =  data_celestialbody['results']['bindings'][k]['abstract']['value']
    cb_name.append(name)
    cb_abstract.append(abstract)
    
    
# EducationalInstitution
ei_name = []
ei_abstract = []

length_ei = len(data_education['results']['bindings'])

for l in range(length_ei):
    name = data_education['results']['bindings'][l]['name']['value']
    abstract =  data_education['results']['bindings'][l]['abstract']['value']
    ei_name.append(name)
    ei_abstract.append(abstract)
    
    
# Lake
lake_name = []
lake_abstract = []

length_lake = len(data_lake['results']['bindings'])

for m in range(length_lake):
    name = data_lake['results']['bindings'][m]['name']['value']
    abstract =  data_lake['results']['bindings'][m]['abstract']['value']
    lake_name.append(name)
    lake_abstract.append(abstract)

### 2. combine all lists

In [4]:
# Only need abstracts.
all_docs = person_abstracts + city_abstract + cb_abstract + ei_abstract + lake_abstract

In [5]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/eva/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/eva/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

## Preprocessing:

In [7]:
stopword = set(stopwords.words('english'))
punctuation = set(string.punctuation)
lemma = WordNetLemmatizer()

# Add all names as well as 'also' to stopwords, as they do not characterize a topic.
names = person_names + city_name + cb_name + ei_name + lake_name
all_names = [val for sublist in names for val in sublist]

stopword.add('also')
for name in all_names:
    stopword.add(name.lower())
    stopword.add('lee')
    

In [8]:
def clean(doc):
    punc_free = ''.join(ch for ch in doc.lower() if ch not in punctuation)
    stop_free = " ".join([word for word in punc_free.split() if word not in stopword])  
    normalized = " ".join(lemma.lemmatize(wordb) for wordb in stop_free.split())  
    return normalized

        
doc_clean = [clean(doc).split() for doc in all_docs]     

## Training:

In [73]:
# Importing Gensim
import gensim
from gensim import corpora

# Create the term dictionary of abstracts, every unique term is assigned to an index. 
dictionary = corpora.Dictionary(doc_clean)

# Convert list of abstracts into Document Term Matrix -> every word -> tupel (wird id, word frequency)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]  )

In [77]:
# Create LDA model
Lda = gensim.models.ldamodel.LdaModel

# Train the LDA model on tmatrix
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word = dictionary, passes=50)

## Results:

In [78]:
# print results
print(ldamodel.print_topics(num_topics=5, num_words=10))

[(0, '0.018*"province" + 0.013*"county" + 0.013*"norway" + 0.010*"lough" + 0.010*"district" + 0.009*"region" + 0.008*"city" + 0.007*"quechua" + 0.007*"qucha" + 0.007*"peru"'), (1, '0.013*"asteroid" + 0.009*"year" + 0.009*"galaxy" + 0.008*"constellation" + 0.007*"approximately" + 0.007*"discovered" + 0.007*"magnitude" + 0.006*"surface" + 0.005*"system" + 0.005*"named"'), (2, '0.016*"chinese" + 0.012*"film" + 0.009*"actor" + 0.009*"actress" + 0.008*"known" + 0.008*"name" + 0.006*"china" + 0.006*"role" + 0.006*"best" + 0.005*"japanese"'), (3, '0.058*"school" + 0.018*"high" + 0.017*"university" + 0.014*"college" + 0.012*"student" + 0.009*"located" + 0.008*"public" + 0.007*"education" + 0.006*"state" + 0.006*"year"'), (4, '0.015*"river" + 0.015*"located" + 0.015*"area" + 0.013*"reservoir" + 0.013*"water" + 0.011*"county" + 0.011*"city" + 0.009*"state" + 0.008*"km" + 0.008*"dam"')]
