# Task 2: LDA Topic Model
***

We create a simple, out-of-the-box Latent Dirichlet Allocation Topic Model with visualization.

In [15]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pandas as pd
import numpy as np

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [16]:
df = pd.read_csv('cleaned_jels.csv', encoding='utf-8-sig', low_memory=False)
df.drop('0', axis=1, inplace=True)
df

Unnamed: 0,Title,Publisher,Abstract,Keywords,Jel Codes,Author1,Author2,Author3,Author4,Author5,...,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80
0,technical efficiency of hungarian farms before...,european association of agricultural economists,hungary is one of the ten countries that have ...,"[agribusiness, production economics]",,"zoltán bakucs, lajos","fertő, imre","fogarasi, józsef",,,...,,,,,,,,,,
1,exploring the flexibility of polish family far...,european association of agricultural economists,no abstract is available for this item,"[farm management, productivity analysis]",,"pieniadz, agata","renner, swetlana","petrick, martin",,,...,,,,,,,,,,
2,regional clusters in a function of rural devel...,european association of agricultural economists,as a theoretical concept rural development bas...,[community rural urban development],,"sudarić, tihana","zmaić, krunoslav","petrač, božidar",,,...,,,,,,,,,,
3,the extended metropolitan area in a new member...,european association of agricultural economists,no abstract is available for this item,"[agricultural and food policy, community rural...",,"zolin, m. bruna",,,,,...,,,,,,,,,,
4,infrastructural capacity of family owned holdi...,european association of agricultural economists,no abstract is available for this item,"[community rural urban development, land econo...",,"Živković, dragić","dimitrijević, bojan","jelić, sreten","rajić, zoran",,...,,,,,,,,,,
5,comparative research of food consumption in se...,european association of agricultural economists,no abstract is available for this item,[food consumption nutrition food safety],,"Мilanović, milan r.","Đorović, milutin","stevanović, simo",,,...,,,,,,,,,,
6,targeting agricultural and rural development m...,european association of agricultural economists,no abstract is available for this item,"[agricultural and food policy, community rural...",,"segré, andrea","rakić, renata","rokvić, gordana","vittuari, matteo",,...,,,,,,,,,,
7,needs assessment analysis of small rural house...,european association of agricultural economists,no abstract is available for this item,[consumer household economics],,"bogdanov, natalija","moslavac, nenad",,,,...,,,,,,,,,,
8,designing a rural development strategy for ser...,european association of agricultural economists,no abstract is available for this item,[community rural urban development],,"cochrane, nancy","reed, michael","jovanović, zlatko",,,...,,,,,,,,,,
9,rural development and the heritage of chayanov...,european association of agricultural economists,no abstract is available for this item,"[community rural urban development, labor and ...",,"nuppenau, ernst-august",,,,,...,,,,,,,,,,


>We need to lemmatize all words to reduce dimensionality: jogging, jogged, jogs -> jog

In [17]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_tag(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    elif pos_tag.startswith('S'):
        return wordnet.ADJ_SAT
    else:
        return None


class Lemmatizer_CountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(Lemmatizer_CountVectorizer, self).build_analyzer()
        return lambda doc:([lemmatizer.lemmatize(word[0], get_wordnet_tag(word[1])) for word in pos_tag(analyzer(doc)) 
                            if get_wordnet_tag(word[1]) is not None])

In [18]:
count_vectorizer = Lemmatizer_CountVectorizer(stop_words='english', lowercase=True, max_df=.5, min_df=10)
dtm_cv = count_vectorizer.fit_transform(df['Abstract'])

>We use a Latent Dirichlet Allocation Model. 

In [19]:
lda_cv = LatentDirichletAllocation(n_topics=20, random_state=0)
lda_cv.fit(dtm_cv)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [21]:
pyLDAvis.sklearn.prepare(lda_cv, dtm_cv, count_vectorizer)