# Task 2: LDA Topic Model
***

We build a simple, out-of-the-box Latent Dirichlet Allocation (LDA) topic model and visualize it with pyLDAvis.

In [1]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

import pandas as pd
import numpy as np

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

  regargs, varargs, varkwargs, defaults = inspect.getargspec(func)


In [7]:
# Read the cleaned dataset and discard its column labelled '0' in a single
# expression, so re-running the cell always starts from the file on disk.
df = (
    pd.read_csv('cleaned_jels.csv', encoding='utf-8-sig', low_memory=False)
    .drop('0', axis=1)
)
df

Unnamed: 0,Title,Abstract,Keywords,Jel Codes,Author1,Author2,Author3,Author4,Author5,Author6,...,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57
0,,,"[ , corporate social responsibility, , corpor...",,Дохолян Самвел Бахшиевич,,,,,,...,,,,,,,,,,
1,,,"[ , , , , financial stability, a balanced m...",,Могзоев Александр Мушкудаевич,Шадрина Ирина Николаевна,,,,,...,,,,,,,,,,
2,,,"[ , , , small business, state support, support]",,Руденко Людмила Геннадьевна,,,,,,...,,,,,,,,,,
3,,,"[ , , , , , , , , ]",,Суптело Наталья Петровна,Михайлов Павел Сергеевич,,,,,...,,,,,,,,,,
4,,,"[ , , , , , problems, statistical methodol...",,Тебекин Алексей Васильевич,Тебекин Павел Алексеевич,Кузнецова Галина Васильевна,,,,...,,,,,,,,,,
5,,,"[ , , , , , , capital repairs, an apartme...",,Клюев Виктор Дмитриевич,Зайцев Дмитрий Анатольевич,Евсикова Юлия Владимировна,,,,...,,,,,,,,,,
6,,35,"[ , , , , , communications, intraorganizat...",,Алексеев Александр Николаевич,,,,,,...,,,,,,,,,,
7,,,"[ , , urban agglomeration, municipal union]",,Богославец Даниил Михайлович,,,,,,...,,,,,,,,,,
8,,,"[ , , , , city, transport infrastructure, t...",,Зубец Антон Желькович,,,,,,...,,,,,,,,,,
9,,,"[ , , , , , , , , , authorized economi...",,Зыков Андрей Алексеевич,Котов Алексей Юрьевич,,,,,...,,,,,,,,,,


>We lemmatize every word to reduce the dimensionality of the vocabulary, e.g. jogging, jogged, jogs -> jog.

In [8]:
# Single WordNet lemmatizer instance created at module level; the analyzer
# built by Lemmatizer_CountVectorizer looks it up by this global name.
lemmatizer = WordNetLemmatizer()

def get_wordnet_tag(pos_tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    Only the first letter of the tag is inspected, so e.g. 'NN', 'NNS' and
    'NNP' all map to the noun constant.

    Parameters
    ----------
    pos_tag : str
        Tag string as found in the second element of the (token, tag) pairs
        returned by ``nltk.pos_tag``. (The parameter name shadows the
        imported ``pos_tag`` function inside this body only; it is kept
        unchanged so existing callers are unaffected.)

    Returns
    -------
    ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
    for tags starting with 'J', 'V', 'N' or 'R' respectively; ``None`` for
    every other tag, which callers use as the signal to drop the token.
    """
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    if pos_tag.startswith('V'):
        return wordnet.VERB
    if pos_tag.startswith('N'):
        return wordnet.NOUN
    if pos_tag.startswith('R'):
        return wordnet.ADV
    # Deliberately no 'S' branch: 's' is WordNet's own code for satellite
    # adjectives, not something the tagger emits. The only Penn Treebank tag
    # starting with 'S' is 'SYM' (symbol), which must be discarded rather
    # than lemmatized as an adjective.
    return None


class Lemmatizer_CountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer POS-tags and lemmatizes its tokens."""

    def build_analyzer(self):
        """Return a callable mapping a document to a list of lemmas.

        The parent analyzer produces the tokens; they are POS-tagged with
        ``pos_tag`` and each token whose tag maps to a WordNet POS (via
        ``get_wordnet_tag``) is lemmatized with the module-level
        ``lemmatizer``. Tokens whose tag maps to ``None`` are dropped.
        """
        base_analyzer = super(Lemmatizer_CountVectorizer, self).build_analyzer()

        def lemmatizing_analyzer(doc):
            lemmas = []
            for token, tagger_tag in pos_tag(base_analyzer(doc)):
                wordnet_pos = get_wordnet_tag(tagger_tag)
                if wordnet_pos is not None:
                    lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
            return lemmas

        return lemmatizing_analyzer

In [9]:
# max_df=.5 ignores terms that occur in more than half of the documents and
# min_df=10 ignores terms that occur in fewer than 10 documents.
count_vectorizer = Lemmatizer_CountVectorizer(stop_words='english', lowercase=True, max_df=.5, min_df=10)

# One document per row: title and abstract joined by a space. Missing values
# are replaced with '' first, because str + NaN evaluates to NaN; that would
# discard the text of the other field and make the vectorizer reject the
# document. astype(str) guards against non-string cells in these columns.
documents = df['Title'].fillna('').astype(str) + ' ' + df['Abstract'].fillna('').astype(str)
dtm_cv = count_vectorizer.fit_transform(documents)

>We fit a Latent Dirichlet Allocation model with 20 topics to the document-term matrix.

In [10]:
N_TOPICS = 20

# The number of topics is passed positionally on purpose: its keyword was
# renamed from `n_topics` to `n_components` in scikit-learn 0.19 and the old
# name was removed in 0.21, while it has remained the first constructor
# parameter throughout. The positional form therefore runs on both old and
# current releases. random_state=0 keeps the fit reproducible.
lda_cv = LatentDirichletAllocation(N_TOPICS, random_state=0)
lda_cv.fit(dtm_cv)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_jobs=1, n_topics=20, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [11]:
# Prepare the pyLDAvis visualization from the fitted LDA model, the
# document-term matrix and the vectorizer. As the cell's last expression the
# result is displayed inline (pyLDAvis.enable_notebook() was called at import).
pyLDAvis.sklearn.prepare(lda_cv, dtm_cv, count_vectorizer)