# In [2]: (Jupyter notebook cell marker)
import sys
import numpy as np
import lda
import json
import pandas as pd
from collections import Counter, OrderedDict
import nltk
from nltk.corpus import stopwords
from itertools import *

from sklearn.feature_extraction.text import CountVectorizer

class topic_analysis(object):

	"""
	Extract LDA topics from a text column and tag each verbatim with one topic.

	Input: dataframe, column_name with cleaned verbatims
	Output (stored on the instance after ``get_results``):
		- ``dataframe``: the input frame with ``Topic_id`` and ``Score`` columns added
		- ``topics``: list of ``(topic_id, top_words, top_weights)`` tuples
	"""

	# Class-level defaults, kept so the attributes exist before any fit.
	dataframe = None   # DataFrame passed to the last get_results() call
	vocab = None       # vocabulary of the fitted CountVectorizer
	model = None       # LDA model fitted by the last get_results() call
	vectorizer = None  # fitted CountVectorizer
	topics = []        # rebound per instance in __init__ / get_results

	def __init__(self):
		# Give every instance its own list so that the mutable class-level
		# default is never shared (and mutated) across instances.
		self.topics = []

	def compute_lda_model(self, learning_set_lda, n_topics=10, n_iter=1500,
	                      random_state=1, max_features=5000):
		"""
		Vectorize the documents and fit an LDA model on the word counts.

		Stores the fitted vectorizer in ``self.vectorizer`` and its vocabulary
		in ``self.vocab``.

		Args:
			learning_set_lda: iterable of documents handed to
				``CountVectorizer.fit_transform``.
			n_topics: number of topics passed to ``lda.LDA``.
			n_iter: number of iterations passed to ``lda.LDA``.
			random_state: seed passed to ``lda.LDA``.
			max_features: vocabulary size cap passed to ``CountVectorizer``.

		Returns:
			The fitted ``lda.LDA`` model.
		"""
		vectorizer = CountVectorizer(analyzer="word",
		                             tokenizer=None,
		                             preprocessor=None,
		                             stop_words=None,
		                             max_features=max_features)

		counts = vectorizer.fit_transform(learning_set_lda).toarray()
		self.vectorizer = vectorizer

		# get_feature_names() was removed in scikit-learn 1.2; prefer the
		# replacement and fall back only on releases that predate it.
		if hasattr(vectorizer, "get_feature_names_out"):
			self.vocab = vectorizer.get_feature_names_out().tolist()
		else:
			self.vocab = vectorizer.get_feature_names()

		model = lda.LDA(n_topics, n_iter=n_iter, random_state=random_state)
		model.fit(counts)

		return model

	def get_results(self, dataframe, column_name, n_topics, n_top_words=50):
		"""
		Fit the LDA model on ``dataframe[column_name]`` and store the results.

		Side effects:
			- ``dataframe`` is modified in place: a ``Topic_id`` column (index
			  of the largest value in each row of ``model.doc_topic_``) and a
			  ``Score`` column (that largest value) are added.
			- ``self.dataframe``, ``self.model`` and ``self.topics`` are set.
			  ``self.topics`` holds one ``(topic_id, words, weights)`` tuple per
			  row of ``model.topic_word_``; ``words`` and ``weights`` are
			  space-separated strings ordered by decreasing weight.

		Args:
			dataframe: pandas DataFrame holding the documents.
			column_name: name of the column with the cleaned verbatims.
			n_topics: number of topics to fit.
			n_top_words: number of top words kept per topic.
		"""
		self.dataframe = dataframe
		self.topics = []

		# LDA model calibration
		model = self.compute_lda_model(self.dataframe[column_name], n_topics)
		self.model = model

		# Tag every document with its highest-scoring topic.
		doc_topic = np.asarray(model.doc_topic_)
		self.dataframe['Topic_id'] = doc_topic.argmax(axis=1)
		self.dataframe['Score'] = doc_topic.max(axis=1)

		# Collect the top words of every topic.
		vocab = np.array(self.vocab)
		for topic_id, topic_dist in enumerate(model.topic_word_):
			# Sort once, descending, and keep exactly n_top_words entries
			# (the former [:-n:-1] slice dropped one and kept only n - 1).
			order = np.argsort(topic_dist)[::-1][:n_top_words]
			words = ' '.join(vocab[order])
			weights = ' '.join(topic_dist[order].astype('str'))
			self.topics.append((topic_id, words, weights))