In [1]:
import sys
sys.path.append('../')
import wrangle

import nlp

In [2]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Presumably returns the wrangled survey data plus a data dictionary — confirm in wrangle.py.
# NOTE(review): neither `df` nor `data_dict` is referenced in the cells visible here — confirm they are still needed.
df, data_dict = wrangle.wrangle_data()

In [4]:
# Load the per-response table with topic labels; index_col=False stops pandas
# from using the first CSV column as the index.
big_df = pd.read_csv('topics.csv', index_col=False)

In [5]:
# Drop the 'Unnamed: 0' column (presumably an index that was saved into the CSV).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second execution is a no-op rather than a KeyError.
big_df = big_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Keep only the rows whose persona_id equals 1; the cells visible below analyse this subset.
execs = big_df.loc[big_df['persona_id'].eq(1)]

In [7]:
# (rows, columns) of the persona_id == 1 subset.
execs.shape

(94, 31)

# Moderately Experienced
## What topics would they be most attracted to at a conference about research?

q21 ideal_topics


In [8]:
# Custom stop words for the ideal-topics question; handed to nlp.set_stop_words in the next cell.
stop_words = [
    'like', 'plus', 'real', 'love', 'big',
    'avoiding', 'mean', 'content', 'people', 'problem',
    'doing', 'using', 'research', 'work', 'don',
    'make', 'conference', 'good', 'best', 'self',
    'report',
]

In [9]:
# Build the stop-word collection that the keyword cells below pass as `stop_words`.
# NOTE(review): presumably merges the custom list with a default stop-word set — confirm in nlp.set_stop_words.
stopWords = nlp.set_stop_words(stop_words)

### keywords

In [10]:
# Keywords for the ideal-topics answers; ngram_range=(1,3) allows single words up to three-word phrases.
# NOTE(review): max_df=.5 is presumably a document-frequency cutoff — confirm in nlp.show_column_keywords.
nlp.show_column_keywords(execs.ideal_topics_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))

['value',
 'peer',
 'stakeholder',
 'topic',
 'study',
 'methodology',
 'analysis',
 'academic',
 'example',
 'understand',
 'method',
 'use',
 'mind',
 'think',
 'industry',
 'knowledge',
 'used',
 'way',
 'data',
 'insight']

In [11]:
# Multi-word keywords only: ngram_range=(2,3) covers bigrams and trigrams, not just bigrams.
nlp.show_column_keywords(execs.ideal_topics_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['new method',
 'knowledge management',
 'technology ai',
 'new technique',
 'qual quant',
 'case study',
 'tool new']

### topics

In [12]:
# Number of rows in the persona subset carrying each ideal-topics topic label.
execs.ideal_topics_topic_id.value_counts()

case_study                  36
ops/ai                      20
new_method, mixed_method    20
quant/qual/data             18
Name: ideal_topics_topic_id, dtype: int64

## Who would they expect to see at a conference about research?

q22 ideal_attendees

In [11]:
# Custom stop words for the attendee and recommendation questions.
# NOTE(review): unlike the list defined for the ideal-topics question, this one omits
# 'good', 'best', 'self' and 'report' — confirm the difference is intentional.
stop_words = [
    'like', 'plus', 'real', 'love', 'big',
    'avoiding', 'mean', 'content', 'people', 'problem',
    'doing', 'using', 'research', 'work', 'don',
    'make', 'conference',
]

In [12]:
# Rebuild the stop-word collection from the shorter list above; every later cell visible here uses this version.
stopWords = nlp.set_stop_words(stop_words)

In [13]:
# Keywords for the ideal-attendees answers. No ngram_range is passed, so the helper's default applies;
# the output includes a two-word term ('industry leader'), so that default is not limited to single words.
nlp.show_column_keywords(execs.ideal_attendees_text, max_df=.5, stop_words=stopWords)

['norman',
 'specific',
 'colleague',
 'potential',
 'new',
 'client',
 'marketer',
 'design',
 'mix',
 'perspective',
 'network',
 'event',
 'speaker',
 'organization',
 'expert',
 'industry',
 'leader',
 'industry leader',
 'researcher',
 'want']

In [14]:
# Multi-word keywords only: ngram_range=(2,3) covers bigrams and trigrams.
nlp.show_column_keywords(execs.ideal_attendees_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['industry leader',
 'working company',
 'indi young',
 'way working',
 'steve portigal',
 'really cool',
 'leader field',
 'qual quant',
 'thought leader']

In [15]:
# Number of rows in the persona subset carrying each ideal-attendees topic label.
execs.ideal_attendees_topic_id.value_counts()

industry, team, product    62
indi young                 16
experience, jared spool     9
sam ladner, erika hall      7
Name: ideal_attendees_topic_id, dtype: int64

## What advice do they have for the Rosenfeld Media team in pursuing a conference?
q23 recommendations

In [16]:
# Keywords for the recommendation answers; ngram_range=(1,7) allows single words up to seven-word phrases.
# Note the looser max_df=.9 here, versus .5 for the two earlier questions.
nlp.show_column_keywords(execs.recommendations_text, max_df=.9, stop_words=stopWords, ngram_range=(1,7))

['london',
 'worked',
 'really',
 'way',
 'say',
 'dont',
 'learn',
 'speaking',
 'audience',
 'great',
 'author',
 'start',
 'attendee',
 'world',
 'style',
 'hard',
 'mix',
 'speaker',
 'time',
 'place']

In [17]:
# Multi-word keywords only: ngram_range=(2,7) covers two- to seven-word phrases.
nlp.show_column_keywords(execs.recommendations_text, max_df=.9, stop_words=stopWords, ngram_range=(2,7))

['afford attend',
 'healthy food',
 'time break',
 'limit speaker',
 'small focused']

In [18]:
# Number of rows in the persona subset carrying each recommendations topic label.
execs.recommendations_topic_id.value_counts()

speaker, industry      46
event, opportunity     17
good, know             12
group, career, city    11
field, survery          8
Name: recommendations_topic_id, dtype: int64

## Top Documents per Topic

In [19]:
# Build the word-count matrix for the recommendation answers (ngram=(1,3), custom stop words).
# `count_vect` is presumably the fitted vectorizer — confirm in nlp.create_wordcount_matrix.
# NOTE(review): max_df=.3 here differs from the max_df=.9 used in the recommendation keyword cells — confirm this is intended.
doc_term_matrix, count_vect = nlp.create_wordcount_matrix(execs.recommendations_text, max_df=.3, ngram=(1,3), stop_words=stopWords)

# LDA topic model with a fixed random_state so the fit is repeatable.
# NOTE(review): n_components=4, but recommendations_topic_id shows five distinct labels and
# recommendations_dict has five entries (keys 0-4) — confirm the intended number of topics.
LDA = LatentDirichletAllocation(n_components=4, random_state=42)

LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [20]:
# Label for each recommendation topic id; a label's position in the list is its id (0-4).
# The spelling 'field, survery' matches the label shown in the recommendations_topic_id counts, so it is kept as-is.
recommendations_dict = dict(enumerate([
    'speaker, industry',
    'event, opportunity',
    'good, know',
    'field, survery',
    'group, career, city',
]))

In [21]:
# LDA.transform gives each answer's topic distribution; the helper prints the top 5 answers per topic
# (presumably ranked by that topic's weight — confirm in nlp.find_top_documents_per_topic).
nlp.find_top_documents_per_topic(LDA.transform(doc_term_matrix), execs.recommendations_text, 5)

Top 5 Documents for Topic 0: 

Document 1
Please, do it in Europe!
And adopt the EuroIA format: workshops in the morning and talks in the afternoon. Vertical days is also a good idea, in case the event is over 2 or 3 days.
Perhaps highlight sessions according to experience level - to avoid very basic concepts if you're experienced already.

Document 2
Please make sure you have experts in the field participate. As a senior practitioner, I find it very frustrating to attend a research track at a conference in which only the very basics of research are discussed. I don't think one wants the academic rigor and offputting nature of CHI, but it would be nice if we could avoid talks/discussions/workshops led by people who have read a few research books, had one job, and consider themselves experts. Research is a skill, and it would be great if we treated and celebrated it as such rather than as common sense work.

Document 3
Mix of outlook to new / future methods, e.g. in the form of lightnin