In [1]:
import sys
sys.path.append('../')
import wrangle

import nlp

In [2]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [3]:
# Never truncate columns when a DataFrame is rendered in the notebook.
pd.options.display.max_columns = None

# wrangle.wrangle_data() returns a pair, unpacked here as `df` and `data_dict`.
# NOTE(review): neither name is referenced again in the visible part of this
# notebook -- confirm they are needed further down.
df, data_dict = wrangle.wrangle_data()

In [4]:
# Read the topics file; later cells use its persona_id column and the
# *_text / *_topic_id columns. index_col=False stops pandas from promoting
# the first column to the index.
topics_path = 'topics.csv'
big_df = pd.read_csv(topics_path, index_col=False)

In [5]:
# Remove the stray 'Unnamed: 0' column (presumably the index saved when
# topics.csv was written -- confirm). Rebinding instead of mutating in place,
# together with errors='ignore', keeps this cell idempotent: re-running it
# after the column is already gone no longer raises KeyError.
big_df = big_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Keep only the rows whose persona_id equals 2.
# NOTE(review): presumably persona 2 is the "Moderately Experienced" group
# named in the heading below -- confirm against the persona definitions.
specs = big_df.loc[big_df['persona_id'] == 2]

In [7]:
# Dimensions of the persona subset as (rows, columns).
specs.shape

(123, 31)

# Moderately Experienced
## What topics would they be most attracted to at a conference about research?

q21 Ideal Topics


In [8]:
# Extra stop words for the ideal-topics question, handed to
# nlp.set_stop_words in the next cell.
stop_words = [
    'like', 'plus', 'real', 'love', 'big',
    'avoiding', 'mean', 'content', 'people', 'problem',
    'doing', 'using', 'research', 'work', 'don',
    'make', 'conference', 'good', 'best', 'self',
    'report',
]

In [9]:
# Turn the custom list above into the stop-word collection that the keyword
# cells below pass as `stop_words=`.
# NOTE(review): presumably nlp.set_stop_words merges the list with a base
# English stop-word set -- confirm in the nlp module.
stopWords = nlp.set_stop_words(stop_words)

### keywords

In [10]:
# Keywords for the ideal-topics answers, from single words up to trigrams
# (ngram_range=(1, 3)).
nlp.show_column_keywords(
    specs['ideal_topics_text'],
    ngram_range=(1, 3),
    stop_words=stopWords,
    max_df=.5,
)

['design',
 'innovative',
 'method',
 'nonresearchers',
 'cycle',
 'analysis',
 'session',
 'new',
 'methodology',
 'case',
 'study',
 'new methodology',
 'case study',
 'participant',
 'recruitment',
 'strategy',
 'data',
 'sharing',
 'practice',
 'managing']

In [11]:
# Multi-word keywords only: bigrams and trigrams (ngram_range=(2, 3)).
nlp.show_column_keywords(
    specs['ideal_topics_text'],
    ngram_range=(2, 3),
    stop_words=stopWords,
    max_df=.5,
)

['new methodology',
 'case study',
 'qualitative data',
 'analysis method',
 'special consideration',
 'application method',
 'lesson learned',
 'new method',
 'qual quant',
 'service design',
 'new technique',
 'business case']

### topics

In [13]:
# Number of respondents in this persona assigned to each ideal-topics label.
specs['ideal_topics_topic_id'].value_counts()

case_study                  43
quant/qual/data             18
ops/ai                      17
new_method, mixed_method    16
Name: ideal_topics_topic_id, dtype: int64

## Who would they expect to see at a conference about research?

q22 Ideal Attendees

In [11]:
# Extra stop words for the attendees / recommendations questions. This is the
# ideal-topics list without 'good', 'best', 'self' and 'report'.
stop_words = [
    'like', 'plus', 'real', 'love', 'big',
    'avoiding', 'mean', 'content', 'people', 'problem',
    'doing', 'using', 'research', 'work', 'don',
    'make', 'conference',
]

In [12]:
# Rebuild the stop-word collection from the shorter list above; every later
# cell in this notebook that passes `stop_words=stopWords` uses this version.
# NOTE(review): presumably nlp.set_stop_words merges the list with a base
# English stop-word set -- confirm in the nlp module.
stopWords = nlp.set_stop_words(stop_words)

In [14]:
# Keywords for the ideal-attendees answers, using the helper's default
# n-gram range (no ngram_range is passed here).
nlp.show_column_keywords(
    specs['ideal_attendees_text'],
    stop_words=stopWords,
    max_df=.5,
)

['kate',
 'towsey',
 'indi',
 'young',
 'erika',
 'hall',
 'kate towsey',
 'indi young',
 'erika hall',
 'field',
 'experienced',
 'community',
 'group',
 'government',
 'diversity',
 'really',
 'innovation',
 'researcher',
 'steve',
 'portigal']

In [15]:
# Multi-word keywords only: bigrams and trigrams (ngram_range=(2, 3)).
nlp.show_column_keywords(
    specs['ideal_attendees_text'],
    ngram_range=(2, 3),
    stop_words=stopWords,
    max_df=.5,
)

['kate towsey',
 'indi young',
 'erika hall',
 'steve portigal',
 'sam ladner',
 'google microsoft',
 'industry leader',
 'senior researcher',
 'tech company',
 'company small',
 'academic market',
 'market researcher',
 'academic market researcher',
 'different perspective']

In [16]:
# Number of respondents in this persona assigned to each ideal-attendees label.
specs['ideal_attendees_topic_id'].value_counts()

industry, team, product    54
experience, jared spool    16
sam ladner, erika hall     15
indi young                  9
Name: ideal_attendees_topic_id, dtype: int64

## What advice do they have for the Rosenfeld Media team in pursuing a conference?
q23 Recommendations

In [17]:
# Keywords for the recommendations answers, from single words up to 7-grams
# (ngram_range=(1, 7)); max_df is looser here (.9) than in the cells above (.5).
nlp.show_column_keywords(
    specs['recommendations_text'],
    ngram_range=(1, 7),
    stop_words=stopWords,
    max_df=.9,
)

['host',
 'event',
 'comfortable',
 'fall',
 'attending',
 'bring',
 'everyday',
 'collective',
 'demand',
 'learn',
 've',
 'job',
 'community',
 'attendee',
 'experience',
 'speaker',
 'marginalized',
 'group',
 'previously',
 'focus']

In [18]:
# Multi-word keywords only: 2-grams through 7-grams (ngram_range=(2, 7)).
nlp.show_column_keywords(
    specs['recommendations_text'],
    ngram_range=(2, 7),
    stop_words=stopWords,
    max_df=.9,
)

['attendee experience', 'level experience']

In [19]:
# Number of respondents in this persona assigned to each recommendations label.
specs['recommendations_topic_id'].value_counts()

speaker, industry      52
event, opportunity     19
field, survery          9
good, know              8
group, career, city     6
Name: recommendations_topic_id, dtype: int64

## Top Documents per Topic

In [20]:
# Build the word-count matrix for the recommendations answers
# (1- to 3-grams, max_df=.3); the helper also returns `count_vect`.
doc_term_matrix, count_vect = nlp.create_wordcount_matrix(
    specs['recommendations_text'],
    stop_words=stopWords,
    ngram=(1, 3),
    max_df=.3,
)

# Fit a 4-topic LDA model; random_state is fixed so the fit is reproducible.
# NOTE(review): recommendations_topic_id above shows five labels while this
# model has n_components=4 -- confirm which count is intended.
LDA = LatentDirichletAllocation(random_state=42, n_components=4)

# Last expression of the cell, so the fitted estimator's repr is displayed.
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [None]:
# Topic index -> label for the recommendations question (indices 0-4).
# The spelling 'survery' is kept on purpose: it is the exact label that
# appears in recommendations_topic_id.
# NOTE(review): five labels are listed here while the LDA model in this
# notebook is fitted with n_components=4 -- confirm which is intended.
recommendations_dict = dict(enumerate([
    'speaker, industry',
    'event, opportunity',
    'good, know',
    'field, survery',
    'group, career, city',
]))

In [21]:
# Document-topic weights from the fitted LDA model, one row per answer.
doc_topic_weights = LDA.transform(doc_term_matrix)

# Show the 5 top-ranked answers for each topic.
nlp.find_top_documents_per_topic(doc_topic_weights, specs['recommendations_text'], 5)

Top 5 Documents for Topic 0: 

Document 1
Don’t put similar sessions at the same time

Document 2
Conferences are really hard and most people lose money for the first several years. But can be a nice revenue stream once you get them going. I do think your name is strong enough to get you up the profitability curve more quickly than normal. // Also, and this may be controversial, I'd love to see a conference that weeds out complete newbies and consists only of attendees who are serious practitioners. No idea how to make that work, though.





Document 3
I guess ask yourselves if the world needs another conference, and if so, how you can include marginalized people that wouldn't normally be able to attend. Not everyone in this industry makes a ton of money or has an employer willing to throw down a couple thousand bucks to send them to these things. And not only that, but how to make those people feel welcome.

Document 4
It would be great to have a forum or database of previously asked