In [1]:
import sys
sys.path.append('../')
import wrangle

import nlp

In [2]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [3]:
# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

# wrangle_data() hands back two objects, bound here as df and data_dict
# (presumably the survey data and its data dictionary — confirm in wrangle.py).
df, data_dict = wrangle.wrangle_data()

In [4]:
# Read topics.csv into big_df. index_col=False tells pandas not to use the
# first CSV column as the index, so it arrives as a regular column
# (the 'Unnamed: 0' column dropped in the next cell).
big_df = pd.read_csv('topics.csv', index_col=False)

In [5]:
# Remove the 'Unnamed: 0' column that comes in with the CSV (looks like an
# index column written out when the file was saved — confirm).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
big_df = big_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Keep only the rows with persona_id == 2; every cell below works on this
# subset (presumably the "Specialists" persona named in the heading — confirm).
specs = big_df.loc[big_df['persona_id'] == 2]

In [7]:
# (rows, columns) of the persona_id == 2 subset.
specs.shape

(123, 31)

# Specialists

## Sentiment

In [23]:
# Average of nlp.find_polarity over every big_answer value in the subset.
specs['big_answer'].apply(nlp.find_polarity).mean()

0.22268643436292426

In [24]:
# Average of nlp.find_subjectivity over every big_answer value in the subset.
specs['big_answer'].apply(nlp.find_subjectivity).mean()

0.4324475374544341

## Total Keywords

In [26]:
# `stopWords` is only assigned further down (in the q21 section), so on a fresh
# Restart & Run All this cell raised NameError. Build it here from the same
# custom terms so the notebook runs top to bottom.
stop_words = ['like', 'plus', 'real', 'love', 'big', 'avoiding', 'mean', 'content', 'people', 'problem',
              'doing', 'using', 'research', 'work', 'don', 'make', 'conference', 'yes', 've', 'ha']
stopWords = nlp.set_stop_words(stop_words)

# Keywords over the big_answer column, using 1- to 3-word n-grams.
nlp.show_column_keywords(specs.big_answer, max_df=.8, stop_words=stopWords, ngram_range=(1,3))

['sort',
 'psychology',
 'professional',
 'depending',
 'connection',
 'method',
 'statistic',
 'training',
 'undergraduate',
 'graduate',
 'cost',
 'timing',
 'away',
 'time',
 'workshop',
 'small',
 'intimate',
 'international',
 'attend',
 'chooses']

## What topics would they be most attracted to at a conference about research?

q21 ideal_topics

In [22]:
# Custom terms to exclude from keyword extraction. Several entries ('don',
# 've', 'ha') look like fragments left over from tokenising contractions or
# lemmatising — TODO confirm against the nlp module.
stop_words = [
    'like', 'plus', 'real', 'love', 'big',
    'avoiding', 'mean', 'content', 'people', 'problem',
    'doing', 'using', 'research', 'work', 'don',
    'make', 'conference', 'yes', 've', 'ha',
]

# Pass the custom terms to the project helper; its return value is what the
# keyword cells below supply as `stop_words=`.
# (Presumably merges them with a base stop-word list — confirm in nlp.py.)
stopWords = nlp.set_stop_words(stop_words)

### keywords

In [10]:
# q21 keywords: ngram_range=(1, 3) covers single words up to three-word phrases.
nlp.show_column_keywords(
    specs['ideal_topics_text'],
    max_df=.5,
    stop_words=stopWords,
    ngram_range=(1, 3),
)

['design',
 'innovative',
 'method',
 'nonresearchers',
 'cycle',
 'analysis',
 'session',
 'new',
 'methodology',
 'case',
 'study',
 'new methodology',
 'case study',
 'participant',
 'recruitment',
 'strategy',
 'data',
 'sharing',
 'practice',
 'managing']

In [11]:
# q21 phrases only: ngram_range=(2, 3) covers two- and three-word n-grams,
# not just bigrams.
nlp.show_column_keywords(
    specs['ideal_topics_text'],
    max_df=.5,
    stop_words=stopWords,
    ngram_range=(2, 3),
)

['new methodology',
 'case study',
 'qualitative data',
 'analysis method',
 'special consideration',
 'application method',
 'lesson learned',
 'new method',
 'qual quant',
 'service design',
 'new technique',
 'business case']

### topics

In [12]:
# Number of rows in the subset per ideal_topics_topic_id label.
specs['ideal_topics_topic_id'].value_counts()

case_study                  58
quant/qual/data             23
new_method, mixed_method    22
ops/ai                      20
Name: ideal_topics_topic_id, dtype: int64

## Who would they expect to see at a conference about research?

q22 Ideal Attendees

In [11]:
# Custom terms to exclude from keyword extraction for the cells that follow.
# NOTE(review): unlike the q21 list earlier in the notebook, this one omits
# 'yes', 've' and 'ha', and 've' then shows up in the q23 keyword output —
# confirm whether dropping them here is intentional.
stop_words = [
    'like', 'plus', 'real', 'love', 'big',
    'avoiding', 'mean', 'content', 'people', 'problem',
    'doing', 'using', 'research', 'work', 'don',
    'make', 'conference',
]

In [12]:
# Rebuild stopWords from the list redefined in the previous cell; every
# keyword cell from here on uses this version.
stopWords = nlp.set_stop_words(stop_words)

In [13]:
# q22 keywords. No ngram_range is passed, so the helper's default applies
# (the output below includes two-word terms as well as single words).
nlp.show_column_keywords(
    specs['ideal_attendees_text'],
    max_df=.5,
    stop_words=stopWords,
)

['mix',
 'academic',
 'professional',
 'company',
 'researcher',
 'public',
 'policy',
 'leading',
 'known',
 'skill',
 'sam',
 'ladner',
 'sam ladner',
 'walk',
 'just',
 'world',
 'world just',
 'industry',
 'google',
 'apple']

In [14]:
# q22 phrases only: ngram_range=(2, 3) covers two- and three-word n-grams.
nlp.show_column_keywords(
    specs['ideal_attendees_text'],
    max_df=.5,
    stop_words=stopWords,
    ngram_range=(2, 3),
)

['sam ladner',
 'world just',
 'different field',
 'steve portigal',
 'working different',
 'want hear',
 'indi young',
 'dana chisnell',
 'jared spool',
 'tech company',
 'case study',
 'different background',
 'erika hall',
 'thought leader',
 'jan chipchase']

In [16]:
# Number of rows in the subset per ideal_attendees_topic_id label.
specs['ideal_attendees_topic_id'].value_counts()

industry, team, product    54
experience, jared spool    16
sam ladner, erika hall     15
indi young                  9
Name: ideal_attendees_topic_id, dtype: int64

## What advice do they have for the Rosenfeld Media team in pursuing a conference?
q23 recommendations

In [15]:
# q23 keywords: ngram_range=(1, 7) covers single words up to seven-word phrases.
nlp.show_column_keywords(
    specs['recommendations_text'],
    max_df=.9,
    stop_words=stopWords,
    ngram_range=(1, 7),
)

['dont',
 'create',
 'schedule',
 'event',
 'draw',
 'crowd',
 'looking',
 'feel',
 'biggest',
 'area',
 'unconference',
 'series',
 'unconferences',
 'workshop',
 'similar',
 'thing',
 've',
 'really',
 'cool',
 'design']

In [16]:
# q23 phrases only: ngram_range=(2, 7) covers two- to seven-word n-grams.
nlp.show_column_keywords(
    specs['recommendations_text'],
    max_df=.9,
    stop_words=stopWords,
    ngram_range=(2, 7),
)

['north america',
 'america europe',
 'north america europe',
 'seriously think',
 'ux researcher',
 'advice talk',
 'networking event',
 'speed dating',
 'code conduct']

In [17]:
# Number of rows in the subset per recommendations_topic_id label.
specs['recommendations_topic_id'].value_counts()

speaker, industry      69
event, opportunity     16
group, career, city    15
good, know             14
field, survery          9
Name: recommendations_topic_id, dtype: int64

## Top Documents per Topic

In [18]:
# Build the word-count matrix for the q23 recommendation texts with 1- to
# 3-word n-grams and max_df=.3. The helper returns the matrix plus a second
# object bound to count_vect (presumably the fitted vectorizer — confirm).
doc_term_matrix, count_vect = nlp.create_wordcount_matrix(
    specs['recommendations_text'],
    max_df=.3,
    ngram=(1, 3),
    stop_words=stopWords,
)

# Four-component LDA; random_state is fixed so repeated runs give the same fit.
# NOTE(review): recommendations_dict further down lists five labels (0-4) while
# this model has four components — confirm which count is intended.
LDA = LatentDirichletAllocation(n_components=4, random_state=42)

# Kept as the last expression: fit() returns the estimator, whose repr is the
# cell output.
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [19]:
# Labels for the recommendation topics, keyed by topic index 0-4.
# The strings are kept exactly as spelled (including 'survery') because they
# match the values shown in the recommendations_topic_id counts above.
recommendations_dict = dict(enumerate([
    'speaker, industry',
    'event, opportunity',
    'good, know',
    'field, survery',
    'group, career, city',
]))

In [20]:
# Show the top 5 recommendation texts for each topic, based on the
# document-topic weights that LDA.transform produces for the count matrix.
nlp.find_top_documents_per_topic(
    LDA.transform(doc_term_matrix),
    specs['recommendations_text'],
    5,
)

Top 5 Documents for Topic 0: 

Document 1
I love the idea of a 2 day conference. The first day being full day workshop with a micro focus. And day 2 full of talks in a traditional conference setting.

Document 2
I'd look for diversity in speakers and a code of conduct. I know it's not something I look for when shopping, but it is still important. 

Document 3
Always-available coffee is a MUST! Listening to even the most entertaining talks can get tiring when back to back. 

I hope you do put on an amazing conference, given all of the great feedback I'm sure you're getting. I also hope I can afford to come ;) Happy planning! 

Document 4
As above. Don't just give the usual motley Crew another platform to perform. Go and find new things and people to excite the industry. I've been to events recently where there were people there from sexy brands like Tesla who had really nothing to say. If only we could clone Steve P!

Document 5
I don't know anything about Rosenfeld Media, but---

I wou