In [1]:
import sys
sys.path.append('../')
import wrangle
import explore
import nlp

In [2]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np

In [3]:
def find_word_counts(input_column, max_df=.3, min_df=2, ngram_range=(1,3), stop_words='english'):
    """Count how often each term occurs across a column of free-text answers.

    Missing values are dropped, then every remaining entry is passed through
    ``nlp.basic_clean`` and ``nlp.lemmatize`` before being vectorized.

    Parameters
    ----------
    input_column : pandas.Series
        Text answers; NaN entries are ignored.
    max_df, min_df, ngram_range, stop_words
        Forwarded unchanged to ``CountVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term with columns ``word_list`` (the term)
        and ``count_list`` (total occurrences over all documents).
    """
    cleaned = input_column.dropna().apply(nlp.basic_clean).apply(nlp.lemmatize)
    cv = CountVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words, ngram_range=ngram_range)
    doc_term = cv.fit_transform(cleaned)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back only on old installations.
    if hasattr(cv, 'get_feature_names_out'):
        word_list = list(cv.get_feature_names_out())
    else:
        word_list = cv.get_feature_names()

    # Sum the sparse matrix directly instead of densifying it with toarray();
    # the column totals are the same but no documents x vocabulary array is built.
    count_list = np.asarray(doc_term.sum(axis=0)).ravel()

    return pd.DataFrame(data={'word_list': word_list, 'count_list': count_list})

In [4]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# wrangle_data() returns two objects that are unpacked here; presumably the
# prepared survey responses and a data dictionary -- confirm in wrangle.py.
df, data_dict = wrangle.wrangle_data()

In [5]:
# Attach the target column(s) to the survey frame.
# NOTE(review): this rebinds `df` from itself, so re-running the cell feeds an
# already-augmented frame back into add_target_to_df -- confirm that is harmless.
df = explore.add_target_to_df(df)

# Pull out `targetb`, move the index into a regular column and discard that
# `resp_id` column, leaving a one-column frame on a fresh 0..n-1 index.
targetB = df.targetb.reset_index().drop(columns='resp_id')

In [6]:
# Read the per-respondent topic assignments and discard the leftover
# 'Unnamed: 0' column in one chained expression.
big_df = pd.read_csv('topics.csv', index_col=False).drop('Unnamed: 0', axis=1)

# The assignment aligns on the index; assumes topics.csv rows are in the same
# order as `df` -- TODO confirm.
big_df['targetB'] = targetB

# NOTE(review): the variable is called `likely`, but the section heading that
# follows reads "Respondents NOT Likely To Go" -- confirm which group
# targetB == 0 encodes.
likely = big_df[big_df['targetB'] == 0]

In [7]:
likely.shape

(354, 32)

In [8]:
# Extra words to exclude from the keyword and count cells below.
stop_words = [
    'like', 'plus', 'real', 'love', 'big', 'avoiding', 'mean',
    'content', 'people', 'problem', 'doing', 'using', 'research',
    'work', 'don', 'make', 'conference',
    'yes', 've', 'ha', '300',
]

# presumably merges these with a base stop-word list -- confirm in nlp.py
stopWords = nlp.set_stop_words(stop_words)

# Respondents NOT Likely To Go

## Sentiment

In [9]:
likely.big_answer.apply(nlp.find_polarity).mean()

0.21702682130075124

In [10]:
likely.big_answer.apply(nlp.find_subjectivity).mean()

0.4313958863792966

## TOTAL KEYWORDS

In [11]:
nlp.show_column_keywords(likely.big_answer, max_df=.8, stop_words=stopWords, ngram_range=(1,3))

['researchi',
 'phd',
 'library',
 'information',
 'science',
 'teaching',
 'involved',
 'ux',
 'architecture',
 'interned',
 'local',
 'firm',
 'doe',
 'additionally',
 'discipline',
 'increasingly',
 'data',
 'getting',
 'familiar',
 'self']

In [34]:
# Count 2- and 3-word phrases (ngram_range=(2,3)) in the combined answers;
# the ascending sort followed by tail(20) displays the 20 most frequent.
big_answer_df = find_word_counts(likely.big_answer, max_df=.2, stop_words=stopWords, ngram_range=(2,3))
big_answer_df.sort_values(by='count_list').tail(20)

Unnamed: 0,word_list,count_list
82,attendee experience timenananana,16
1286,wa great,16
990,service design,17
551,human factor,18
1193,topic covered,18
228,conduct researchmasters,19
1117,taught conduct researchmasters,19
19,500 attendeesmultitrack attendee,23
18,500 attendeesmultitrack,23
155,case study,32


## What does your company do?

q5 primary industry

### keywords

In [23]:
nlp.show_column_keywords(likely.prim_ind_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))

['higher',
 'education',
 'higher education',
 'food',
 'financial',
 'banking',
 'health',
 'healthcare',
 'tech',
 'ux',
 'consulting',
 'ux consulting',
 'technology',
 'consultant',
 'company',
 'industry',
 'design',
 'cpg',
 'independent',
 'freelancer']

In [13]:
# terms: 1- to 3-word n-grams (ngram_range=(1,3)), not single words only;
# ascending sort + tail(10) shows the 10 most frequent
prim_ind_df = find_word_counts(likely.prim_ind_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))
prim_ind_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
39,government,16
35,fintech,18
8,consulting,19
33,financial,19
76,software,21
79,tech,22
16,design,23
80,technology,24
42,healthcare,25
74,service,27


In [22]:
nlp.show_column_keywords(likely.prim_ind_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['higher education',
 'ux consulting',
 'financial service',
 'consulting industry',
 'management consulting',
 'software design',
 'enterprise software',
 'information technology',
 'customer experience',
 'software service',
 'public sector',
 'product design',
 'estate tech',
 'digital product',
 'saas software',
 'fashion retail']

In [14]:
# n-grams: bigrams and trigrams only (ngram_range=(2,3)); 5 most frequent.
# Rebinds prim_ind_df, replacing the 1- to 3-gram counts from the earlier cell.
prim_ind_df = find_word_counts(likely.prim_ind_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))
prim_ind_df.sort_values(by='count_list').tail(5)

Unnamed: 0,word_list,count_list
7,higher education,3
10,product design,3
14,software service,3
3,enterprise software,4
6,financial service,15


In [None]:
likely[likely.prim_ind_text.notnull()].prim_ind_topic_id.value_counts()

## What kind of research do they plan to do in the future?

q7 future_res

### keywords

In [19]:
nlp.show_column_keywords(likely.future_res_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))

['think',
 'focused',
 'social',
 'user',
 'usability',
 'study',
 'ethnographic',
 'customer',
 'interview',
 'user usability',
 'usability study',
 'service',
 'design',
 'thinking',
 'learning',
 'quant',
 'method',
 'qual',
 'analysis',
 'service design']

In [15]:
# terms: 1- to 3-word n-grams (ngram_range=(1,3)), not single words only;
# ascending sort + tail(10) shows the 10 most frequent
future_res_text_df = find_word_counts(likely.future_res_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))
future_res_text_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
225,unmoderated,18
233,usability testing,21
42,diary,21
164,quantitative,22
237,user,22
35,data,25
122,method,26
230,usability,34
192,study,42
203,testing,56


In [18]:
nlp.show_column_keywords(likely.future_res_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['user usability',
 'usability study',
 'service design',
 'quant method',
 'card sorting',
 'unmoderated usability',
 'usability testing',
 'unmoderated usability testing',
 'study survey',
 'unmoderated usability study',
 'data science',
 'looking new',
 'new way',
 'looking new way',
 'testing card',
 'competitive analysis',
 'testing card sorting',
 'field study',
 'diary study',
 'multivariate testing']

In [24]:
# n-grams: bigrams and trigrams only (ngram_range=(2,3)); 5 most frequent.
# Rebinds future_res_text_df, replacing the 1- to 3-gram counts from the earlier cell.
future_res_text_df = find_word_counts(likely.future_res_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))
future_res_text_df.sort_values(by='count_list').tail(5)

Unnamed: 0,word_list,count_list
59,unmoderated usability,7
0,ab testing,9
7,contextual inquiry,9
12,diary study,18
64,usability testing,21


### topics

In [31]:
likely[likely.future_res_text.notnull()].future_res_topic_id.value_counts()

evaluative, quantitative, qualitative    76
focus group                              57
market research                          41
card sort                                36
moderate, unmoderate                     29
misc                                     26
journey mapping                          21
inquiry                                  21
Name: future_res_topic_id, dtype: int64

## What kind of research are you doing?

q6 types_res_used

### keywords

In [17]:
nlp.show_column_keywords(likely.types_res_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))

['conduct',
 'variety',
 'mix',
 'qualitative',
 'quantitative',
 'lot',
 'online',
 'mix qualitative',
 'qualitative quantitative',
 'mix qualitative quantitative',
 'interview',
 'workshop',
 'brainstorming',
 'market',
 'interview workshop',
 'ethnography',
 'mixed',
 'method',
 'qual',
 'quant']

In [25]:
# terms: 1- to 3-word n-grams (ngram_range=(1,3)), not single words only;
# ascending sort + tail(10) shows the 10 most frequent
types_res_df = find_word_counts(likely.types_res_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))
types_res_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
66,contextual,43
450,test,52
408,study,61
336,qualitative,73
548,user,74
529,usability testing,93
424,survey,132
516,usability,156
460,testing,163
200,interview,166


In [16]:
nlp.show_column_keywords(likely.types_res_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['mix qualitative',
 'qualitative quantitative',
 'mix qualitative quantitative',
 'interview workshop',
 'mixed method',
 'qual quant',
 'quant survey',
 'survey focus',
 'focus group',
 'design thinking',
 'service design',
 'design methodology',
 'grounded theory',
 'theory analysis',
 'survey focus group',
 'grounded theory analysis',
 'product usability',
 'usability testing',
 'testing contextual',
 'contextual inquiry']

In [26]:
# n-grams: bigrams and trigrams only (ngram_range=(2,3)); 10 most frequent.
# Rebinds types_res_df, replacing the 1- to 3-gram counts from the earlier cell.
types_res_df = find_word_counts(likely.types_res_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))
types_res_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
278,testing survey,18
129,interview usability,18
125,interview survey,18
74,focus group,19
309,usability test,20
329,user interview,20
17,card sort,23
35,contextual inquiry,26
53,diary study,26
313,usability testing,93


### topics

In [35]:
likely[likely.types_res_text.notnull()].types_res_topic_id.value_counts()

focus group                62
ngram                      60
validation                 49
market                     47
qual/quant                 42
testing                    39
generative/mixed method    26
misc                       23
Name: types_res_topic_id, dtype: int64

## What topics would they be most attracted to at a conference about research?

q21 ideal_topics

In [None]:
# Stop words for the ideal-topics question. Compared with the first list in
# this notebook, this one adds 'good', 'best', 'self', 'report' and no longer
# contains 'yes', 've', 'ha', '300'.
stop_words = [
    'like', 'plus', 'real', 'love', 'big', 'avoiding', 'mean',
    'content', 'people', 'problem', 'doing', 'using', 'research',
    'work', 'don', 'make', 'conference',
    'good', 'best', 'self', 'report',
]

In [None]:
stopWords = nlp.set_stop_words(stop_words)

### keywords

In [61]:
nlp.show_column_keywords(likely.ideal_topics_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))

['design',
 'innovative',
 'method',
 'communicating',
 'working',
 'nonresearchers',
 'cycle',
 'innovative method',
 'method communicating',
 'participatory',
 'analysis',
 'design participatory',
 'session',
 'new',
 'methodology',
 'case',
 'study',
 'new methodology',
 'case study',
 'participant']

In [27]:
# terms: 1- to 3-word n-grams (ngram_range=(1,3)), not single words only;
# ascending sort + tail(10) shows the 10 most frequent
ideal_topics_df = find_word_counts(likely.ideal_topics_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))
ideal_topics_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
378,researcher,25
71,case study,27
503,way,27
488,ux,29
70,case,32
430,study,34
445,team,34
111,data,34
296,new,36
281,method,66


In [62]:
nlp.show_column_keywords(likely.ideal_topics_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['innovative method',
 'method communicating',
 'design participatory',
 'new methodology',
 'case study',
 'participant recruitment',
 'recruitment strategy',
 'participant recruitment strategy',
 'best practice',
 'space solution',
 'theory practice',
 'sell qualitative',
 'qualitative quantitative',
 'qualitative data',
 'want hear',
 'creative approach',
 'conducting analyzing',
 'cutting edge',
 'application method',
 'data science']

In [28]:
# bigrams and trigrams (ngram_range=(2,3) includes 3-word phrases too); 5 most frequent.
# Rebinds ideal_topics_df, replacing the 1- to 3-gram counts from the earlier cell.
ideal_topics_df = find_word_counts(likely.ideal_topics_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))
ideal_topics_df.sort_values(by='count_list').tail(5)

Unnamed: 0,word_list,count_list
79,stakeholder management,4
4,analysis method,4
9,best practice,7
49,new method,11
14,case study,27


### topics

In [38]:
likely[likely.ideal_topics_text.notnull()].ideal_topics_topic_id.value_counts()

quant/qual/data             77
case_study                  73
new_method, mixed_method    71
ops/ai                      63
Name: ideal_topics_topic_id, dtype: int64

## Who would they expect to see at a conference about research?

q22 Ideal Attendees

In [None]:
# Stop words for the ideal-attendees question: only the 17 words shared by the
# earlier lists, without the extra entries either of them appended.
stop_words = [
    'like', 'plus', 'real', 'love', 'big', 'avoiding', 'mean',
    'content', 'people', 'problem', 'doing', 'using', 'research',
    'work', 'don', 'make', 'conference',
]

# presumably merges these with a base stop-word list -- confirm in nlp.py
stopWords = nlp.set_stop_words(stop_words)

In [63]:
nlp.show_column_keywords(likely.ideal_attendees_text, max_df=.5, stop_words=stopWords)

['company',
 'academic',
 'researcher',
 'public',
 'maker',
 'academic researcher',
 'leading',
 'known',
 'skill',
 'set',
 'expertise',
 'good',
 'sam',
 'ladner',
 'sam ladner',
 'open',
 'woman',
 'minority',
 'walk',
 'just']

In [29]:
# terms: 1- to 3-word n-grams (ngram_range=(1,3));
# ascending sort + tail(10) shows the 10 most frequent
ideal_attendees_df = find_word_counts(likely.ideal_attendees_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))
ideal_attendees_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
328,tech,14
254,product,15
3,academic,15
108,field,17
359,want,18
148,industry,24
236,organization,25
50,company,27
184,leader,30
272,researcher,45


In [64]:
nlp.show_column_keywords(likely.ideal_attendees_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))

['academic researcher',
 'sam ladner',
 'different field',
 'civic tech',
 'steve portigal',
 'working company',
 'kim goodwin',
 'laura klein',
 'way working',
 'industry folk',
 'social scientist',
 'industry thought',
 'thought leader',
 'natalie hanson',
 'industry leader',
 'jared spool',
 'spool steve',
 'google facebook',
 'jared spool steve',
 'spool steve portigal']

In [30]:
# bigrams and trigrams (ngram_range=(2,3) includes 3-word phrases too); 5 most frequent.
# Rebinds ideal_attendees_df, replacing the 1- to 3-gram counts from the earlier cell.
ideal_attendees_df = find_word_counts(likely.ideal_attendees_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))
ideal_attendees_df.sort_values(by='count_list').tail(5)

Unnamed: 0,word_list,count_list
47,steve portigal,6
17,industry leader,6
19,jan chipchase,6
39,sam ladner,7
9,erika hall,8


In [41]:
likely[likely.ideal_attendees_text.notnull()].ideal_attendees_topic_id.value_counts()

industry, team, product    76
sam ladner, erika hall     55
experience, jared spool    52
indi young                 52
Name: ideal_attendees_topic_id, dtype: int64

## What advice do they have for the Rosenfeld Media team in pursuing a conference?
q23 recommendations

In [66]:
nlp.show_column_keywords(likely.recommendations_text, max_df=.9, stop_words=stopWords, ngram_range=(1,7))

['dont',
 'create',
 'schedule',
 'relevant',
 'event',
 'looking',
 'forward',
 'looking forward',
 'nice',
 'bring',
 'practitioner',
 'researcher',
 'academic',
 'position',
 'similar',
 'tool',
 'strategy',
 'key',
 'opinion',
 'leader']

In [31]:
# terms: 1- to 3-word n-grams (ngram_range=(1,3)), not single words only;
# ascending sort + tail(10) shows the 10 most frequent
recommendations_df = find_word_counts(likely.recommendations_text, max_df=.5, stop_words=stopWords, ngram_range=(1,3))
recommendations_df.sort_values(by='count_list').tail(10)

Unnamed: 0,word_list,count_list
335,new,21
332,need,23
543,topic,23
177,experience,24
193,focus,26
474,speaker,26
38,attend,26
424,researcher,32
266,just,41
511,talk,42


In [68]:
nlp.show_column_keywords(likely.recommendations_text, max_df=.9, stop_words=stopWords, ngram_range=(2,7))

['looking forward',
 'good luck',
 'event know',
 'ux researcher',
 'advice talk',
 'networking event',
 'introvert extrovert',
 'food good',
 'good mix',
 'mix speaker',
 'way learn',
 'high quality',
 'want attend',
 'healthy food',
 'food option',
 'time break',
 'talk specific',
 'leading team',
 'thought leader',
 'lightning talk']

In [32]:
# n-grams: bigrams and trigrams only (ngram_range=(2,3)); 5 most frequent.
# Rebinds recommendations_df, replacing the 1- to 3-gram counts from the earlier cell.
recommendations_df = find_word_counts(likely.recommendations_text, max_df=.5, stop_words=stopWords, ngram_range=(2,3))
recommendations_df.sort_values(by='count_list').tail(5)

Unnamed: 0,word_list,count_list
49,point view,3
14,consider having,3
33,learn new,3
0,able attend,3
10,case study,5


In [44]:
likely[likely.recommendations_text.notnull()].recommendations_topic_id.value_counts()

speaker, industry      67
group, career, city    57
event, opportunity     52
good, know             45
field, survery         41
Name: recommendations_topic_id, dtype: int64

## Top Documents per Topic

In [45]:
# Build the word-count matrix for the recommendations answers; the helper
# returns two objects, unpacked here as the matrix and the vectorizer.
doc_term_matrix, count_vect = nlp.create_wordcount_matrix(likely.recommendations_text, max_df=.3, ngram=(1,3), stop_words=stopWords)

# NOTE(review): the model is fitted with 4 topics, but recommendations_dict in
# the next cell and the recommendations_topic_id value counts above both list
# five labels -- confirm whether n_components should be 5.
LDA = LatentDirichletAllocation(n_components=4, random_state=42)

LDA.fit(doc_term_matrix)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=4, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [46]:
# Label for each recommendations topic index.
# The 'survery' spelling matches the label shown in the
# recommendations_topic_id value counts, so it is kept as-is here.
# NOTE(review): five labels (0-4) are listed, while the LDA model in the
# previous cell is fitted with n_components=4 -- confirm which is intended.
recommendations_dict = {0 : 'speaker, industry', 
                        1 : 'event, opportunity', 
                        2 : 'good, know', 
                        3 : 'field, survery', 
                        4 : 'group, career, city'}

In [47]:
# Print the top 5 documents for each topic from the fitted model's
# document-topic output for the same matrix it was trained on.
nlp.find_top_documents_per_topic(LDA.transform(doc_term_matrix), likely.recommendations_text, 5)

Top 5 Documents for Topic 0: 

Document 1
I think more events for experienced practitioners would be nice. I’m not sure how’d you’d do that at a conference, but, in general, it’s nice to have space for people who are trying to advance research at their organization. A lot of events seem geared toward getting research off the ground or breaking into the business, which I also love, but sometimes it’s nice to have a conversation that goes beyond the basics. 

Document 2
Short periods of structured interaction are good - NOT "find a buddy and learn about each other" but more like group speed dating: "Create a circle of 5-6 chairs and for 10 minutes your topic is X." 

Provide ways to text Qs to a moderator, who can then ask them of the speaker at the end of the talk - because if I have to ask a Q with a microphone, I will be too distracted by that to concentrate on the talk itself.

Lots of bottled water for breaks - conferences always run out and leave nothing but flavored drinks.




Do