In [1]:
import nltk

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

## Let's get the data and clean it up a bit

In [12]:
all_mbti = pd.read_csv('data/Essay_data.csv')

# List of mbti types 
type_labels = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 
               'ISTP', 'ISFP', 'INFP', 'INTP', 
               'ESTP', 'ESFP', 'ENFP', 'ENTP', 
               'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ']

In [13]:
all_mbti.head()

Unnamed: 0,I/E,N/S,T/F,J/P,Essay
0,I,S,T,J,My first 4 months at the EDSA have been filled...
1,I,N,F,J,I joined the academy being at a crossroads of ...
2,E,N,F,J,so far my experience has been positive and i c...
3,I,N,F,J,I have been very fortunate to have the opportu...
4,I,N,T,J,Looking back to when one got to the academy an...


## Removing Noise

In [8]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
all_mbti['post'] = all_mbti['post'].replace(to_replace = pattern_url, value = subs_url, regex = True)

### Remove punctuation

In [14]:
# first we make everything lower case to remove some noise from capitalisation
all_mbti['Essay'] = all_mbti['Essay'].str.lower()

In [11]:
import string
# these are the chars that count as punctuation. Let's remove the punctuation
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [12]:
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

In [13]:
all_mbti['post'] = all_mbti['post'].apply(remove_punctuation)

In [19]:
all_mbti.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 5 columns):
I/E      67 non-null object
N/S      67 non-null object
T/F      67 non-null object
J/P      67 non-null object
Essay    64 non-null object
dtypes: object(5)
memory usage: 2.7+ KB


## [Tokenising](http://www.nltk.org/howto/tokenize.html) 

A tokenizer divides text into a sequence of tokens, which roughly correspond to "words". (see the [Stanford Tokeniser](https://nlp.stanford.edu/software/tokenizer.shtml))  We will use tokenisers to clean up the data, making it ready for analysis.

In [9]:
#nltk.download('punkt')
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

In [26]:
all_mbti['Essay'] = all_mbti['Essay'].astype(str)

In [27]:
# we will use the TreeBankWordTokenizer since it is MUCH quicker than the word_tokenise function
tokeniser = TreebankWordTokenizer()
all_mbti['tokens'] = all_mbti['Essay'].apply(tokeniser.tokenize)

## [Lemmatization](https://pythonprogramming.net/lemmatizing-nltk-tutorial/)

In [19]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wahe3bru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
cat
cactus
goose
rock
python
good
best
run
run


In [20]:
def mbti_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]    

In [None]:
# lemmatize all words in dataframe
all_mbti['lemma'] = all_mbti['tokens'].apply(mbti_lemma, args=(lemmatizer, ))

In [None]:
for i, t in enumerate(all_mbti.iloc[268702]['tokens']):    
    print ('{:20s} --> {:10s}'.format(t, all_mbti.iloc[268702]['lemma'][i]))

## [Stop Words](http://johnlaudun.org/20130126-nltk-stopwords/)

Stop words are words which do not contain important significance to be used in Search Queries. Usually these words are filtered out from search queries because they return a vast amount of unnecessary information.  See this [blog post](http://xpo6.com/list-of-english-stop-words/) for more information.

In [28]:
from nltk.corpus import stopwords

In [29]:
sorted(stopwords.words('english'))[0:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [30]:
def remove_stop_words(tokens):    
    return [t for t in tokens if t not in stopwords.words('english')]

Let's leave the stop words in for now so that we can test the following **Hypothesis**:
* Introverts tend to use the word **`I`** more than extroverts
* Conversely, Extroverts tend to favour the word **`you`**

In case you want to run the analysis again without stop words! Be warned, this can take long with the pandas apply function

In [31]:
all_mbti['tokens'] = all_mbti['tokens'].apply(remove_stop_words)

In [48]:
all_mbti.drop('Essay', axis=1, inplace=True)

In [50]:
all_mbti.head()

Unnamed: 0,I/E,N/S,T/F,J/P,tokens
0,I,S,T,J,"[first, 4, months, edsa, filled, many, new, ex..."
1,I,N,F,J,"[joined, academy, crossroads, sorts, life., ac..."
2,E,N,F,J,"[far, experience, positive, definitely, see, v..."
3,I,N,F,J,"[fortunate, opportunity, join, academy, year.,..."
4,I,N,T,J,"[looking, back, one, got, academy, right, ,, c..."


In [35]:
all_mbti['type'] = all_mbti['I/E'] + all_mbti['N/S'] + all_mbti['T/F'] + all_mbti['J/P']

In [106]:
I = all_mbti[all_mbti['I/E']=='I']['tokens']

In [107]:
I.head()

0    [first, 4, months, edsa, filled, many, new, ex...
1    [joined, academy, crossroads, sorts, life., ac...
3    [fortunate, opportunity, join, academy, year.,...
4    [looking, back, one, got, academy, right, ,, c...
5    [overall, experience, academy, far, great, som...
Name: tokens, dtype: object

In [67]:
E = all_mbti[all_mbti['I/E']=='E']['tokens']

In [135]:
E.head()

2     [far, experience, positive, definitely, see, v...
13    [journey, exciting., stress, constructive, sen...
16    [coming, academy, best, thing, ever, happened,...
19    [experience, far, great, though, first, bit, a...
25    [experience, explore, data, science, academy, ...
Name: tokens, dtype: object

In [68]:
N = all_mbti[all_mbti['N/S']=='N']['tokens']

In [69]:
S = all_mbti[all_mbti['N/S']=='S']['tokens']

In [70]:
T = all_mbti[all_mbti['T/F']=='T']['tokens']

In [206]:
T.head()

0     [first, 4, months, edsa, filled, many, new, ex...
4     [looking, back, one, got, academy, right, ,, c...
7     [attending, explore, data, science, academy, a...
8     [wonderful, place, ,, place, waking, morning, ...
10    [essay, discussing, personal, experience, rega...
Name: tokens, dtype: object

In [77]:
F = all_mbti[all_mbti['T/F']=='F']['tokens']

In [166]:
J = all_mbti[all_mbti['J/P']=='J']['tokens']

In [89]:
P = all_mbti[all_mbti['J/P']=='P']['tokens']

In [80]:
P.head()

5     [overall, experience, academy, far, great, som...
6     [got, academy, ,, felt, like, n't, belong, her...
8     [wonderful, place, ,, place, waking, morning, ...
9     [start, academy, quite, tricking, due, meeting...
12    [experience, academy, enjoyable, far., learnin...
Name: tokens, dtype: object

### [Bag of words](https://www.packtpub.com/mapt/book/application_development/9781849513609/7/ch07lvl1sec73/bag-of-words-feature-extraction)

Text feature extraction is the process of transforming what is essentially a list of words into a feature set that is usable by a classifier. The NLTK classifiers expect dict style feature sets, so we must therefore transform our text into a dict. The Bag of Words model is the simplest method; it constructs a word presence feature set from all the words of an instance.

In [32]:
def bag_of_words_count(words, word_dict={}):
    """ this function takes in a list of words and returns a dictionary 
        with each word as a key, and the value represents the number of 
        times that word appeared"""
    for word in words:
        if word in word_dict.keys():
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict

In [217]:
JtXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [218]:
ItXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [219]:
EtXt

"  far experience positive definitely see value course. experience working different people valuable me. worked well attitude everyone team deliver take responsibility. difficult step back trust team work , however cases team members exceeded expectations. felt bit frustrated team members applied taken substandard lazy approach problems. like get stuck something get immediate results improve there. think approach worked well however also learnt allot team members structured task based approach. allot time spent tangents add value projects however tangents either paid results learnings took them. decision making approach generate ideas create structured work flow around ideas distribute work among individuals take interested work flow. conflicting ideas typically took democratic approach combined healthy debate. find build relationships people 'm teams with. get know best. strongest connections team members also people academy learn from. took team bit data collecting tangent last sprin

In [220]:
NtXt

"  joined academy crossroads sorts life. academy offered opportunity pivot career engineering data science unfairly dismissed struggling find employment result. meeting 100+ ( faculty students ) new people would part life whole year something get used , even though worked multinational companies. ability deal people different backgrounds , beliefs , etc. tested group work. needless say lead tension one 's life team mates pulling weight. team dynamics always learning process/curve , thankfully bad experience till now. needless say , people try cheat system , act dishonestly sometimes plain lazy. many opinionated thus rendered incapable making great contribution. way things always consider everyone 's well-being , opinions contributions. somehow persons think ok cynical unhelpful. end day , supposed learn become data scientists. work , improve daily. .  far experience positive definitely see value course. experience working different people valuable me. worked well attitude everyone team

In [221]:
StXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [226]:
TtXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [215]:
FtXt

"  joined academy crossroads sorts life. academy offered opportunity pivot career engineering data science unfairly dismissed struggling find employment result. meeting 100+ ( faculty students ) new people would part life whole year something get used , even though worked multinational companies. ability deal people different backgrounds , beliefs , etc. tested group work. needless say lead tension one 's life team mates pulling weight. team dynamics always learning process/curve , thankfully bad experience till now. needless say , people try cheat system , act dishonestly sometimes plain lazy. many opinionated thus rendered incapable making great contribution. way things always consider everyone 's well-being , opinions contributions. somehow persons think ok cynical unhelpful. end day , supposed learn become data scientists. work , improve daily. .  far experience positive definitely see value course. experience working different people valuable me. worked well attitude everyone team

In [214]:
PtXt

"  overall experience academy far great somewhat challenging. firstly team leader elected team ; tasks disturbed among members. disagreement team meetings. achieved aim , members satisfied. team achieved aim could done time. allocation work workloads decided team leader discussing team. arguments workload. team leader assumed everyone level. questions answered answered satisfy members. individual members needed extra research questions answered properly. team able work together , reliable tried level best punctual meetings. allocated different tasks person researched thoroughly co-ordinated findings together team asked support needed. personally helped individual confident , however bit strain keep mates vast experience programming statistics sometimes felt incompetent tasks given , helped gain knowledge pushed learn harder. role played team members team player , several suggestions made everyone team adopted. team selective adopting new ideas. team worked well , agreements reached alm

In [223]:
def all_the_words(words):
    global TtXt
    txt=" "
    for word in words:
        a = "".join(word)
        txt += ' '+a
    TtXt += txt
    return txt

In [224]:
Etxt = T.apply(all_the_words)

In [225]:
print(TtXt)

  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group no

In [161]:
print(Etxt)

  experience edsa one sparked lot introspection. socially , made affirm fact extroverted person cherishes human connection , communication cares 's well-being. regards team-work , made realize quite assertive prefer logical structure comes completing tasks fulfilling set roles. made also realize end taking charge situation group , even n't initially bargained , mainly seems like n't much direction group time seems going waste. decision- making process rational one mostly times based emotion people would choose certain roles based believing good certain thing interest end n't end achieving/delivering much end burning out. 've also realized things n't go according plan comes activities part role , stress feel defeated easily perfectionist side creeps end completing needs done time. future teams wish branch attempt master every different skill best ability instead focusing much individual role , lot potential know capable .


In [57]:
# here we create a set of dictionaries
# one for each of the MBTI types
personality = {}
type_sub = 'E I N S T F J P'.split()
for pp in type_sub:
    df = all_mbti[pp]
    personality[pp] = {}
    for row in df.get_group(pp)['tokens']:
        personality[pp] = bag_of_words_count(row, personality[pp])       

KeyError: 'E'

In [39]:
personality.keys()

dict_keys(['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ'])

In [None]:
personality['ISTJ']

In [42]:
ISTJwords = sorted(personality['ISTJ'],key=personality['ISTJ'].get)

In [43]:
ISTJwords

['4',
 'months',
 'filled',
 ';',
 'challenges',
 'joyful',
 'positive',
 'growth',
 'person.',
 '-meeting',
 'met',
 'pleasant.',
 'listening',
 'backgrounds',
 'strongest',
 'candidates',
 'insight',
 'gained',
 'sql.',
 'cherish',
 'able',
 'build',
 'network',
 'scientists',
 'tomorrow',
 'established',
 'industry',
 'professionals..',
 'computer',
 'formal',
 'environment',
 'comes',
 'pros',
 'cons.',
 'useful',
 '99',
 'colleagues',
 'technical/personal',
 'support',
 'net',
 'obvious',
 'con',
 'noise',
 'distraction',
 'least',
 'trending',
 'music',
 'albums',
 '2018',
 'win',
 '!',
 "'ve",
 'solo',
 'professional',
 'environment.',
 'divided',
 'conquered',
 'everybody',
 'takes',
 'part',
 'noobs',
 'sql',
 'gather',
 'analyse',
 'phases',
 'exposed',
 'deficiencies',
 'frustration',
 'came',
 'delivering',
 'comfort',
 'zone',
 'patience',
 'all.',
 'high',
 'though',
 'complex',
 'comfortable',
 'dynamic.',
 'consistent',
 'deliberation.',
 'easier',
 'person',
 'presente

In [42]:
# next we create a list of all of the unique words...
all_words = set()
for pp in type_labels:
    for word in personality[pp]:
        all_words.add(word)

In [43]:
# so that we can create a dictionary of bag of words for the whole dataset
personality['all'] = {}
for pp in type_labels:    
    for word in all_words:
        if word in personality[pp].keys():
            if word in personality['all']:
                personality['all'][word] += personality[pp][word]
            else:
                personality['all'][word] = personality[pp][word]

There are a lot of words that only appear once! Let's remove them.

In [45]:
# how many words in total?
sum([v for v in personality['all'].values()])

8206375

In [46]:
# how many words appear only once?
len([v for v in personality['all'].values() if v == 1])

80839

In [47]:
# how many words appear more than 100 times?
# how many words of the total does that account for?
print (len([v for v in personality['all'].values() if v >= 100]))
print (sum([v for v in personality['all'].values() if v >= 100]))

4269
7581200


In [48]:
7581200/8206375

0.9238183729113038

Using words that appear more than 100 times seems much more useful!  And this accounts for 92% of all the words!

In [49]:
max_count = 100
word_index = [k for k, v in personality['all'].items() if v > max_count]

In [50]:
# now let's create one big data frame with the word counts by personality profile
hm = []
for p, p_bow in personality.items():
    df_bow = pd.DataFrame([(k, v) for k, v in p_bow.items() if k in word_index], columns=['Word', p])
    df_bow.set_index('Word', inplace=True)
    hm.append(df_bow)

# create one big data frame
df_bow = pd.concat(hm, axis=1)
df_bow.fillna(0, inplace=True)

In [51]:
# what are the top 10 words that appear most often?
df_bow.sort_values(by='all', ascending=False).head(10)

Unnamed: 0,ISTJ,ISFJ,INFJ,INTJ,ISTP,ISFP,INFP,INTP,ESTP,ESFP,ENFP,ENTP,ESTJ,ESFJ,ENFJ,ENTJ,all
i,8172.0,8044.0,67871,43864,13883.0,11148.0,87712,52115,3704.0,1696.0,31198.0,27403,1856.0,2168.0,8687.0,8881.0,378402
the,5143.0,4111.0,39663,30500,8893.0,6131.0,48008,35868,2191.0,937.0,16454.0,18994,1000.0,1200.0,5019.0,6135.0,230247
to,5106.0,4607.0,40231,28753,8725.0,6264.0,48996,33005,2254.0,972.0,16945.0,17852,1078.0,1223.0,5471.0,5889.0,227371
a,4033.0,3333.0,31932,22780,7124.0,4825.0,40376,26693,1868.0,796.0,13847.0,14728,841.0,986.0,3966.0,4748.0,182876
and,3827.0,3571.0,31628,21568,6540.0,5153.0,40710,24881,1905.0,834.0,15002.0,14236,943.0,988.0,4343.0,4564.0,180693
of,2976.0,2475.0,24312,17857,4962.0,3580.0,29576,21372,1300.0,557.0,10217.0,11335,650.0,779.0,3114.0,3499.0,138561
you,2734.0,2186.0,22221,16010,4696.0,3332.0,24971,17197,1396.0,651.0,10329.0,10882,653.0,639.0,3050.0,3815.0,124762
that,2207.0,2033.0,19445,14436,4055.0,2931.0,23445,16396,1063.0,550.0,8591.0,8949,521.0,615.0,2614.0,2907.0,110758
it,2290.0,2046.0,18376,13179,4244.0,2909.0,22537,15708,1065.0,434.0,8028.0,8265,469.0,490.0,2280.0,2604.0,104924
is,2186.0,1879.0,18237,14293,3704.0,2726.0,21068,15889,1121.0,482.0,7769.0,8904,554.0,565.0,2404.0,3000.0,104781


Thats not very helpful at all, is it! Its very difficult to extract insight from this data.  Lets see if we can use the $chi^2$ test to see whether Introverts favour the word **`I`**

In [52]:
intro_types = [p for p in type_labels if p[0] == 'I']

In [53]:
intro_types

['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP']

In [54]:
df_bow['I'] = df_bow[intro_types].sum(axis=1)

In [55]:
# convert to percentages
for col in ['I', 'all']:
    df_bow[col+'_perc'] = df_bow[col] / df_bow[col].sum()

Do you remember the chi2 test from the CINDY framework?  This looks at observed versus expected results and lets us know where the greatest differences from expected are.  The bigger the statistic, the greater the difference from expectation.  The formula is 

$$ùëê‚Ñéùëñ^2 = \sum{\frac{(ùëÇùëèùë†ùëíùëüùë£ùëíùëë ‚àíùëíùë•ùëùùëíùëêùë°ùëíùëë)^2}{ùëíùë•ùëùùëíùëêùë°ùëíùëë}}$$

In [56]:
df_bow['chi2'] = np.power((df_bow['I_perc'] - df_bow['all_perc']), 2) / df_bow['all_perc']

In [57]:
df_bow[['I_perc', 'all_perc', 'chi2']][df_bow['I_perc'] > df_bow['all_perc']].sort_values(by='chi2', ascending=False).head(10)

Unnamed: 0,I_perc,all_perc,chi2
urlweb,0.002971,0.002756,1.7e-05
infp,0.001318,0.001179,1.6e-05
infj,0.001173,0.001075,9e-06
infps,0.000491,0.000435,7e-06
infjs,0.000415,0.000366,7e-06
intp,0.000988,0.000918,5e-06
my,0.012716,0.012501,4e-06
i,0.05035,0.049933,3e-06
intps,0.000366,0.000332,3e-06
in,0.013036,0.012836,3e-06


And there it is! What can we conclude from this:
* I is the 8th most introverted word, by expectation
* Introverts tend to post more urls than extroverted people too! 
* The introverted types are more likely to be written by Introverts, maybe because people post about their own types?

In [58]:
df_bow[['I_perc', 'all_perc', 'chi2']][df_bow['I_perc'] < df_bow['all_perc']].sort_values(by='chi2', ascending=False).head(20)

Unnamed: 0,I_perc,all_perc,chi2
enfp,0.000479,0.000767,0.000108
entp,0.000397,0.000666,0.000108
entps,0.000119,0.000238,6e-05
enfps,0.000137,0.000241,4.5e-05
entj,0.000259,0.00038,3.8e-05
enfj,0.000288,0.000375,2e-05
estp,0.000231,0.000304,1.7e-05
entjs,6.7e-05,0.000111,1.7e-05
d,0.000378,0.000452,1.2e-05
enfjs,7.6e-05,0.000112,1.2e-05


# Now that we have done all of that, lets cheat!

Praise be to Python...

sklearn has a built in text feature extraction module called [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) that will literally do all that work in one line of code!


This function converts a collection of text documents to a matrix of token counts.

This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

In [60]:
vect = CountVectorizer()

In [61]:
vect.fit(all_mbti['post'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Tuning the vectorizer (discussion)

Thus far, we have been using the default parameters of [CountVectorizer](http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html):

In [62]:
### show default parameters for CountVectorizer
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

However, the vectorizer is worth tuning, just like a model is worth tuning! Here are a few parameters that you might want to tune:

- **stop_words:** string {'english'}, list, or None (default)
    - If 'english', a built-in stop word list for English is used.
    - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
    - If None, no stop words will be used.

In [63]:
# remove English stop words
vect = CountVectorizer(stop_words='english')

- **ngram_range:** tuple (min_n, max_n), default=(1, 1)
    - The lower and upper boundary of the range of n-values for different n-grams to be extracted.
    - All values of n such that min_n <= n <= max_n will be used.

In [64]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2))

- **max_df:** float in range [0.0, 1.0] or int, default=1.0
    - When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [65]:
# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)

- **min_df:** float in range [0.0, 1.0] or int, default=1
    - When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. (This value is also called "cut-off" in the literature.)
    - If float, the parameter represents a proportion of documents.
    - If integer, the parameter represents an absolute count.

In [66]:
# only keep terms that appear in at least 2 documents
vect = CountVectorizer(min_df=2)

**Guidelines for tuning CountVectorizer:**

- Use your knowledge of the **problem** and the **text**, and your understanding of the **tuning parameters**, to help you decide what parameters to tune and how to tune them.
- **Experiment**, and let the data tell you the best approach!

In [67]:
betterVect = CountVectorizer(stop_words='english', 
                             min_df=2, 
                             max_df=0.5, 
                             ngram_range=(1, 1))

In [68]:
betterVect.fit(all_mbti['post'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [69]:
len(betterVect.get_feature_names())

50426

## [ngrams](http://www.nltk.org/api/nltk.html?highlight=n%20grams#nltk.util.ngrams)

While individual words do carry meaning, it is often the case that combinations of words change meanings of sentences entirely.  For example, what difference does removing the `not` from this sentence make?

Natural Language processing is **not** easy!

ngrams are a method to extract combinations of words into features for model buildiing.  The `n` in ngrams specifies the number of tokens to include.  For example, a 2-gram returns all the consecutive pairs of words in a sentence

In [70]:
from nltk.util import ngrams

In [71]:
def word_grams(words, min_n=1, max_n=4):
    s = []
    for n in range(min_n, max_n):
        for ngram in ngrams(words, n):
            s.append(' '.join(str(i) for i in ngram))
    return s

In [72]:
print (word_grams('one two three four'.split(' ')))

['one', 'two', 'three', 'four', 'one two', 'two three', 'three four', 'one two three', 'two three four']


In [73]:
[x for x in ngrams(all_mbti.iloc[268702]['tokens'], 2)]

[('i', 'hate'),
 ('hate', 'april'),
 ('april', 'fools'),
 ('fools', 'day'),
 ('day', 'angry'),
 ('angry', 'theres'),
 ('theres', 'a'),
 ('a', 'site'),
 ('site', 'im'),
 ('im', 'regularly'),
 ('regularly', 'on'),
 ('on', 'and'),
 ('and', 'the'),
 ('the', 'admins'),
 ('admins', 'are'),
 ('are', 'screwing'),
 ('screwing', 'everything'),
 ('everything', 'up'),
 ('up', 'today'),
 ('today', 'for'),
 ('for', 'a'),
 ('a', 'laugh'),
 ('laugh', 'but'),
 ('but', 'i'),
 ('i', 'dont'),
 ('dont', 'find'),
 ('find', 'it'),
 ('it', 'funny'),
 ('funny', 'im'),
 ('im', 'actually'),
 ('actually', 'quite'),
 ('quite', 'freaked'),
 ('freaked', 'out'),
 ('out', 'about'),
 ('about', 'it')]

In [74]:
[x for x in ngrams(all_mbti.iloc[268702]['tokens'], 3)]

[('i', 'hate', 'april'),
 ('hate', 'april', 'fools'),
 ('april', 'fools', 'day'),
 ('fools', 'day', 'angry'),
 ('day', 'angry', 'theres'),
 ('angry', 'theres', 'a'),
 ('theres', 'a', 'site'),
 ('a', 'site', 'im'),
 ('site', 'im', 'regularly'),
 ('im', 'regularly', 'on'),
 ('regularly', 'on', 'and'),
 ('on', 'and', 'the'),
 ('and', 'the', 'admins'),
 ('the', 'admins', 'are'),
 ('admins', 'are', 'screwing'),
 ('are', 'screwing', 'everything'),
 ('screwing', 'everything', 'up'),
 ('everything', 'up', 'today'),
 ('up', 'today', 'for'),
 ('today', 'for', 'a'),
 ('for', 'a', 'laugh'),
 ('a', 'laugh', 'but'),
 ('laugh', 'but', 'i'),
 ('but', 'i', 'dont'),
 ('i', 'dont', 'find'),
 ('dont', 'find', 'it'),
 ('find', 'it', 'funny'),
 ('it', 'funny', 'im'),
 ('funny', 'im', 'actually'),
 ('im', 'actually', 'quite'),
 ('actually', 'quite', 'freaked'),
 ('quite', 'freaked', 'out'),
 ('freaked', 'out', 'about'),
 ('out', 'about', 'it')]