In [1]:
import nltk

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re

## Let's get the data and clean it up a bit

In [12]:
all_mbti = pd.read_csv('data/Essay_data.csv')

# List of mbti types 
type_labels = ['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 
               'ISTP', 'ISFP', 'INFP', 'INTP', 
               'ESTP', 'ESFP', 'ENFP', 'ENTP', 
               'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ']

In [13]:
all_mbti.head()

Unnamed: 0,I/E,N/S,T/F,J/P,Essay
0,I,S,T,J,My first 4 months at the EDSA have been filled...
1,I,N,F,J,I joined the academy being at a crossroads of ...
2,E,N,F,J,so far my experience has been positive and i c...
3,I,N,F,J,I have been very fortunate to have the opportu...
4,I,N,T,J,Looking back to when one got to the academy an...


## Removing Noise

In [8]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
all_mbti['post'] = all_mbti['post'].replace(to_replace = pattern_url, value = subs_url, regex = True)

### Remove punctuation

In [14]:
# first we make everything lower case to remove some noise from capitalisation
all_mbti['Essay'] = all_mbti['Essay'].str.lower()

In [11]:
import string
# these are the chars that count as punctuation. Let's remove the punctuation
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [12]:
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

In [13]:
all_mbti['post'] = all_mbti['post'].apply(remove_punctuation)

In [19]:
all_mbti.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67 entries, 0 to 66
Data columns (total 5 columns):
I/E      67 non-null object
N/S      67 non-null object
T/F      67 non-null object
J/P      67 non-null object
Essay    64 non-null object
dtypes: object(5)
memory usage: 2.7+ KB


## [Tokenising](http://www.nltk.org/howto/tokenize.html) 

A tokenizer divides text into a sequence of tokens, which roughly correspond to "words". (see the [Stanford Tokeniser](https://nlp.stanford.edu/software/tokenizer.shtml))  We will use tokenisers to clean up the data, making it ready for analysis.

In [9]:
#nltk.download('punkt')
from nltk.tokenize import word_tokenize, TreebankWordTokenizer

In [26]:
all_mbti['Essay'] = all_mbti['Essay'].astype(str)

In [27]:
# we will use the TreeBankWordTokenizer since it is MUCH quicker than the word_tokenise function
tokeniser = TreebankWordTokenizer()
all_mbti['tokens'] = all_mbti['Essay'].apply(tokeniser.tokenize)

## [Lemmatization](https://pythonprogramming.net/lemmatizing-nltk-tutorial/)

In [19]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/wahe3bru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
cat
cactus
goose
rock
python
good
best
run
run


In [20]:
def mbti_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]    

In [None]:
# lemmatize all words in dataframe
all_mbti['lemma'] = all_mbti['tokens'].apply(mbti_lemma, args=(lemmatizer, ))

In [None]:
for i, t in enumerate(all_mbti.iloc[268702]['tokens']):    
    print ('{:20s} --> {:10s}'.format(t, all_mbti.iloc[268702]['lemma'][i]))

## [Stop Words](http://johnlaudun.org/20130126-nltk-stopwords/)

Stop words are words which do not contain important significance to be used in Search Queries. Usually these words are filtered out from search queries because they return a vast amount of unnecessary information.  See this [blog post](http://xpo6.com/list-of-english-stop-words/) for more information.

In [28]:
from nltk.corpus import stopwords

In [29]:
sorted(stopwords.words('english'))[0:10]

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']

In [30]:
def remove_stop_words(tokens):    
    return [t for t in tokens if t not in stopwords.words('english')]

Let's leave the stop words in for now so that we can test the following **Hypothesis**:
* Introverts tend to use the word **`I`** more than extroverts
* Conversely, Extroverts tend to favour the word **`you`**

In case you want to run the analysis again without stop words! Be warned, this can take long with the pandas apply function

In [31]:
all_mbti['tokens'] = all_mbti['tokens'].apply(remove_stop_words)

In [48]:
all_mbti.drop('Essay', axis=1, inplace=True)

In [50]:
all_mbti.head()

Unnamed: 0,I/E,N/S,T/F,J/P,tokens
0,I,S,T,J,"[first, 4, months, edsa, filled, many, new, ex..."
1,I,N,F,J,"[joined, academy, crossroads, sorts, life., ac..."
2,E,N,F,J,"[far, experience, positive, definitely, see, v..."
3,I,N,F,J,"[fortunate, opportunity, join, academy, year.,..."
4,I,N,T,J,"[looking, back, one, got, academy, right, ,, c..."


In [35]:
all_mbti['type'] = all_mbti['I/E'] + all_mbti['N/S'] + all_mbti['T/F'] + all_mbti['J/P']

In [106]:
I = all_mbti[all_mbti['I/E']=='I']['tokens']

In [107]:
I.head()

0    [first, 4, months, edsa, filled, many, new, ex...
1    [joined, academy, crossroads, sorts, life., ac...
3    [fortunate, opportunity, join, academy, year.,...
4    [looking, back, one, got, academy, right, ,, c...
5    [overall, experience, academy, far, great, som...
Name: tokens, dtype: object

In [67]:
E = all_mbti[all_mbti['I/E']=='E']['tokens']

In [135]:
E.head()

2     [far, experience, positive, definitely, see, v...
13    [journey, exciting., stress, constructive, sen...
16    [coming, academy, best, thing, ever, happened,...
19    [experience, far, great, though, first, bit, a...
25    [experience, explore, data, science, academy, ...
Name: tokens, dtype: object

In [68]:
N = all_mbti[all_mbti['N/S']=='N']['tokens']

In [69]:
S = all_mbti[all_mbti['N/S']=='S']['tokens']

In [70]:
T = all_mbti[all_mbti['T/F']=='T']['tokens']

In [206]:
T.head()

0     [first, 4, months, edsa, filled, many, new, ex...
4     [looking, back, one, got, academy, right, ,, c...
7     [attending, explore, data, science, academy, a...
8     [wonderful, place, ,, place, waking, morning, ...
10    [essay, discussing, personal, experience, rega...
Name: tokens, dtype: object

In [77]:
F = all_mbti[all_mbti['T/F']=='F']['tokens']

In [166]:
J = all_mbti[all_mbti['J/P']=='J']['tokens']

In [89]:
P = all_mbti[all_mbti['J/P']=='P']['tokens']

In [80]:
P.head()

5     [overall, experience, academy, far, great, som...
6     [got, academy, ,, felt, like, n't, belong, her...
8     [wonderful, place, ,, place, waking, morning, ...
9     [start, academy, quite, tricking, due, meeting...
12    [experience, academy, enjoyable, far., learnin...
Name: tokens, dtype: object

### [Bag of words](https://www.packtpub.com/mapt/book/application_development/9781849513609/7/ch07lvl1sec73/bag-of-words-feature-extraction)

Text feature extraction is the process of transforming what is essentially a list of words into a feature set that is usable by a classifier. The NLTK classifiers expect dict style feature sets, so we must therefore transform our text into a dict. The Bag of Words model is the simplest method; it constructs a word presence feature set from all the words of an instance.

In [32]:
def bag_of_words_count(words, word_dict={}):
    """ this function takes in a list of words and returns a dictionary 
        with each word as a key, and the value represents the number of 
        times that word appeared"""
    for word in words:
        if word in word_dict.keys():
            word_dict[word] += 1
        else:
            word_dict[word] = 1
    return word_dict

In [217]:
JtXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [218]:
ItXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [219]:
EtXt

"  far experience positive definitely see value course. experience working different people valuable me. worked well attitude everyone team deliver take responsibility. difficult step back trust team work , however cases team members exceeded expectations. felt bit frustrated team members applied taken substandard lazy approach problems. like get stuck something get immediate results improve there. think approach worked well however also learnt allot team members structured task based approach. allot time spent tangents add value projects however tangents either paid results learnings took them. decision making approach generate ideas create structured work flow around ideas distribute work among individuals take interested work flow. conflicting ideas typically took democratic approach combined healthy debate. find build relationships people 'm teams with. get know best. strongest connections team members also people academy learn from. took team bit data collecting tangent last sprin

In [220]:
NtXt

"  joined academy crossroads sorts life. academy offered opportunity pivot career engineering data science unfairly dismissed struggling find employment result. meeting 100+ ( faculty students ) new people would part life whole year something get used , even though worked multinational companies. ability deal people different backgrounds , beliefs , etc. tested group work. needless say lead tension one 's life team mates pulling weight. team dynamics always learning process/curve , thankfully bad experience till now. needless say , people try cheat system , act dishonestly sometimes plain lazy. many opinionated thus rendered incapable making great contribution. way things always consider everyone 's well-being , opinions contributions. somehow persons think ok cynical unhelpful. end day , supposed learn become data scientists. work , improve daily. .  far experience positive definitely see value course. experience working different people valuable me. worked well attitude everyone team

In [221]:
StXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [226]:
TtXt

"  first 4 months edsa filled many new experiences ; challenges , others joyful , resulted positive growth person. -- -meeting new people -- - enjoy meeting new people everyone met edsa pleasant. particularly enjoyed listening backgrounds strongest candidates within edsa , provided insight gained experience tools like python & sql. also cherish able build network data scientists tomorrow well established industry professionals.. -- -working together lab -- - working computer lab formal work environment comes pros & cons. 's useful 99 colleagues , supervisors & facilitators provide technical/personal support net necessary. obvious con would noise distraction - least say know trending music albums 2018 , win ! -- -working teams & team dynamics -- - 've worked team projects solo projects professional work environment. enjoy both. experience would say team works best 's team co-ordinator plan work divided conquered ( provided everybody takes full responsibility part ) members first group n

In [215]:
FtXt

"  joined academy crossroads sorts life. academy offered opportunity pivot career engineering data science unfairly dismissed struggling find employment result. meeting 100+ ( faculty students ) new people would part life whole year something get used , even though worked multinational companies. ability deal people different backgrounds , beliefs , etc. tested group work. needless say lead tension one 's life team mates pulling weight. team dynamics always learning process/curve , thankfully bad experience till now. needless say , people try cheat system , act dishonestly sometimes plain lazy. many opinionated thus rendered incapable making great contribution. way things always consider everyone 's well-being , opinions contributions. somehow persons think ok cynical unhelpful. end day , supposed learn become data scientists. work , improve daily. .  far experience positive definitely see value course. experience working different people valuable me. worked well attitude everyone team

In [214]:
PtXt

"  overall experience academy far great somewhat challenging. firstly team leader elected team ; tasks disturbed among members. disagreement team meetings. achieved aim , members satisfied. team achieved aim could done time. allocation work workloads decided team leader discussing team. arguments workload. team leader assumed everyone level. questions answered answered satisfy members. individual members needed extra research questions answered properly. team able work together , reliable tried level best punctual meetings. allocated different tasks person researched thoroughly co-ordinated findings together team asked support needed. personally helped individual confident , however bit strain keep mates vast experience programming statistics sometimes felt incompetent tasks given , helped gain knowledge pushed learn harder. role played team members team player , several suggestions made everyone team adopted. team selective adopting new ideas. team worked well , agreements reached alm

In [223]:
def all_the_words(words):
    global TtXt
    txt=" "
    for word in words:
        a = "".join(word)
        txt += ' '+a
    TtXt += txt
    return txt

In [224]:
Etxt = T.apply(all_the_words)

In [368]:
len(TtXt)

51460

In [230]:
all_mbti.head()

Unnamed: 0,I/E,N/S,T/F,J/P,tokens
0,I,S,T,J,"[first, 4, months, edsa, filled, many, new, ex..."
1,I,N,F,J,"[joined, academy, crossroads, sorts, life., ac..."
2,E,N,F,J,"[far, experience, positive, definitely, see, v..."
3,I,N,F,J,"[fortunate, opportunity, join, academy, year.,..."
4,I,N,T,J,"[looking, back, one, got, academy, right, ,, c..."


In [231]:
all_mbti['type']= all_mbti['I/E']+all_mbti['N/S']+all_mbti['T/F']+all_mbti['J/P']

In [232]:
all_mbti.head()

Unnamed: 0,I/E,N/S,T/F,J/P,tokens,type
0,I,S,T,J,"[first, 4, months, edsa, filled, many, new, ex...",ISTJ
1,I,N,F,J,"[joined, academy, crossroads, sorts, life., ac...",INFJ
2,E,N,F,J,"[far, experience, positive, definitely, see, v...",ENFJ
3,I,N,F,J,"[fortunate, opportunity, join, academy, year.,...",INFJ
4,I,N,T,J,"[looking, back, one, got, academy, right, ,, c...",INTJ


In [374]:
all_mbti['type'].value_counts()

INTJ    12
INTP    11
INFJ     8
ENFP     4
ENFJ     4
ISFP     4
ISTJ     4
ESFJ     3
INFP     3
ISTP     2
ENTP     2
ESFP     2
ESTJ     2
ISFJ     2
E???     1
ESTP     1
I?T?     1
ENTJ     1
Name: type, dtype: int64

---
MBTI | amount | %EDSApop | %GENpop
--- | --- | --- | ---
INTJ   | 12 |17.9| 2.1  |
INTP   | 11 |16.4| 3.3  |
INFJ   |  8 |11.9| 1.5  |
ENFP   |  4 |6| 8.1  |
ENFJ   |  4 |6| 2.5  |
ISFP   |  4 |6| 1.5  |
ISTJ   |  4 |6| 11.6 |
ESFJ   |  3 |4.5| 12.3 |
INFP   |  3 |4.5| 4.4  |
ISTP   |  2 |3| 5.4  |
ENTP   |  2 |3| 3.2  |
ESFP   |  2 |3| 8.5  |
ESTJ   |  2 |3| 8.7  |
ISFJ   |  2 |3| 13.8 |
E???   |  1 |1.5|
ESTP   |  1 |1.5| 4.3  |
I?T?   |  1 |1.5|
ENTJ   |  1 |1.5| 1.8  |

In [233]:
# here we create a set of dictionaries
# one for each of the MBTI types
personality = {}
for pp in type_labels:
    df = all_mbti.groupby('type')
    personality[pp] = {}
    for row in df.get_group(pp)['tokens']:
        personality[pp] = bag_of_words_count(row, personality[pp])   

In [234]:
personality.keys()

dict_keys(['ISTJ', 'ISFJ', 'INFJ', 'INTJ', 'ISTP', 'ISFP', 'INFP', 'INTP', 'ESTP', 'ESFP', 'ENFP', 'ENTP', 'ESTJ', 'ESFJ', 'ENFJ', 'ENTJ'])

In [282]:
ENTJwords = sorted(personality['ENTJ'],key=personality['ENTJ'].get, reverse=True)

In [361]:
len(ENTJwords)

233

In [318]:
my_stopwords2 = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along',
                 'already', 'also', 'although', 'always', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 
                 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 
                 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
                 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 
                 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 
                 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 
                 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 
                 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 
                 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'hence', 'here', 
                 'hereafter', 'hereby', 'herein', 'hereupon', 'how', 'however', 
                 'hundred', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep', 
                 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'meanwhile', 'might', 
                 'mill', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'name', 'namely', 
                 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'nothing', 
                 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 
                 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same',
                 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'should', 'show', 'side', 'since', 
                 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 
                 'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'then', 'thence', 
                 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 
                 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 
                 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'well', 
                 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 
                 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 
                 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves']



In [362]:
I_set = set(INTPwords + ISTJwords + INFJwords + INTJwords + ISTPwords + ISFPwords + ISFJwords + INFPwords)
E_set = set(ENFPwords + ESFJwords + ESFPwords + ESTPwords + ENTPwords + ESTJwords + ENTJwords + ENFJwords) 
my_stopwords2;

In [344]:
set(INFPwords[:150]).difference(all_set) 


{'appeared',
 'belong',
 'blessing',
 'caught',
 'classmates',
 'currently',
 'curse.',
 'doing.',
 'feeling.',
 'losing',
 'minded',
 'pack.'}

In [None]:
# INTPwords
INTPwords, ISTJwords, ISFJwords, ENTJwords, ESFJwords, ESTPwords, ENFPwords

---
Introvert vs Extrovert

In [363]:
print(f'Number of words used by both: {len(I_set.intersection(E_set))}')

Number of words used by both: 868


In [364]:
print(f'Extrovert unique words: {len(E_set.difference(I_set))}')

Extrovert unique words: 632


In [365]:
print(f'Introvert unique words: {len(I_set.difference(E_set))}')

Introvert unique words: 1612


In [366]:
print(f'Total number of words: {len(I_set.union(E_set))}')

Total number of words: 3112
