### Imports

In [50]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# LemmaTokenizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# Modelling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# classifiers
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Metrics
from sklearn.metrics import classification_report, confusion_matrix


### Import and Prep data

In [3]:
mbti = pd.read_csv('data/train.csv')
mbti.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [4]:
all_mbti = []
for i, row in mbti.iterrows():
    for post in row['posts'].split('|||'):
        all_mbti.append([row['type'], post])
all_mbti = pd.DataFrame(all_mbti, columns=['type', 'post'])

In [5]:
all_mbti.head()

Unnamed: 0,type,post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2,INFJ,enfp and intj moments https://www.youtube.com...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...


Split MBTI type into individual features for Binary Classification

In [6]:
def split_type(row):
    ''' 
       Split MBTI type into individual features for Binary Classification
       eg ENFP = [0 1 0 0]
    '''
    t = row['type']
    I,N,T,J = 0,0,0,0
    
    I = 1 if t[0] =='I' else 0
    
    N = 1 if t[1] == 'N' else 0
    
    T = 1 if t[2] == 'T' else 0
    
    J = 1 if t[3] == 'J' else 0
    
    return pd.Series( {'IE':I, 'NS':N , 'TF': T, 'JP': J }) 

In [7]:
all_mbti = all_mbti.join(all_mbti.apply(lambda row: split_type(row), axis=1))
all_mbti.head(5)

Unnamed: 0,type,post,IE,JP,NS,TF
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,1,1,1,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,1,1,1,0
2,INFJ,enfp and intj moments https://www.youtube.com...,1,1,1,0
3,INFJ,What has been the most life-changing experienc...,1,1,1,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,1,1,1,0


In [8]:
# rearrange columns
all_mbti = all_mbti[['type', 'IE', 'NS', 'TF', 'JP', 'post']]
all_mbti.head(5)

Unnamed: 0,type,IE,NS,TF,JP,post
0,INFJ,1,1,0,1,'http://www.youtube.com/watch?v=qsXHcwe3krw
1,INFJ,1,1,0,1,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2,INFJ,1,1,0,1,enfp and intj moments https://www.youtube.com...
3,INFJ,1,1,0,1,What has been the most life-changing experienc...
4,INFJ,1,1,0,1,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...


Note: create_function to run here 1st

---
### Clean the text

creating the LemmaTokenizer removes the default tokenizer, therefore cleaning of the data has to be done manually

In [11]:
all_mbti['post'].head(10)

0          'http://www.youtube.com/watch?v=qsXHcwe3krw
1    http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2    enfp and intj moments  https://www.youtube.com...
3    What has been the most life-changing experienc...
4    http://www.youtube.com/watch?v=vXZeYwwRDw8   h...
5                 May the PerC Experience immerse you.
6    The last thing my INFJ friend posted on his fa...
7    Hello ENFJ7. Sorry to hear of your distress. I...
8    84389  84390  http://wallpaperpassion.com/uplo...
9                                   Welcome and stuff.
Name: post, dtype: object

In [12]:
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'
all_mbti['post'] = all_mbti['post'].replace(to_replace = pattern_url, value = subs_url, regex = True)

In [15]:
all_mbti['post'] = all_mbti['post'].str.replace('\d+', '') # remove digits

In [17]:
all_mbti['post'] = all_mbti['post'].str.replace(r'(\b\w{1,2}\b)', '') # remove words less than 3 letters

In [19]:
all_mbti['post'] = all_mbti['post'].str.replace('[^\w\s]', '') # remove punctuation 

In [20]:
all_mbti['post'].head(10)

0                                               urlweb
1                                               urlweb
2    enfp and intj moments  urlweb  sportscenter no...
3    What has been the most lifechanging experience...
4             urlweb   urlweb   repeat for most  today
5                  May the PerC Experience immerse you
6    The last thing  INFJ friend posted  his facebo...
7    Hello ENFJ Sorry  hear  your distress  only na...
8                                      urlweb  urlweb 
9                                    Welcome and stuff
Name: post, dtype: object

---
### Custom builds

build my own tokenizer that lemmatizes as well

In [24]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

tf_vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(),
                                strip_accents = 'unicode', # works 
                                stop_words = 'english', # works
                                lowercase = True, # works
                                max_df = 0.5, # works
                                min_df = 10) # works

bespoke stopwords list

In [25]:
# my_stopwords1

# my_stopwords1

### Train, test split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(all_mbti['post'], all_mbti['IE'], test_size=0.3, 
                                                    random_state=42)

### Building Pipelines

##### baseline

In [29]:
baseline = Pipeline([('cnt_vec', CountVectorizer()),
                     ('bayes', MultinomialNB())
                    ])

In [43]:
baseline_lem = Pipeline([('lem_vec', CountVectorizer(tokenizer=LemmaTokenizer())),
                          ('bayes', MultinomialNB())
                        ])

In [57]:
baseline_tfidf = Pipeline([('lem_vec', TfidfVectorizer(tokenizer=LemmaTokenizer())),
                           ('bayes', MultinomialNB())
                         ])

try all the models

### Assessing Builds

In [55]:
import timeit

start = timeit.default_timer()

#Your statements here
base_lem_pred = baseline_lem.predict(X_test)

stop = timeit.default_timer()

print(stop - start) 


28.199765253986698


In [69]:
pipelines = [baseline,baseline_lem, baseline_tfidf]
names_of_pipes = ['baseline', 'baseline_lem', 'baseline_tfidf']

Loop through the Pipelines:

In [70]:
for name, pipe in zip(names_of_pipes, pipelines):
    print(f'For model {name}:')
    
    # time model training
    start = timeit.default_timer()
    pipe.fit(X_train, y_train)
    stop = timeit.default_timer()
    print('The model took {:.3f} seconds to train'.format(stop - start)) 
    
    # score train vs test
    print("\nAccuracy on training set: {:.3f}".format(pipe.score(X_train, y_train)))
    print("Accuracy on test set: {:.3f}".format(pipe.score(X_test, y_test)))
    
    # Assessment
    y_pred = pipe.predict(X_test)
    print('\nConfusion Matrix:')
    print(confusion_matrix(y_pred, y_test))
    print('\nClassification Report:')
    print(classification_report(y_pred, y_test))
    
    print('---------------------------------------------\n')

For model baseline:
The model took 5.248 seconds to train

Accuracy on training set: 0.808
Accuracy on test set: 0.760

Confusion Matrix:
[[ 2050  2636]
 [20123 70156]]

Classification Report:
             precision    recall  f1-score   support

          0       0.09      0.44      0.15      4686
          1       0.96      0.78      0.86     90279

avg / total       0.92      0.76      0.83     94965

---------------------------------------------

For model baseline_lem:
The model took 66.992 seconds to train

Accuracy on training set: 0.806
Accuracy on test set: 0.761

Confusion Matrix:
[[ 2035  2568]
 [20138 70224]]

Classification Report:
             precision    recall  f1-score   support

          0       0.09      0.44      0.15      4603
          1       0.96      0.78      0.86     90362

avg / total       0.92      0.76      0.83     94965

---------------------------------------------

For model baseline_tfidf:
The model took 70.615 seconds to train

Accuracy on trainin

In [51]:
import timeit

# using setitem
t = timeit.Timer("print('main statement')", "print('setup')")

print('TIMEIT:')
print(t.timeit(1))

TIMEIT:
setup
main statement
0.00047029199777171016
