In [16]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_pickle('data/sent2.pkl')

In [49]:
def mergep(text):
    analysts = ['INTJ','INTP','ENTJ','ENTP']
    diplomats = ['INFJ','INFP','ENFJ','ENFP']
    sentinels = ['ISTJ','ISFJ','ESTJ','ESFJ']
    explorers = ['ISTP','ISFP','ESTP','ESFP']
    if text in analysts:
        return 'Analyst'
    elif text in diplomats:
        return 'Diplomat'
    elif text in sentinels:
        return 'Sentinel'
    elif text in explorers:
        return 'Explorer'
    else:
        return None
    
def cats4(text):
    EJ = ['ENTJ','ENFJ','ESFJ','ESTJ']
    IJ = ['INTJ','INFJ','ISTJ','ISFJ']
    EP = ['ENTP','ENFP','ESTP','ESFP']
    IP = ['INTP','INFP','ISTP','ISFP']
    if text in EJ:
        return 'EJ'
    elif text in IJ:
        return 'IJ'
    elif text in EP:
        return 'EP'
    elif text in IP:
        return 'IP'
    else:
        return None
    
def cat2(text):
    if text == 'Analyst' or text == 'Explorer':
        return "AnEx"
    elif text == 'Diplomat' or text == 'Sentinel':
        return "DiSe"

In [4]:
df['cat'] = df.type.apply(lambda x: mergep(x))

In [7]:
df.cat.value_counts()

Diplomat    4167
Analyst     3311
Explorer     745
Sentinel     452
Name: cat, dtype: int64

In [47]:
df['cat4'] = df.type.apply(lambda x: cats4(x))

In [48]:
df.cat4.value_counts()

IP    3744
IJ    2932
EP    1497
EJ     502
Name: cat4, dtype: int64

In [50]:
df['cat2'] = df.cat.apply(lambda x: cat2(x))

In [51]:
df.cat2.value_counts()

DiSe    4619
AnEx    4056
Name: cat2, dtype: int64

In [8]:
X = df.posts
y = df.cat

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 29)

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))



CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.15 µs
accuracy 0.7548981943910872
              precision    recall  f1-score   support

     Analyst       0.75      0.82      0.78      1013
    Diplomat       0.76      0.89      0.82      1243
    Explorer       0.82      0.08      0.15       217
    Sentinel       1.00      0.05      0.09       130

   micro avg       0.75      0.75      0.75      2603
   macro avg       0.83      0.46      0.46      2603
weighted avg       0.77      0.75      0.71      2603

[[ 831  182    0    0]
 [ 129 1110    4    0]
 [  91  108   18    0]
 [  58   66    0    6]]


In [13]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', GradientBoostingClassifier(n_estimators=100, learning_rate=.01,
                                 max_depth=3, random_state=29)),
               ])

gbc.fit(X_train,y_train)

%time

y_pred = gbc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.2 µs
accuracy 0.7429888590088359
              precision    recall  f1-score   support

     Analyst       0.80      0.69      0.74      1013
    Diplomat       0.71      0.91      0.80      1243
    Explorer       0.82      0.35      0.49       217
    Sentinel       0.68      0.25      0.36       130

   micro avg       0.74      0.74      0.74      2603
   macro avg       0.75      0.55      0.60      2603
weighted avg       0.75      0.74      0.73      2603



In [35]:
from sklearn.preprocessing import LabelEncoder

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.preprocessing import text
from tensorflow.keras import utils

max_words = 2000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(X_train)

x_train = tokenize.texts_to_matrix(X_train)
x_test = tokenize.texts_to_matrix(X_test)

encoder = LabelEncoder()
encoder.fit(y_train)
Y_train = encoder.transform(y_train)
Y_test = encoder.transform(y_test)

num_classes = np.max(Y_train) + 1
Y_train = utils.to_categorical(Y_train, num_classes)
Y_test = utils.to_categorical(Y_test, num_classes)

In [43]:
batch_size = 500
epochs = 8

# Build the model
model = Sequential()
model.add(Dense(500, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              
model.fit(x_train, Y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_split=0.1)

Train on 5464 samples, validate on 608 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x1a3dd837b8>

In [44]:
y_pred = model.predict_classes(x_test)
y_test2 =[]
for i in range(len(Y_test)):
    y_test2.append(Y_test[i].argmax())

In [45]:
print('accuracy %s' % accuracy_score(y_pred, y_test2))
print(classification_report(y_test2, y_pred))
print(confusion_matrix(y_test2,y_pred))

accuracy 0.708029197080292
              precision    recall  f1-score   support

           0       0.71      0.74      0.72      1013
           1       0.72      0.84      0.78      1243
           2       0.54      0.21      0.30       217
           3       0.47      0.06      0.11       130

   micro avg       0.71      0.71      0.71      2603
   macro avg       0.61      0.46      0.48      2603
weighted avg       0.69      0.71      0.68      2603

[[ 747  249   14    3]
 [ 179 1042   19    3]
 [  72   96   46    3]
 [  58   58    6    8]]


In [52]:
X = df.posts
y = df.cat2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 29)

In [53]:
sgd.fit(X_train, y_train)

%time

y_pred = sgd.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))



CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs
accuracy 0.8321167883211679
              precision    recall  f1-score   support

        AnEx       0.82      0.83      0.82      1230
        DiSe       0.84      0.84      0.84      1373

   micro avg       0.83      0.83      0.83      2603
   macro avg       0.83      0.83      0.83      2603
weighted avg       0.83      0.83      0.83      2603

[[1015  215]
 [ 222 1151]]


In [54]:
gbc.fit(X_train,y_train)

%time

y_pred = gbc.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.91 µs
accuracy 0.7737226277372263
              precision    recall  f1-score   support

        AnEx       0.78      0.73      0.75      1230
        DiSe       0.77      0.81      0.79      1373

   micro avg       0.77      0.77      0.77      2603
   macro avg       0.77      0.77      0.77      2603
weighted avg       0.77      0.77      0.77      2603

[[ 899  331]
 [ 258 1115]]


In [55]:
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(X_train)

x_train = tokenize.texts_to_matrix(X_train)
x_test = tokenize.texts_to_matrix(X_test)

encoder = LabelEncoder()
encoder.fit(y_train)
Y_train = encoder.transform(y_train)
Y_test = encoder.transform(y_test)

num_classes = np.max(Y_train) + 1
Y_train = utils.to_categorical(Y_train, num_classes)
Y_test = utils.to_categorical(Y_test, num_classes)

In [56]:
batch_size = 500
epochs = 8

# Build the model
model = Sequential()
model.add(Dense(500, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              
model.fit(x_train, Y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_split=0.1)

Train on 5464 samples, validate on 608 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x1a21a750f0>

In [57]:
y_pred = model.predict_classes(x_test)
y_test2 =[]
for i in range(len(Y_test)):
    y_test2.append(Y_test[i].argmax())

In [58]:
print('accuracy %s' % accuracy_score(y_pred, y_test2))
print(classification_report(y_test2, y_pred))
print(confusion_matrix(y_test2,y_pred))

accuracy 0.7591240875912408
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      1230
           1       0.77      0.77      0.77      1373

   micro avg       0.76      0.76      0.76      2603
   macro avg       0.76      0.76      0.76      2603
weighted avg       0.76      0.76      0.76      2603

[[ 914  316]
 [ 311 1062]]
