# **Data Pipeline**

In [1]:
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import regex as re
import transformers
#import keras
from tensorflow.keras import backend as K
import plotly.express as px


data=pd.read_csv('/kaggle/input/mbti-dataset-project/mbti_1.csv')
data.head()

2022-11-29 03:37:36.262967: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-11-29 03:37:36.263088: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
#Check if TPU is available
use_tpu = True
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.MirroredStrategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  ['10.0.0.2:8470']


2022-11-29 03:37:56.507916: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-11-29 03:37:56.511043: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/conda/lib
2022-11-29 03:37:56.511085: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-29 03:37:56.511110: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (203da84b62b2): /proc/driver/nvidia/version does not exist
2022-11-29 03:37:56.514281: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operation

REPLICAS:  8


In [3]:
px.pie(data,names='type',title='Distribution of personality types',hole=0.3)

**Data is extremely imbalanced.** This might cause overfitting to happen since the total data only amounts to 8675.

In [4]:
data['type'].value_counts()

INFP    1832
INFJ    1470
INTP    1304
INTJ    1091
ENTP     685
ENFP     675
ISTP     337
ISFP     271
ENTJ     231
ISTJ     205
ENFJ     190
ISFJ     166
ESTP      89
ESFP      48
ESFJ      42
ESTJ      39
Name: type, dtype: int64

In [6]:
def clean_text(data):
    data_length=[]
    lemmatizer=WordNetLemmatizer()
    cleaned_text=[]
    for sentence in tqdm(data.posts):
        sentence=sentence.lower()
        
        #removing links from text data
        sentence=re.sub('https?://[^\s<>"]+|www\.[^\s<>"]+',' ',sentence)
    
        #removing other symbols
        sentence=re.sub('[^0-9a-z]',' ',sentence)
    
        
        data_length.append(len(sentence.split()))
        cleaned_text.append(sentence)
    return cleaned_text

In [7]:
data.posts = clean_text(data)
data

  0%|          | 0/8675 [00:00<?, ?it/s]

Unnamed: 0,type,posts
0,INFJ,and intj moments sportscenter not top t...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i ...
3,INTJ,dear intp i enjoyed our conversation the o...
4,ENTJ,you re fired that s another silly misconce...
...,...,...
8670,ISFP,just because i always think of cats as fi d...
8671,ENFP,so if this thread already exists someplace ...
8672,INTP,so many questions when i do these things i ...
8673,INFP,i am very conflicted right now when it comes ...


**Initialize BERT Tokenizer and attention masks******

In [8]:
#Split dataset
from sklearn.model_selection import train_test_split

posts = data['posts'].values
labels =  data['type'].values
train_data, test_data = train_test_split(data, random_state=0, test_size=0.2)

train_size = len(train_data)
test_size = len(test_data)
train_size, test_size

(6940, 1735)

In [9]:
#Initialize Bert tokenizer and masks
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

bert_model_name = 'bert-large-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
MAX_LEN = 512

def tokenize_sentences(sentences, tokenizer, max_seq_len = 1800):
    tokenized_sentences = []

    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(
                            sentence,                  # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = max_seq_len,  # Truncate all sentences.
                    )
        
        tokenized_sentences.append(tokenized_sentence)
        
    return tokenized_sentences

def create_attention_masks(tokenized_and_padded_sentences):
    attention_masks = []

    for sentence in tokenized_and_padded_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)

    return np.asarray(attention_masks)

train_input_ids = tokenize_sentences(train_data['posts'], tokenizer, MAX_LEN)
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
train_attention_masks = create_attention_masks(train_input_ids)

test_input_ids = tokenize_sentences(test_data['posts'], tokenizer, MAX_LEN)
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input_ids)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/6940 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/1735 [00:00<?, ?it/s]

In [14]:
#train_masks,test_masks, _, _ = train_test_split(attention_masks, labels, random_state=0, test_size=0.2)

In [17]:
#Create train and test datasets
BATCH_SIZE=32 
NR_EPOCHS=10
# def create_dataset(data_tuple, epochs=1, batch_size=32, buffer_size=10000, train=True):
#    dataset = tf.data.Dataset.from_tensor_slices(data_tuple)
#    if train:
#        dataset = dataset.shuffle(buffer_size=buffer_size)
#    dataset = dataset.repeat(epochs)
#    dataset = dataset.batch(batch_size)
#    if train:
#        dataset = dataset.prefetch(1)
    
#    return dataset

# train_dataset = create_dataset((train_inputs, train_masks, train_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE)
# test_dataset = create_dataset((test_inputs, test_masks, test_labels), epochs=NR_EPOCHS, batch_size=BATCH_SIZE, train=False)

# BERT Model
* Load the pretrained BERT base-model from Transformers library
- Take the first hidden-state from BERT output (corresponding to CLS token) and feed it into a Dense layer with 16 neurons and softmax activation

In [None]:

#from transformers import TFBertModel

#from tensorflow.keras.layers import Dense, Flatten

#class BertClassifier(tf.keras.Model):    
#        def __init__(self, bert: TFBertModel, num_classes: int):
#            super().__init__()
#            self.bert = bert
#            self.classifier = Dense(16, activation='softmax')

#        @tf.function
#        def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
#            outputs = self.bert(input_ids,
#                                   attention_mask=attention_mask,
#                                   token_type_ids=token_type_ids,
#                                   position_ids=position_ids,
#                                   head_mask=head_mask)
#            cls_output = outputs[1]
#            cls_output = self.classifier(cls_output)

#            return cls_output
        
        
#with strategy.scope():        
#    model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))

In [18]:
#Define f1 functions for evaluation
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

In [20]:
def create_model(): 
    input_word_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32,
                                           name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained('bert-large-uncased')
    bert_outputs = bert_layer(input_word_ids)[0]
    pred = tf.keras.layers.Dense(16, activation='softmax')(bert_outputs[:,0,:])
    
    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.00002), metrics=['accuracy', f1_m, precision_m, recall_m])
    return model

In [21]:
def get_callbacks():
    
    best_model = tf.keras.callbacks.ModelCheckpoint(
        f'best_model.h5',
        verbose=1, 
        monitor='val_loss', 
        mode='min', 
        save_best_only=True, 
        save_weights_only=True
    )
    
    last_model = tf.keras.callbacks.ModelCheckpoint(
        f'last_model.h5',
        verbose=1, 
        save_best_only=False, 
        save_weights_only=True
    )
    
    callbacks = [best_model , last_model]
    
    return callbacks

In [23]:
use_tpu = True
if use_tpu:
    # Create distribution strategy
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model
    with strategy.scope():
        model = create_model()
else:
    model = create_model()
    
model.summary()

2022-11-29 03:48:42.920693: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.0.0.2:8470}
2022-11-29 03:48:42.920834: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:30020}
2022-11-29 03:48:42.926151: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job worker -> {0 -> 10.0.0.2:8470}
2022-11-29 03:48:42.926339: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:301] Initialize GrpcChannelCache for job localhost -> {0 -> localhost:30020}


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 335141888 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 16)                16400     
Total params: 335,158,288
Trainable params: 335,158,288
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.load_weights('../input/social-media-ipd/best_model.h5')

In [25]:
types = np.unique(data.type.values)

def get_type_index(string):
    return list(types).index(string)

In [26]:
train_data['type_index'] = data['type'].apply(get_type_index)
train_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,type,posts,type_index
984,INTP,phrases i never want to hear again a k a if yo...,11
6706,INTP,yeah you say you primarily value people who ...,11
143,ENFP,63915 i got my hair cut d some people say t...,1
4844,INFP,as far as i live in this world i ve never bee...,9
4388,ISFP,meh it s overplayed ya but still its good n...,13
...,...,...,...
4373,INFP,hey it seems like you have a great foundatio...,9
7891,INFJ,dear istj mother when i started my very fi...,8
4859,INTP,oh entjs how can you be scary and exciting a...,11
3264,ENFJ,hi entp and welcome to the forum wink f...,0


In [27]:
one_hot_labels = tf.keras.utils.to_categorical(train_data.type_index.values, num_classes=16)

In [28]:
model.fit(np.array(train_input_ids), one_hot_labels, verbose = 1, epochs = NR_EPOCHS, batch_size = BATCH_SIZE,callbacks=get_callbacks(), validation_split=0.25)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 1.25508, saving model to best_model.h5

Epoch 00001: saving model to last_model.h5
Epoch 2/10

Epoch 00002: val_loss did not improve from 1.25508

Epoch 00002: saving model to last_model.h5
Epoch 3/10

Epoch 00003: val_loss did not improve from 1.25508

Epoch 00003: saving model to last_model.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 1.25508

Epoch 00004: saving model to last_model.h5
Epoch 5/10

Epoch 00005: val_loss did not improve from 1.25508

Epoch 00005: saving model to last_model.h5
Epoch 6/10

Epoch 00006: val_loss did not improve from 1.25508

Epoch 00006: saving model to last_model.h5
Epoch 7/10

Epoch 00007: val_loss did not improve from 1.25508

Epoch 00007: saving model to last_model.h5
Epoch 8/10

Epoch 00008: val_loss did not improve from 1.25508

Epoch 00008: saving model to last_model.h5
Epoch 9/10

Epoch 00009: val_loss did not improve from 1.25508

Epoch 00009: saving model to last_model.h5
Epoch 

<tensorflow.python.keras.callbacks.History at 0x7facaea79c50>

In [29]:
train_data['type'].value_counts()

INFP    1476
INFJ    1175
INTP    1027
INTJ     852
ENTP     547
ENFP     536
ISTP     275
ISFP     224
ENTJ     196
ISTJ     181
ENFJ     147
ISFJ     131
ESTP      73
ESFP      40
ESTJ      31
ESFJ      29
Name: type, dtype: int64

In [30]:
test_data['type'].value_counts()

INFP    356
INFJ    295
INTP    277
INTJ    239
ENFP    139
ENTP    138
ISTP     62
ISFP     47
ENFJ     43
ISFJ     35
ENTJ     35
ISTJ     24
ESTP     16
ESFJ     13
ESFP      8
ESTJ      8
Name: type, dtype: int64

**Run test and evaluate accuracy**

In [31]:
test_data['type_index'] = data['type'].apply(get_type_index)
test_data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,type,posts,type_index
4587,ISFP,dear isfj mother i wish you were less of a w...,13
2786,INFJ,to me i think you guys may be over analyzing...,8
2813,ENFP,nihm while nihm has her intj husband i ve go...,1
3705,INTP,i want 5 kids an astro nuclear theoretical...,11
5957,ISFP,i have the same thing as well i ve noticed t...,13
...,...,...,...
418,INFJ,yes i do it all the time i just throw up my...,8
6712,ISFP,hah yes that would be fantastic d d...,13
6390,ISFP,the fact that she s an intj too pretty much e...,13
4925,INFJ,to be perfectly honest i don t know why you ...,8


In [33]:
test_labels = tf.keras.utils.to_categorical(test_data.type_index.values, num_classes=16)

In [34]:
model.evaluate(np.array(test_input_ids), test_labels)



[2.281209707260132,
 0.6282420754432678,
 0.6299241781234741,
 0.6515151858329773,
 0.6147727370262146]

**The accuracy is not great, this is because the data inside the dataset is very imbalanced, which causes the huge disparation between training score and testing score (overfitting).**

In [35]:
cols = data['type'].unique()
cols = cols.tolist()

colnames = ['sentence']
colnames = colnames+cols


In [36]:

df_predict = pd.DataFrame(columns = colnames)
sentence = ["'I'm finding the lack of me in these posts very alarming.|||Sex can be boring if it's in the same position often. For example me and my girlfriend are currently in an environment where we have to creatively use cowgirl and missionary. There isn't enough...|||Giving new meaning to 'Game' theory.|||Hello *ENTP Grin*  That's all it takes. Than we converse and they do most of the flirting while I acknowledge their presence and return their words with smooth wordplay and more cheeky grins.|||This + Lack of Balance and Hand Eye Coordination.|||Real IQ test I score 127. Internet IQ tests are funny. I score 140s or higher.  Now, like the former responses of this thread I will mention that I don't believe in the IQ test. Before you banish...|||You know you're an ENTP when you vanish from a site for a year and a half, return, and find people are still commenting on your posts and liking your ideas/thoughts. You know you're an ENTP when you...|||http://img188.imageshack.us/img188/6422/6020d1f9da6944a6b71bbe6.jpg|||http://img.adultdvdtalk.com/813a0c6243814cab84c51|||I over think things sometimes. I go by the old Sherlock Holmes quote.  Perhaps, when a man has special knowledge and special powers like my  own, it rather encourages him to seek a complex...|||cheshirewolf.tumblr.com  So is I :D|||400,000+  post|||Not really; I've never thought of E/I or J/P as real functions.  I judge myself on what I use. I use Ne and Ti as my dominates. Fe for emotions and rarely Si. I also use Ni due to me strength...|||You know though. That was ingenious. After saying it I really want to try it and see what happens with me playing a first person shooter in the back while we drive around. I want to see the look on...|||out of all of them the rock paper one is the best. It makes me lol.  You guys are lucky :D I'm really high up on the tumblr system.|||So did you hear about that new first person shooter game? I've been rocking the hell out of the soundtrack on my auto sound equipment that will shake the heavens. We managed to put a couple PS3's in...|||No; The way he connected things was very Ne. Ne dominates are just as aware of their environments as Se dominates.  Example: Shawn Spencer or Patrick Jane; Both ENTPs.|||Well charlie I will be the first to admit I do get jealous like you do. I chalk it up to my 4w3 heart mixed with my dominate 7w8. 7s and 8s both like to be noticed. 4's like to be known (not the same...|||;D I'll upload the same clip with the mic away from my mouth. Than you won't hear anything.  Ninja Assassin style but with splatter.|||Tik Tok is a really great song. As long as you can mental block out the singer. I love the beat it makes me bounce.|||drop.io v1swck0  :D Mic really close to my mouth and smokin aces: assassins ball playing in the background.|||Sociable =/= extrovert; I'm an extrovert and I'm not sociable. :)|||Sherlock in the movie was an ENTP. Normally he's played as a EXTJ. In the books he's an ESTJ.  As I said. The movie looked good except for it being called sherlock holmes.|||http://i817.photobucket.com/albums/zz96/kamioo/Dirtywinch.png|||Oh, I never had fear of kissing a guy. I will kiss an animal too. So there was nothing to vanish. Just personal taste and me not liking it.  The guy I kissed didn't know me. It was one of those...|||Sounds pretty much like my area and what I'm going through right now trying to figure out which way I want to take my life. I want to do so many things. The biggest problem is that I know if I don't...|||;D I was operating under the impression that you were female. I never looked at your boxy. Okay, I help out my gay friends all the time and one of them has developed a little crush on me. I get red...|||T_T You just described me  and I'm living the worst nightmare. I'm trapped in one place with one one around. Only dull woods. If I was a serial killer this would be the perfect place but sadly I'm...|||TBH, and biased, sounds like a shadowed INFP. I think maybe he was hurt and turned ESTJ. I can tell because he has some of the typical INFP traits left over.|||*Checks list* I'm sorry. It seems that you have came at a bad time. We've already reached our quota of INFJs. However, being you're female and I like females I will make you a deal. I will kick one...|||I'm ANTP (Leaning toward E). I'm easy for both ENTPs and INTPs to identify with. :)|||I also imagine ENTP's interrogations would go a little bit like Jack's from 24 except more mechanical. Rigging up shock treatment equipment in an abandoned building out of an old car batty, jumper...|||It was a compliment :) Trust me. I'm just as psychopathic :D except I have emoticons. They're just weird ones. Like laughing when I get hurt or at people running themselves over with their lawn mower...|||http://i817.photobucket.com/albums/zz96/kamioo/Thunderstorm.pnghttp://i817.photobucket.com/albums/zz96/kamioo/Thunderstormbw.png http://i817.photobucket.com/albums/zz96/kamioo/Cosmicstorm.png|||No. It's like a theme for where I live and that is why I know it by heart.   http://www.youtube.com/watch?v=j5W73HaVQBg|||and I usual don't leave until the thing ends. But in the mean time. In between times. You work your thing. I'll work mine :D  ;D I'm the MBP; Pleasure to meet you.|||Damn, need to trust my instincts more I would have been closer I was going to say INFP.|||EXFP? Leaning toward S with the way she responded.  :D My friends, even my gay and lesbian ones, always come to me for advice.|||I bow to my entp masters ENTPs are so great. If it wasn't for ENTPs I wouldn't have been able to build what I'm building  Duck Duck  Duck  Shotgun|||What? Me? I never do that >.> <.<|||Because its hard to be sad about losing someone you like when you knew you were right and give yourself a big pat on the back because you're awesome and always correct.|||Oh, you don't have to tell me that most of them are stupid. I know this. That is why I play with them and it makes me laugh. :D As I'm going to take Neuropsychology and I have a few psychologist...|||:D I'm a Nightowl. I wake up between 6-7pm and stay awake till 10-11:30am.|||Personal opinion backed by theory would suggest that INTPs are the most socially difficult. While INTJs can be socially indifferent but they will also use social situations if the the need arises....|||Personal stocks that I have on my desktop that I've downloaded from random stock sites and stock photobuckets.|||I'll tell you when I open photoshop.  :) Glad you like it static.|||:D Thanks.|||http://i817.photobucket.com/albums/zz96/kamioo/Deathgrip.png http://i817.photobucket.com/albums/zz96/kamioo/Deathgripbw.png  Made for a friend. Several hours of work. I constructed every line by...|||:) Static: http://i817.photobucket.com/albums/zz96/kamioo/Statickitten.png  I'll have to get to your avatar later if one of my fellow teammates doesn't.|||Psychologist don't keep me around long enough to diagnosis me. I like to toy with them. What I have diagnosis myself with and had a few psychologist friends (+ a few other friends) tell me I have is...'"]
sentence_inputs = tokenize_sentences(sentence, tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
sentence_inputs

prediction = model.predict(np.array(sentence_inputs))
df_predict.loc[0,'sentence']=sentence
df_predict.loc[0, cols] = prediction

  0%|          | 0/1 [00:00<?, ?it/s]

In [37]:
sentence_inputs

array([[  101,  1005,  1045,  1005,  1049,  4531,  1996,  3768,  1997,
         2033,  1999,  2122,  8466,  2200,  8598,  2075,  1012,  1064,
         1064,  1064,  3348,  2064,  2022, 11771,  2065,  2009,  1005,
         1055,  1999,  1996,  2168,  2597,  2411,  1012,  2005,  2742,
         2033,  1998,  2026,  6513,  2024,  2747,  1999,  2019,  4044,
         2073,  2057,  2031,  2000,  5541,  2135,  2224, 11190, 15239,
         1998,  8696,  1012,  2045,  3475,  1005,  1056,  2438,  1012,
         1012,  1012,  1064,  1064,  1064,  3228,  2047,  3574,  2000,
         1005,  2208,  1005,  3399,  1012,  1064,  1064,  1064,  7592,
         1008,  4372, 25856,  5861,  1008,  2008,  1005,  1055,  2035,
         2009,  3138,  1012,  2084,  2057, 23705,  1998,  2027,  2079,
         2087,  1997,  1996, 20661,  2096,  1045, 13399,  2037,  3739,
         1998,  2709,  2037,  2616,  2007,  5744,  2773, 13068,  1998,
         2062,  5048,  2100, 20237,  1012,  1064,  1064,  1064,  2023,
      

In [38]:
df_predict

Unnamed: 0,sentence,INFJ,ENTP,INTP,INTJ,ENTJ,ENFJ,INFP,ENFP,ISFP,ISTP,ISFJ,ISTJ,ESTP,ESFP,ESTJ,ESFJ
0,['I'm finding the lack of me in these posts ve...,1.8e-05,5.7e-05,0.000109,0.998233,1.6e-05,6.4e-05,2.3e-05,1e-05,9e-06,0.000131,0.000124,0.001141,5e-06,4e-06,4.8e-05,1e-05


In [39]:
df_predict = pd.DataFrame(columns = colnames)
sentence_inputs = tokenize_sentences([test_data['posts'].tolist()[1]], tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
prediction = model.predict(np.array(sentence_inputs))
df_predict.loc[0, 'sentence'] = test_data['posts'].tolist()[1]
df_predict.loc[0, cols] = prediction

df_predict

  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,sentence,INFJ,ENTP,INTP,INTJ,ENTJ,ENFJ,INFP,ENFP,ISFP,ISTP,ISFJ,ISTJ,ESTP,ESFP,ESTJ,ESFJ
0,to me i think you guys may be over analyzing...,4e-06,0.0,0.0,0.0,4e-06,0.0,0.0,0.0,0.999983,2e-06,1e-06,0.0,5e-06,0.0,0.0,0.0


**Test the model to predict a single sentence. 

In [40]:
sentence_inputs

array([[  101,  2000,  2033,  1045,  2228,  2017,  4364,  2089,  2022,
         2058, 20253,  2009,  1037,  2978,  1045,  2113,  1045,  2079,
         2023,  2411,  2007,  2814,  3251,  1045,  2812,  2000,  2030,
         2025,  1045,  2130,  2788,  5382,  2008,  1045,  2079,  2009,
         1998,  2064,  1056,  2393,  2870,  2061,  2144,  2023,  4165,
         1045,  5791,  2079,  2025,  2228,  2009,  2003,  2074,  2019,
         4372, 25856,  2518,  1045,  2572,  1999,  2546,  3501,  1998,
         2031,  4188,  2000,  5376,  2083, 27669,  2015,  2005,  3087,
         2005,  2087,  1997,  2026,  2166,  1045,  2001,  2992,  2000,
         3582,  2035,  1997,  2122,  5236,  2210,  3513,  1997,  2591,
         2005,  2028,  1997,  1996,  8817,  1997,  5097,  1045,  2031,
         7864,  2000,  2131,  3858,  1998,  2000,  2131,  5086,  1998,
         2980,  7967,  1999,  2392,  1997,  1037,  5016, 28698, 17012,
         1045,  2031,  2196,  2657,  1997,  2008,  2021,  2009,  2515,
      