Installation

In [4]:
!pip install transformers



In [36]:
from tqdm.notebook import tqdm
import tensorflow as tf
import tensorflow.keras
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import regex as re
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import pandas as pd


Reading data

In [34]:
data=pd.read_csv('mbti_1.csv', encoding='ISO-8859-1')
data.head()

Unnamed: 0,type,posts
0,INFJ,http://www.youtube.com/watch?v=qsXHcwe3krw|||h...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...



cleaning the data

In [35]:
data['posts'] = data['posts'].str.lower()
data['Posts'] = data['posts'].apply(lambda x: re.sub(r'https?://[^\s<>"]+|www\.[^\s<>"]+', ' ', x.replace('|','')))
#df['Posts']=df['Posts'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
#df['Posts']=df['Posts'].apply(lambda x: re.sub(r'\w*\d\w*', '', x))
data['Posts']=data['Posts'].apply(lambda x: re.sub(r'[^0-9a-z]', ' ', x))
data.drop('posts',inplace=True,axis=1)
data.head()
#data.posts = clean_text(data)
data

Unnamed: 0,type,Posts
0,INFJ,and intj moments sportscenter not top te...
1,ENTP,i m finding the lack of me in these posts ver...
2,INTP,good one course to which i say i ...
3,INTJ,dear intp i enjoyed our conversation the o...
4,ENTJ,you re fired that s another silly misconcepti...
...,...,...
8670,ISFP,just because i always think of cats as fi d...
8671,ENFP,so if this thread already exists someplace ...
8672,INTP,so many questions when i do these things i ...
8673,INFP,i am very conflicted right now when it comes ...


BERT tokenizer and Attention Mask

In [37]:
posts = data['Posts'].values
labels =  data['type'].values
train_data, test_data = train_test_split(data, random_state=0, test_size=0.3)


In [39]:
pretrained_model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, do_lower_case=True)
MAX_LEN = 1000

def tokenize_sentences(sentences, tokenizer, max_seq_len = 1000):
    tokenized_sentences = []
    for sentence in tqdm(sentences):
        tokenized_sentence = tokenizer.encode(sentence, add_special_tokens = True, max_length = max_seq_len)
        tokenized_sentences.append(tokenized_sentence)  
    return tokenized_sentences

def create_attention_masks(preprocessed_sentences):
    attention_masks = []
    for sentence in preprocessed_sentences:
        att_mask = [int(token_id > 0) for token_id in sentence]
        attention_masks.append(att_mask)
    return np.asarray(attention_masks)

train_input = tokenize_sentences(train_data['Posts'], tokenizer, MAX_LEN)
train_input = pad_sequences(train_input, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
train_attention_masks = create_attention_masks(train_input)

test_input = tokenize_sentences(test_data['Posts'], tokenizer, MAX_LEN)
test_input = pad_sequences(test_input, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
test_attention_masks = create_attention_masks(test_input)

  0%|          | 0/6072 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


  0%|          | 0/2603 [00:00<?, ?it/s]

# BERT Model
* Load the pretrained BERT base-model from Transformers library
- Take the first hidden-state from BERT output (corresponding to CLS token) and feed it into a Dense layer with 16 neurons and softmax activation

In [None]:

#from transformers import TFBertModel

#from tensorflow.keras.layers import Dense, Flatten

#class BertClassifier(tf.keras.Model):    
#        def __init__(self, bert: TFBertModel, num_classes: int):
#            super().__init__()
#            self.bert = bert
#            self.classifier = Dense(16, activation='softmax')

#        @tf.function
#        def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
#            outputs = self.bert(input_ids,
#                                   attention_mask=attention_mask,
#                                   token_type_ids=token_type_ids,
#                                   position_ids=position_ids,
#                                   head_mask=head_mask)
#            cls_output = outputs[1]
#            cls_output = self.classifier(cls_output)

#            return cls_output
        
        
#with strategy.scope():        
#    model = BertClassifier(TFBertModel.from_pretrained(bert_model_name), len(label_cols))

In [40]:
import transformers
def create_model(): 
    input_word_ids = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    bert_layer = transformers.TFBertModel.from_pretrained('bert-large-uncased')
    bert_outputs = bert_layer(input_word_ids)[0]
    pred = tf.keras.layers.Dense(16, activation='softmax')(bert_outputs[:,0,:])
    model = tf.keras.models.Model(inputs=input_word_ids, outputs=pred)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.00002), metrics=['accuracy'])
    return model

In [41]:
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

with strategy.scope():
  model = create_model()  
model.summary()

INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.


INFO:tensorflow:Deallocate tpu buffers before initializing tpu system.






INFO:tensorflow:Initializing the TPU system: grpc://10.49.62.138:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.49.62.138:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer)  [(None, 1000)]           0         
                                                                 
 tf_bert_model (TFBertModel)  TFBaseModelOutputWithPoo  335141888
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             1000, 1024),                        
                              pooler_output=(None, 10            
                             24),                                
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                          
                                                             

In [42]:
types = np.unique(data.type.values)

def get_type_index(string):
    return list(types).index(string)

In [48]:

train_data['type_index'] = data['type'].apply(get_type_index)
one_hot_labels = tf.keras.utils.to_categorical(train_data.type_index.values, num_classes=16)
test_data['type_index'] = data['type'].apply(get_type_index)
test_labels = tf.keras.utils.to_categorical(test_data.type_index.values, num_classes=16)


In [50]:

model.fit(np.array(train_input), one_hot_labels, verbose = 1, epochs = 20, batch_size = 16,  callbacks = [tf.keras.callbacks.EarlyStopping(patience = 5)])

Epoch 1/20












Epoch 2/20




Epoch 3/20




Epoch 4/20




Epoch 5/20




Epoch 6/20




Epoch 7/20




Epoch 8/20




Epoch 9/20




Epoch 10/20




Epoch 11/20




Epoch 12/20




Epoch 13/20




Epoch 14/20




Epoch 15/20




Epoch 16/20




Epoch 17/20




Epoch 18/20




Epoch 19/20




Epoch 20/20






<keras.callbacks.History at 0x7f6713d82390>

In [None]:
model.evaluate(np.array(test_input), test_labels)

 1/82 [..............................] - ETA: 31:47 - loss: 1.8768 - accuracy: 0.7188

In [None]:
cols = data['type'].unique()
cols = cols.tolist()
colnames = ['sentence']
colnames = colnames+cols


In [None]:
#Test the model to predict a single sentence. 
model_path = "mbti_bert_model0414"
model.save('bert_model1.h5')

In [None]:
#Prediction
df_prediction = pd.read_csv('ASUS_fans_cleaned.csv', encoding='ISO-8859-1')

df_prediction.dropna(inplace = True)
sentence_inputs = tokenize_sentences(df_prediction['Posts'], tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")
prediction = model.predict(np.array(sentence_inputs))


  0%|          | 0/232 [00:00<?, ?it/s]

In [None]:
df_prediction1 = pd.read_csv('result_Apple1.csv', encoding='ISO-8859-1')
df_prediction1.drop(['sentiment'], axis = 1,inplace = True)
df_prediction1.dropna(inplace = True)
df_prediction1.head()
sentence_inputs = tokenize_sentences(df_prediction1['Posts'], tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

  0%|          | 0/1563 [00:00<?, ?it/s]

In [None]:
prediction1 = model.predict(np.array(sentence_inputs))

In [None]:
df_prediction.loc[:, cols] = prediction
df_prediction.to_csv('ASUS_result_Mbti3.csv')

In [None]:
df_prediction.loc[:, cols] = prediction

In [None]:
df_prediction.to_csv('user_result.csv')

In [None]:
# new_model = create_model()
# #new_model = tf.keras.models.load_model('bert_model.h5')
# new_model.load_weights('bert_model.h5')
# # Check its architecture
# new_model.summary()

In [None]:
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
bert_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_name, do_lower_case=True)
MAX_LEN = 1000

new_model = create_model()
#train_list = load_image_list('/content/gdrive/My Drive/bert_model.h5')
#new_model = tf.keras.models.load_model('bert_model.h5')
new_model.load_weights('bert_model.h5')
# Check its architecture
new_model.summary()



Some layers from the model checkpoint at bert-large-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-large-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


OSError: ignored

In [None]:
df_prediction = pd.read_csv('sentiment_Samsung1.csv', encoding='ISO-8859-1')

MAX_LEN = 1000
df_prediction.dropna(inplace = True)
sentence_inputs = tokenize_sentences(df_prediction['Posts'], tokenizer, MAX_LEN)
sentence_inputs = pad_sequences(sentence_inputs, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")


In [None]:
prediction = new_model.predict(np.array(sentence_inputs))
df_prediction.loc[:, cols] = prediction
df_prediction.to_csv('Samsung_result_mbti.csv')

InvalidArgumentError: ignored

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
