In [41]:
!pip install transformers

[0m

In [42]:
from transformers import BertTokenizer, TFBertForSequenceClassification

# initialize our model to support 3 classes (Positive, Negative and Neutral)
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
import pandas as pd
import tensorflow as tf

#!pip uninstall numpy -y
#!pip install --upgrade pip
#!pip install numpy

from sklearn.model_selection import train_test_split

In [44]:
from sklearn.preprocessing import LabelEncoder

# Loading data
data = pd.read_csv('sentiment_data.csv')

label_encoder = LabelEncoder()
data['Sentiment'] = label_encoder.fit_transform(data['Sentiment'])

print("Total entries: ", data.shape[0])

# 0 - Negative | 1 - Neutral | 2 - Positive
print(data['Sentiment'].value_counts())

Total entries:  5842
1    3130
2    1852
0     860
Name: Sentiment, dtype: int64


In [45]:
# Spliting into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.20)

In [46]:
# Creating tokens of each sentence in sentences
def tokenize_sentences(sentences):
    input_ids, token_type_ids, attention_masks = [], [], []
    for sentence in sentences:
        encoded = tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
            truncation=True
        )
        input_ids.append(encoded['input_ids'])
        token_type_ids.append(encoded['token_type_ids'])
        attention_masks.append(encoded['attention_mask'])
    return input_ids, token_type_ids, attention_masks

example="My name is Pulkit Agarwal"
input_ids, token_type_ids, attention_masks = tokenize_sentences([example])
print("input_ids: ", input_ids)
print("token_type_ids: ", token_type_ids)
print("attention_masks: ", attention_masks)

input_ids:  [[101, 2026, 2171, 2003, 16405, 13687, 4183, 12943, 2906, 13476, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
token_type_ids:  [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
attention_masks:  [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [47]:
def tokenize_and_create_dataset(data, batch_size):
    input_ids, token_type_ids, attention_masks = tokenize_sentences(data['Sentence'])
    dataset = tf.data.Dataset.from_tensor_slices((
        {
          'input_ids': input_ids,
          'token_type_ids': token_type_ids,
          'attention_mask': attention_masks
        },
        data['Sentiment']
    )).batch(batch_size)
    return dataset

train_dataset = tokenize_and_create_dataset(train_data, batch_size=32)
val_dataset = tokenize_and_create_dataset(val_data, batch_size=32)

In [48]:
#optimizer = AdamWeightDecay(lr=3e-5)
#loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#metrics = ['accuracy']
#model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# CP for accuracy
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [53]:
# using epoc = 1 because of time xD
history = model.fit(train_dataset, validation_data=val_dataset)



In [54]:
val_acc = history.history['val_accuracy'][-1]

print("Validation accuracy: ", val_acc)

Validation accuracy:  0.7570573091506958


In [55]:
def predict(sentence, model, tokenizer):
    #reuse the token creation code for now
    encoded = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
        truncation=True
    )
    input_ids = tf.expand_dims(encoded['input_ids'], 0)
    token_type_ids = tf.expand_dims(encoded['token_type_ids'], 0)
    attention_mask = tf.expand_dims(encoded['attention_mask'], 0)

    logits = model.predict({
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    })[0]
    probabilities = tf.nn.softmax(logits)
    prediction = tf.argmax(probabilities, axis=-1).numpy()[0]
    return prediction

sentence = val_data['Sentence'].iloc[1]
prediction = predict(sentence, model, tokenizer)



In [56]:
print("Predictions: 0 -> Negative | 1 -> Neutral | 2 -> Positive")
print("Predicted sentiment: ", prediction)
actual_label = val_data['Sentiment'].iloc[1]
print("Actual sentiment: ", label)

Predictions: 0 -> Negative | 1 -> Neutral | 2 -> Positive
Predicted sentiment:  1
Actual sentiment:  negative
