In [None]:
import numpy as np
import pandas as pd

### Import dataset

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset is read from
# a path under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tensorflow as tf
import sklearn
from tqdm import tqdm
# Read the IMDB review CSV from the mounted Drive and preview the first rows.
IMDB_CSV_PATH = '/content/drive/MyDrive/4-1_Semester/practice code/Sentiment Analysis/IMDB Dataset.csv'
df = pd.read_csv(IMDB_CSV_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Class balance: number of reviews per sentiment label (labels are still strings here).
df.sentiment.value_counts()

NameError: ignored

In [None]:
%matplotlib inline
# Bar chart of the per-label review counts.
df.sentiment.value_counts().plot.bar()

NameError: ignored

In [None]:
##install Transformers Library
!pip install transformers

In [None]:
# Loading the BERT Classifier and Tokenizer along with Input module
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# The tokenizer and the sequence classifier are both loaded from the same
# pretrained checkpoint name.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertForSequenceClassification.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Print the layer/parameter summary of the loaded classifier.
model.summary()

### Change positive/negative labels into numeric values

In [None]:
# Encode the sentiment labels as integers: 'negative' -> 0, anything else -> 1.
# The mask also treats an already-encoded 0 as negative, so re-running this
# cell leaves the labels unchanged. The previous lambda turned every label
# into 1 on a second run, because 0 != 'negative'.
is_negative = df['sentiment'].isin(['negative', 0])
df['sentiment'] = (~is_negative).astype('int64')
# The former bare `df.sample()` was dropped: its result was discarded because
# only the last expression of a cell is displayed.
df.head()

In [None]:
# Per-class counts again, now on the integer-encoded labels (0 = negative, 1 = positive).
df.sentiment.value_counts()

In [None]:
%matplotlib inline
# Bar chart of the per-class counts after numeric encoding.
df.sentiment.value_counts().plot.bar()

In [None]:
# Positional split: the first 45,000 rows are used for training, the remainder for testing.
# NOTE(review): rows are taken in file order, with no shuffling before the split.
TRAIN_ROWS = 45000
train, test = df.iloc[:TRAIN_ROWS], df.iloc[TRAIN_ROWS:]

### Example

In [None]:
## BERT tokenizer example: split a sentence into tokens, then map the tokens to vocabulary ids

example = 'A quick brown fox jumps over the lazy dog'
tokens = tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# One print call, one item per line: tokens first, then their ids.
print(tokens, token_ids, sep='\n')

### Data processing

1. Add special tokens to separate sentences and do classification
2. Pass sequences of constant length (introduce padding)
3. Create array of 0s (pad token) and 1s (real token) called attention mask

In [None]:
## convert_data_to_examples: will accept train/test dataset and convert each row into an InputExamples object.
def convert_data_to_examples(train, test, review, sentiment):
    """Wrap every row of the train and test frames in an ``InputExample``.

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the held-out rows.
        review: Name of the column whose value becomes ``text_a``.
        sentiment: Name of the column whose value becomes ``label``.

    Returns:
        A ``(train_InputExamples, test_InputExamples)`` pair; each element is
        the result of a row-wise ``DataFrame.apply`` over the matching frame.
    """
    def row_to_example(row):
        # No guid is assigned; only the text and its label are carried over.
        return InputExample(guid=None, text_a=row[review], label=row[sentiment])

    train_InputExamples = train.apply(row_to_example, axis=1)
    test_InputExamples = test.apply(row_to_example, axis=1)
    return train_InputExamples, test_InputExamples

# Build the InputExample series for both splits from the 'review' and 'sentiment' columns.
train_InputExamples, test_InputExamples = convert_data_to_examples(
    train, test, review='review', sentiment='sentiment'
)

In [None]:
 #train_InputExamples[0]

In [None]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with ``text_a`` and ``label`` attributes
            (e.g. ``InputExample``).
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every encoded sequence is truncated or padded to.

    Returns:
        A ``tf.data.Dataset`` built from a generator that yields
        ``(features, label)`` pairs, where ``features`` is a dict with the
        keys ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    """
    features = []  # InputFeatures collected up front; replayed by gen() below

    for e in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated in Transformers;
            # `padding="max_length"` is the supported way to pad every
            # sequence up to `max_length`.
            padding="max_length",
            truncation=True,            # cut sequences longer than max_length
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replay the pre-tokenized features in their original order.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


# Names of the text and label columns of the review DataFrame.
# NOTE(review): not referenced by the visible cells, which pass the literal
# column names instead.
DATA_COLUMN, LABEL_COLUMN = "review", "sentiment"

In [None]:
#convert_examples_to_tf_dataset: will tokenize the InputExample objects, then create the required input format with the 
#                                tokenized objects, finally, create an input dataset that we can feed to the model.

train_data = (
    convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    .shuffle(50)   # shuffle buffer of 50 elements
    .batch(32)
    # NOTE(review): repeat(2) makes one pass over this dataset cover the
    # training data twice - confirm that is intended together with the
    # `epochs` value passed to model.fit.
    .repeat(2)
)

100%|██████████| 45000/45000 [04:50<00:00, 154.89it/s]


In [None]:
# The held-out split is tokenized the same way, then batched without shuffling or repeating.
test_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer).batch(32)

100%|██████████| 5000/5000 [00:29<00:00, 166.72it/s]


### Feed the data to the model

In [None]:
# Fine-tuning setup: Adam with gradient-norm clipping, a sparse categorical
# cross-entropy loss computed on raw logits (from_logits=True), and accuracy
# as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

# NOTE(review): the test split is used as validation data here.
model.fit(train_data, epochs=2, validation_data=test_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f36c48934d0>

### Test Model

In [None]:
# Hand-written reviews used to spot-check the fine-tuned model.
pred_sentences = ['worst movie of my life, will never watch movies from this series',
                  'I was going to say something awesome or great or good, but the movie was so bad',
                  'I loved this movie',
                  'The first half was so bad, but i loved the 2nd part']

In [None]:
# Tokenize the raw sentences into a batch of TF tensors before sending them
# to the trained model.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)

# tf_outputs[0] is treated as the per-class scores; softmax over the last
# axis (axis=-1) normalises them for each sentence.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

labels = ['Negative','Positive']
# argmax over axis 1 picks the highest-scoring class index per sentence.
label = tf.argmax(tf_predictions, axis=1).numpy()

for sentence, class_index in zip(pred_sentences, label):
    print(sentence, ": ", labels[class_index])

worst movie of my life, will never watch movies from this series :  Negative
I was going to say something awesome or great or good, but the movie was so bad :  Negative
I loved this movie :  Positive
The first half was so bad, but i loved the 2nd part :  Negative
