# Importing Libraries

In [2]:
import pandas as pd
import numpy as np


from sklearn.metrics import confusion_matrix, accuracy_score
import tensorflow as tf

In [2]:
!pip install transformers

[0m

# Importing BERT tokenizer and Sequence Classifier

In [3]:
from transformers import (
    BertTokenizer,
    InputExample,
    InputFeatures,
    TFBertForSequenceClassification,
)

# One checkpoint name shared by the classifier and its tokenizer so the two cannot drift apart
PRETRAINED_CHECKPOINT = "bert-base-uncased"

# The classification head is not part of the checkpoint (see the warning printed below),
# so the model has to be fine-tuned before its predictions mean anything.
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [4]:
# Print the layers of the loaded classifier together with their parameter counts
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


# Reading and analyzing data

In [4]:
import os
import shutil

URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

# Fetch the IMDB archive and unpack it; cache_dir='.' with an empty cache_subdir
# keeps everything relative to the current working directory.
dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz",
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')
path = os.path.join(os.path.dirname(dataset), 'aclImdb')

train_folder = os.path.join(path, 'train')

# Remove the 'unsup' folder so that 'pos' and 'neg' are the only class folders left.
# The existence check keeps the cell re-runnable: rmtree would otherwise raise
# FileNotFoundError when the folder has already been deleted.
remove_folder = os.path.join(train_folder, 'unsup')
if os.path.isdir(remove_folder):
    shutil.rmtree(remove_folder)
# View the final train folder
print(os.listdir(train_folder))

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
['pos', 'neg', 'unsupBow.feat', 'urls_unsup.txt', 'urls_neg.txt', 'urls_pos.txt', 'labeledBow.feat']


In [24]:
#### Reading data and creating the train and test splits
# Both subsets are cut from the same directory with the same seed and split
# fraction. The batch size is larger than the 25000 files found, so each
# subset arrives as one single batch.
split_options = dict(batch_size=30000, validation_split=.3, seed=42)

train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train/', subset='training', **split_options)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train/', subset='validation', **split_options)

Found 25000 files belonging to 2 classes.
Using 17500 files for training.
Found 25000 files belonging to 2 classes.
Using 7500 files for validation.


In [25]:
# Retrieving the text and label arrays from the (single-batch) training dataset
for text_batch, label_batch in train.take(1):
    text = text_batch.numpy()
    label = label_batch.numpy()
    
    

In [26]:
### Viewing the data as a pandas DataFrame
# The two arrays are stacked as labelled rows and then transposed, so the row
# labels become the column names.
train_df = pd.DataFrame([text, label], index=['DATA_COLUMN', 'LABEL_COLUMN']).T
# The reviews arrive as bytes; decode them into regular strings
train_df['DATA_COLUMN'] = train_df['DATA_COLUMN'].str.decode("utf-8")
train_df.head()


Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,Jane Austen would definitely approve of this o...,1
1,Eyeliner was worn nearly 6000 years ago in Egy...,1
2,The same night that I watched this I also watc...,0
3,This is a tepid docu-drama that covers no new ...,1
4,"Freeway Killer, Is a Madman who shoots people ...",0


Let's do some basic analysis of the training data: its shape, missing values, and duplicate rows.


In [27]:
# (rows, columns) of the training frame
train_df.shape

(17500, 2)

In [28]:
# Checking for null values: count of missing entries in each column
train_df.isnull().sum()

DATA_COLUMN     0
LABEL_COLUMN    0
dtype: int64

In [29]:
# Number of rows that repeat an earlier row exactly (same text and same label)
train_df.duplicated().sum()

45

In [30]:
# Let's look at the duplicates: keep=False flags every copy of a repeated row,
# and sorting by the text puts the copies next to each other
train_df.loc[train_df.duplicated(keep=False)].sort_values(by='DATA_COLUMN')

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
5982,A have a female friend who is currently being ...,1
15643,A have a female friend who is currently being ...,1
14908,"Awful, simply awful. It proves my theory about...",0
3291,"Awful, simply awful. It proves my theory about...",0
5292,"Beautiful film, pure Cassavetes style. Gena Ro...",1
...,...,...
17306,"Wow, here it finally is; the action ""movie"" wi...",0
13776,You do realize that you've been watching the E...,0
6909,You do realize that you've been watching the E...,0
4446,this movie begins with an ordinary funeral... ...,0


In [31]:
# Let's remove the duplicates from the training dataset, keeping the first copy of each.
# The result is rebound to the name instead of using inplace=True, so the cell
# reads as an explicit "old frame -> new frame" step.
train_df = train_df.drop_duplicates(keep='first')
train_df.shape

(17455, 2)

In [32]:
# Let's repeat the same steps for the test data
# Retrieving the text and label arrays from the (single-batch) test dataset
for text_batch, label_batch in test.take(1):
    text = text_batch.numpy()
    label = label_batch.numpy()

### Viewing the data as a pandas DataFrame
# The two arrays are stacked as labelled rows and then transposed, so the row
# labels become the column names.
test_df = pd.DataFrame([text, label], index=['DATA_COLUMN', 'LABEL_COLUMN']).T
# The reviews arrive as bytes; decode them into regular strings
test_df['DATA_COLUMN'] = test_df['DATA_COLUMN'].str.decode("utf-8")
test_df.head()

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,I love full house so much that i couldn't live...,1
1,"in one of Neil Simon's best plays. Creaky, cra...",1
2,This was one of the worst movies I have ever s...,0
3,Elvira Mistress Of The Dark (1988): Cassandra ...,1
4,I love this show. I watch all the reruns every...,1


In [33]:
# Checking the class distribution: percentage of training rows whose label is 1
label_one_count = train_df['LABEL_COLUMN'].sum()
round(label_one_count / len(train_df) * 100, 2)

49.85

We observe that the class distribution is even: 49.85% of the training reviews carry label 1, so the two classes are almost perfectly balanced.

In [34]:
# (rows, columns) of the test frame
test_df.shape

(7500, 2)

In [35]:
# Count of missing entries in each column of the test frame
test_df.isnull().sum()

DATA_COLUMN     0
LABEL_COLUMN    0
dtype: int64

In [36]:
# Number of test rows that repeat an earlier row exactly (same text and same label)
test_df.duplicated().sum()

12

In [37]:
# Let's look at the duplicates: keep=False flags every copy of a repeated row,
# and sorting by the text puts the copies next to each other
test_df.loc[test_df.duplicated(keep=False)].sort_values(by='DATA_COLUMN')

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
7072,After losing the Emmy for her performance as M...,1
2334,After losing the Emmy for her performance as M...,1
6569,Bette Midler is indescribable in this concert....,1
5389,Bette Midler is indescribable in this concert....,1
2665,How has this piece of crap stayed on TV this l...,0
3558,How has this piece of crap stayed on TV this l...,0
7412,"I absolutely hate this programme, what kind of...",0
6010,"I absolutely hate this programme, what kind of...",0
6362,I saw this movie twice. I can't believe Pintil...,0
3233,I saw this movie twice. I can't believe Pintil...,0


In [38]:
# Let's remove the duplicates from the test dataset, keeping the first copy of each.
# The result is rebound to the name instead of using inplace=True, so the cell
# reads as an explicit "old frame -> new frame" step.
test_df = test_df.drop_duplicates(keep='first')
test_df.shape

(7488, 2)

#### Preprocessing

In [39]:
# Creating input sequences
def data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of the two DataFrames in an ``InputExample``.

    Args:
        train: DataFrame whose rows become the training examples.
        test: DataFrame whose rows become the validation examples.
        DATA_COLUMN: Name of the column that supplies ``text_a``.
        LABEL_COLUMN: Name of the column that supplies ``label``.

    Returns:
        A ``(train_input, validation_input)`` pair; each element is the result
        of a row-wise ``DataFrame.apply`` over the corresponding frame.
    """
    def row_to_example(row):
        # guid (global unique ID) is not used, and there is no second text segment
        return InputExample(guid=None,
                            text_a=row[DATA_COLUMN],
                            text_b=None,
                            label=row[LABEL_COLUMN])

    train_input = train.apply(row_to_example, axis=1)
    validation_input = test.apply(row_to_example, axis=1)

    return train_input, validation_input

In [40]:
  
def examples_to_tokens(examples, tokenizer, max_length=128):
    """Tokenize ``InputExample`` objects and expose them as a ``tf.data.Dataset``.

    Every example's ``text_a`` is encoded up front with
    ``tokenizer.encode_plus`` (special tokens added, padded and truncated to
    ``max_length``). The encoded features are then served through a
    generator-backed dataset.

    Args:
        examples: Iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: Tokenizer object providing ``encode_plus``.
        max_length: Length every sequence is padded / truncated to.

    Returns:
        A dataset yielding ``(inputs, label)`` pairs. ``inputs`` is a dict with
        the keys ``input_ids``, ``attention_mask`` and ``token_type_ids``
        (declared as int32 vectors); ``label`` is declared as an int64 scalar.
    """
    def encode(example):
        encoded = tokenizer.encode_plus(
            example.text_a,
            add_special_tokens=True,      # add the tokenizer's special tokens
            max_length=max_length,        # target length for padding / truncation
            return_token_type_ids=True,   # segment ids
            return_attention_mask=True,   # 1 = attend to the token, 0 = ignore it
            padding='max_length',
            truncation=True,
        )
        return InputFeatures(
            input_ids=encoded["input_ids"],            # numerical representation of the text
            attention_mask=encoded['attention_mask'],
            token_type_ids=encoded["token_type_ids"],
            label=example.label,
        )

    # Encode everything eagerly; the generator below only replays this list
    features = [encode(example) for example in examples]

    def gen():
        for feature in features:
            model_inputs = {
                "input_ids": feature.input_ids,
                "attention_mask": feature.attention_mask,
                "token_type_ids": feature.token_type_ids,
            }
            yield (model_inputs, feature.label)

    output_types = (
        {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
        tf.int64,
    )
    output_shapes = (
        {
            "input_ids": tf.TensorShape([None]),
            "attention_mask": tf.TensorShape([None]),
            "token_type_ids": tf.TensorShape([None]),
        },
        tf.TensorShape([]),
    )
    return tf.data.Dataset.from_generator(gen, output_types, output_shapes)


# Names of the text and label columns in train_df / test_df
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [41]:
train_InputExamples, validation_InputExamples = data_to_examples(train_df, test_df, DATA_COLUMN, LABEL_COLUMN)

# Training pipeline: tokenize, shuffle within a 100-element buffer, batch, and
# repeat the whole dataset twice. Because of repeat(2), each epoch of
# model.fit walks over the training data two times.
train_data = (
    examples_to_tokens(list(train_InputExamples), tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation pipeline: tokenize and batch only (no shuffling, no repetition)
validation_data = examples_to_tokens(list(validation_InputExamples), tokenizer).batch(32)

# Training the model

In [44]:
# Fine-tuning setup: Adam with gradient-norm clipping, a loss that works on
# raw logits (from_logits=True), and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

model.fit(train_data, epochs=3, validation_data=validation_data)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8248954a90>

## Prediction

In [53]:
test_input = ['Our family (and the entire sold out sneak preview audience) enjoyed "The Guardian". Kevin Costner and Ashton Kutcher gave convincing performances as the fictional helicopter rescue swimmer characters Ben and Jake. After seeing this movie, you can\'t help but imagine how difficult it must be to graduate from the USCG helicopter rescue swimmer school and one day take part in real rescues.<br /><br />Even though this is a fictional movie, it delivered rather convincing virtues of team spirit, dedication and bravery exhibited by all the members of the actual U. S. Coast Guard.<br /><br />The special effects used to create the rescue scenes were incredible. You actually felt like you were taking part in a real rescue.<br /><br />I feel the movie could have been made without the "Hollywood" bar scene (when you see the movie, you might agree) since the real Coast Guard does not condone such behavior. <br /><br />Very entertaining, very action packed, definitely worth seeing. Thank you, U. S. Coast Guard and the REAL helicopter rescue swimmers, "So Others May Live". I\'d highly recommend this movie to everyone.',
              'Might end up being the biggest disappointment that I will see in 2009. I seem to be the rare person who disliked Park\'s Oldboy, but I think that his "Lady Vengeance" and "Sympathy for Mr. Vengeance" are among the best films I\'ve seen in the 2000\'s decade. Therefore, I really was looking forward to see this, especially as it got such positive reviews. Instead, I found the film clichéd, and broke little, if any new ground to the vampire genre. And while I can appreciate a bit of gallows humor in movies like this, I felt Park did this at very inopportune times.<br /><br />Others have compared/contrasted this to "Let the Right One In," and I have to say that "Let the Right One In" was far superior to this one, and was a fresh take on the vampire genre. Sadly, Park\'s take was a tired one.']

# Tokenize the sample reviews with the same length settings used for training
test_batch = tokenizer(test_input, max_length=128, padding='max_length', truncation=True, return_tensors='tf')
test_outputs = model(test_batch)
# Softmax over the last axis of the first model output (presumably the logits) gives class probabilities
test_predictions = tf.nn.softmax(test_outputs[0], axis=-1)
labels = ['Negative','Positive']
# Index of the most probable class for each review
label = tf.argmax(test_predictions, axis=1).numpy()
for review, predicted_class in zip(test_input, label):
    print(review, ": \n", labels[predicted_class], '\n\n')



Our family (and the entire sold out sneak preview audience) enjoyed "The Guardian". Kevin Costner and Ashton Kutcher gave convincing performances as the fictional helicopter rescue swimmer characters Ben and Jake. After seeing this movie, you can't help but imagine how difficult it must be to graduate from the USCG helicopter rescue swimmer school and one day take part in real rescues.<br /><br />Even though this is a fictional movie, it delivered rather convincing virtues of team spirit, dedication and bravery exhibited by all the members of the actual U. S. Coast Guard.<br /><br />The special effects used to create the rescue scenes were incredible. You actually felt like you were taking part in a real rescue.<br /><br />I feel the movie could have been made without the "Hollywood" bar scene (when you see the movie, you might agree) since the real Coast Guard does not condone such behavior. <br /><br />Very entertaining, very action packed, definitely worth seeing. Thank you, U. S. C