# Demo on BERT: a case study on sentence classification

# Installing the transformers library

In [None]:
# Uncomment to install the Hugging Face transformers package into the active kernel's environment.
# %pip install transformers

# Upgrading the TensorFlow library

In [None]:
# Uncomment to upgrade TensorFlow to v2.4.1 (pip needs the `==` version specifier).
# %pip install --upgrade tensorflow==2.4.1

# Import the required libraries

In [2]:
import os
import warnings

import numpy as np
import pandas as pd
import torch
import transformers as tns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Suppress every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation notices and any other warning raised by the libraries used below.
warnings.filterwarnings('ignore')

# Load the dataset

In [3]:
# Location of the SST-2 training split. Set the SST2_TRAIN_PATH environment
# variable to point at your own copy; the fallback is the original author's
# local path, so existing setups keep working unchanged.
SST2_TRAIN_PATH = os.environ.get(
    'SST2_TRAIN_PATH',
    r'C:\Users\meenakshi.h\Desktop\Machine Translation\Data\SST2\train.tsv',
)

# The file is tab-separated and read without a header row, so the columns are
# addressed by position: 0 is the text that gets tokenized, 1 is the label.
df = pd.read_csv(SST2_TRAIN_PATH, delimiter='\t', header=None)

A subset of the original dataset is selected for implementation.

In [4]:
# Work with only the first 2000 rows to keep the demo quick to run.
data = df.iloc[:2000]

In [5]:
# Class balance check: how many rows carry each value of the label column (column 1).
data.loc[:, 1].value_counts()

1    1041
0     959
Name: 1, dtype: int64

# Loading the Pre-trained BERT model

In [8]:
# For DistilBERT: pick the model class, its matching tokenizer class, and the
# name of the pretrained checkpoint to download.
model_class = tns.DistilBertModel
tokenizer_class = tns.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## Want BERT instead of distilBERT? Uncomment the following three lines:
#model_class = tns.BertModel
#tokenizer_class = tns.BertTokenizer
#pretrained_weights = 'bert-base-uncased'

# Load pretrained model/tokenizer (both come from the same checkpoint name so
# the tokenizer's vocabulary matches the model's embeddings).
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Preprocessing the dataset

# Tokenization<br>
Tokenize each sentence into the token-ID format that BERT expects.

In [9]:
# Encode every sentence in column 0 as a list of token ids; add_special_tokens=True
# asks the tokenizer to include the model's special tokens in each list.
tokenized = data[0].apply(
    lambda sentence: tokenizer.encode(sentence, add_special_tokens=True)
)

# Padding <br>
BERT can take all the tokens at once to process.<br>
Padding is used to make every list of tokens the same length.<br>


In [None]:
# Length of the longest token list (0 if there are none); every shorter list
# is right-padded up to this length.
max_len = max((len(token_ids) for token_ids in tokenized.values), default=0)

# Append zeros to each list so the ragged lists become one rectangular array.
padded = np.array([
    token_ids + [0] * (max_len - len(token_ids))
    for token_ids in tokenized.values
])

Print the dimension of the padded data

In [11]:
# `padded` is already a NumPy array, so its shape can be read directly.
padded.shape

(2000, 59)

# Masking<br>
Create an attention mask so that the model ignores (masks) the padding we've added when it processes its input.

In [12]:
# Padding positions were filled with 0, so every non-zero entry is a real token:
# 1 = attend to this position, 0 = ignore it.
attention_mask = (padded != 0).astype(int)
attention_mask.shape

(2000, 59)

# Model building <br>
Call the `model()` function to run BERT over the preprocessed data.<br>


In [14]:
# Wrap the NumPy arrays as tensors so they can be fed to the model.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)

# Only a forward pass is needed here, so gradient tracking is switched off.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [15]:
# Element 0 of the model output is indexed as (sentence, position, hidden unit).
# Keep only position 0 of every sentence and convert to a NumPy array so it
# can be handed to scikit-learn.
# NOTE(review): position 0 is presumably the [CLS] special token added during tokenization — confirm.
features = last_hidden_states[0][:, 0].numpy()

Get the label information

In [17]:
# Column 1 of the subset holds the class label for each sentence.
labels = data.loc[:, 1]

Split our dataset into a training set and a testing set

In [18]:
# Shuffle and split the features/labels into train and test portions (the
# split ratio is left at scikit-learn's default). The seed is fixed so that
# the split, and therefore the accuracy computed from it, is the same on
# every run of the notebook.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, random_state=42
)

Train the LogisticRegression model.

In [19]:
# Fit a logistic-regression classifier with default hyper-parameters on the
# training features; the fitted estimator is the cell's displayed output.
lr = LogisticRegression()
lr.fit(X=train_features, y=train_labels)

LogisticRegression()

Evaluate the model

In [20]:
# Score the fitted classifier on the held-out test split.
lr.score(X=test_features, y=test_labels)

0.846