In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

#This code was modified from https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb#scrollTo=iCoyxRJ7ECTA

In [2]:
# Select the pretrained encoder. BERT is the active choice; to use the smaller
# DistilBERT instead, replace the three assignments below with the commented
# alternative.
model_class = ppb.BertModel
tokenizer_class = ppb.BertTokenizer
pretrained_weights = 'bert-base-uncased'

# DistilBERT alternative:
#model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

# Instantiate the tokenizer and the model from the same checkpoint name so
# that the token ids produced by one match the vocabulary of the other.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Load the tab-separated reviews and hold out 5% of them for evaluation.
df_rest = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(
    df_rest['Review'],
    df_rest['Liked'],
    test_size=0.05,
    random_state=98,
)


def encode_review(text):
    """Return the token ids for one review, including the special tokens."""
    return tokenizer.encode(text, add_special_tokens=True)


# The padding length is taken over ALL reviews (train and test) so that both
# splits are padded to the same width later on.
tokenized = df_rest['Review'].apply(encode_review)
max_len = max((len(ids) for ids in tokenized.values), default=0)

# From here on `tokenized` holds the training split only.
tokenized = X_train.apply(encode_review)

In [4]:

# Right-pad every token-id list with 0 up to max_len so the rows form a
# rectangular array (the attention mask later treats id 0 as padding).
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

padded.shape

(950, 43)

In [5]:
# Mask is 0 wherever the id is the padding value 0, and 1 on every real token.
attention_mask = np.where(padded == 0, 0, 1)
attention_mask.shape

(950, 43)

In [6]:
# Move the padded ids and their mask from NumPy into torch tensors.
input_ids, attention_mask = torch.tensor(padded), torch.tensor(attention_mask)

# Inference only: disable gradient tracking while running the encoder over
# the whole training split in a single forward pass.
with torch.no_grad():
    last_hidden_states = model(input_ids=input_ids, attention_mask=attention_mask)

In [7]:
# Take the first model output and keep, for every review, only the vector at
# sequence position 0 (the special token added in front of each review).
Train_features = last_hidden_states[0][:, 0].numpy()

In [8]:
# Fit a default logistic regression on the extracted sentence features.
lr_clf = LogisticRegression()
lr_clf.fit(X=Train_features, y=y_train)

LogisticRegression()

In [9]:
# Embed the held-out reviews with the same steps used for the training split:
# tokenize, right-pad with 0 up to max_len, mask the padding, run the model.
tokenized = X_test.apply(lambda text: tokenizer.encode(text, add_special_tokens=True))
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])
print(padded.shape)

attention_mask = torch.tensor(np.where(padded != 0, 1, 0))
input_ids = torch.tensor(padded)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# Keep only the hidden state at sequence position 0 for each review.
Test_features = last_hidden_states[0][:, 0, :].numpy()

(50, 43)


In [10]:
# Mean accuracy of the fitted classifier on the held-out reviews.
lr_clf.score(X=Test_features, y=y_test)

0.9

In [11]:
def prRed(skk):
    """Print `skk` wrapped in the ANSI escape codes for bright red text."""
    print("\033[91m {}\033[00m".format(skk))


def prGreen(skk):
    """Print `skk` wrapped in the ANSI escape codes for bright green text."""
    print("\033[92m {}\033[00m".format(skk))

# Classify every held-out review in one call, reusing the embeddings already
# stored in Test_features rather than re-tokenizing and re-running the encoder
# once per review. Because these are the same features passed to
# lr_clf.score(Test_features, y_test), the error counts printed here are
# consistent with that accuracy.
predictions = lr_clf.predict(Test_features)

fp = 0
fn = 0

# Test_features was built from X_test in order, so row k of `predictions`
# belongs to the k-th review / label pair.
for review, liked, predicted in zip(X_test, y_test, predictions):
    # Same "<label>[<prediction>]" suffix as the per-review output, e.g. "0[1]".
    suffix = "{}[{}]".format(liked, predicted)
    if liked == 1 and predicted == 0:
        # Liked review predicted as not liked: false negative.
        prGreen(str(review) + " Favorable Review, Unfavorable Prediction (FN) " + suffix)
        fn += 1
    elif liked == 0 and predicted == 1:
        # Not-liked review predicted as liked: false positive.
        prRed(str(review) + " Unfavorable Review, Favorable Prediction (FP) " + suffix)
        fp += 1
    else:
        print(str(review) + " " + suffix)

print("False Positives: ", fp, " False Negatives: ", fn)

Everything was gross. 0[0]
Thus far, have only visited twice and the food was absolutely delicious each time. 1[1]
[91m This place has a lot of promise but fails to deliver. Unfavorable Review, Favorable Prediction (FP) 0[1][00m
I would avoid this place if you are staying in the Mirage. 0[0]
The sangria was about half of a glass wine full and was $12, ridiculous. 0[0]
Their frozen margaritas are WAY too sugary for my taste. 0[0]
This place is not quality sushi, it is not a quality restaurant. 0[0]
I had the opportunity today to sample your amazing pizzas! 1[1]
The selection of food was not the best. 0[0]
I've had better atmosphere. 0[0]
Before I go in to why I gave a 1 star rating please know that this was my third time eating at Bachi burger before writing a review. 0[0]
I took back my money and got outta there. 0[0]
Anyways, The food was definitely not filling at all, and for the price you pay you should expect more. 0[0]
Our server was super nice and checked on us many times. 1[1]