### Sentiment Analysis with BERT (transfer learning, without fine-tuning)
#### data source: Amazon Fine Food Reviews
#### Ricardo Flores

### Step 0: Cargar librerías

In [1]:
import pandas as pd
import numpy as np
import random
import re

from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# plots
import seaborn as sns
import matplotlib.pyplot as plt

# BERT (transformers)
from transformers import pipeline



In [3]:
def metrics_report(y_test, y_pred):
    """Score a set of predictions against the ground-truth labels.

    Each scikit-learn scorer is called with its default arguments.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        A 4-tuple ``(precision, recall, f1, accuracy)``.
    """
    # Order matters: callers read the result positionally.
    scorers = (precision_score, recall_score, f1_score, accuracy_score)
    return tuple(scorer(y_test, y_pred) for scorer in scorers)

### Step 1: Datos

In [4]:
# Read the raw review table from the local CSV file.
df_review = pd.read_csv(filepath_or_buffer='./data/Reviews.csv')

In [5]:
# Select relevant columns.
# The explicit .copy() makes df_review an independent frame rather than a
# subset of the loaded one, so assigning a new column to it afterwards does
# not raise pandas' SettingWithCopyWarning.
df_review = df_review[['Id', 'Score', 'Text']].copy()
df_review

Unnamed: 0,Id,Score,Text
0,127276,5,There is something to be said about this candy...
1,395444,5,"In many languages around the world, ""chai"" sim..."
2,225935,1,"Review of Asian Taste Dried Mushroom, 5-Ounce ..."
3,562268,5,Great flavor. I have always ordered Blue Diamo...
4,491584,3,The product came with a crushed box but was OK...
...,...,...,...
5680,407146,4,For the amount of money spent on this product ...
5681,323379,5,I love this product. It's really a great way t...
5682,455703,5,Wow! This jerky is delicious! I have purchased...
5683,563011,5,The soft baked cookies are amazingly delicious...


In [7]:
# Create binary sentiment labels from the star rating.
# A score above 3 is positive (1); everything else, including a score of
# exactly 3, is negative (0).
# The comparison is vectorised over the whole column instead of looping over
# rows in Python; `label` is still exposed as a plain list of ints.
label = (df_review["Score"] > 3).astype("int64").tolist()
df_review["Label"] = label
df_review

Unnamed: 0,Id,Score,Text,Label
0,127276,5,There is something to be said about this candy...,1
1,395444,5,"In many languages around the world, ""chai"" sim...",1
2,225935,1,"Review of Asian Taste Dried Mushroom, 5-Ounce ...",0
3,562268,5,Great flavor. I have always ordered Blue Diamo...,1
4,491584,3,The product came with a crushed box but was OK...,0
...,...,...,...,...
5680,407146,4,For the amount of money spent on this product ...,1
5681,323379,5,I love this product. It's really a great way t...,1
5682,455703,5,Wow! This jerky is delicious! I have purchased...,1
5683,563011,5,The soft baked cookies are amazingly delicious...,1


In [8]:
# Hold out 30% of the reviews for testing. The split is stratified on the
# label and seeded so it is reproducible.
data = df_review['Text']
y = df_review['Label']

X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    stratify=y,
    random_state=124,
)

# Report the size of each partition.
print("Train data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)

Train data: (3979,) (3979,)
Test data: (1706,) (1706,)


In [9]:
# Balance the training set by upsampling the minority class.
# NOTE(review): no later cell visible in this notebook reads the upsampled
# X_train / y_train -- confirm whether this step is still needed.
train = pd.DataFrame({'text': X_train, 'label': y_train})

# Class sizes decide which label is the majority (ties go to label 1).
ones = int((train['label'] == 1).sum())
zeros = int((train['label'] == 0).sum())
majority, minority = (1, 0) if ones >= zeros else (0, 1)

# Partition the training rows by class.
train_majority = train[train['label'] == majority]
train_minority = train[train['label'] == minority]

# Draw minority rows with replacement until both classes have the same size;
# the fixed seed keeps the draw reproducible.
train_minority_upsampled = resample(
    train_minority,
    replace=True,
    n_samples=len(train_majority),
    random_state=42,
)

# Stack the majority rows on top of the upsampled minority rows.
train = pd.concat([train_majority, train_minority_upsampled])
X_train, y_train = train['text'], train['label']
print("Train data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)

Train data: (6192,) (6192,)
Test data: (1706,) (1706,)


### Step 2: Modelo

In [10]:
# Build a sentiment-analysis pipeline from the named checkpoint.
model_pipline = 'distilbert-base-uncased-finetuned-sst-2-english'
classifier = pipeline(task='sentiment-analysis', model=model_pipline)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [11]:
# Slow cell (author's estimate: about one minute).
# Each review is cut to its first `max_length` elements before classification.
# NOTE(review): assuming the reviews are str, this truncates by characters,
# not by model tokens -- confirm that is the intended limit.
max_length = 512
Label_pred = []
Score_pred = []
for review_text in X_test:
    # Only the first entry of the classifier's result is used.
    top_prediction = classifier(review_text[:max_length])[0]
    Label_pred.append(top_prediction['label'])
    Score_pred.append(top_prediction['score'])

In [12]:
# Map the predicted text labels to integers: 'POSITIVE' -> 1, anything else -> 0.
y_pred = [int(predicted == 'POSITIVE') for predicted in Label_pred]

### Step 3: Resultados

In [13]:
# Score the predictions against the held-out labels.
y = y_test.tolist()
metrics1 = metrics_report(y, y_pred)

# metrics_report returns (precision, recall, f1, accuracy) in that order.
p1, r1, f1, ac1 = metrics1


In [14]:
# Summarise the scores in a one-row table.
# The model name is the checkpoint identifier up to its first hyphen.
m1 = model_pipline.split('-', 1)[0]
d = {
    'Model': [m1],
    'Precision': [p1],
    'Recall': [r1],
    'F1': [f1],
    'Accuracy': [ac1],
}
Bert1_results = pd.DataFrame(data=d)
Bert1_results

Unnamed: 0,Model,Precision,Recall,F1,Accuracy
0,distilbert,0.939056,0.823795,0.877657,0.821219
