# Sentiment Classification Model by Google apps reviews data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [None]:
GoogleData = pd.read_csv("/content/drive/MyDrive/NLP/Summer Project/sentiment analysis/googleplaystore_user_reviews.csv")
GoogleData = GoogleData.dropna()
GoogleData.reset_index(drop=True, inplace=True)
GoogleData.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
3,10 Best Foods for You,Best idea us,Positive,1.0,0.3
4,10 Best Foods for You,Best way,Positive,1.0,0.3


In [None]:
df = pd.DataFrame()
df['id'] = [i for i in range(len(GoogleData))]
label_list = []
for i in range(len(GoogleData)):
    if GoogleData['Sentiment'][i]=='Negative':
        label_list.append(0)
    elif GoogleData['Sentiment'][i]=='Positive':
        label_list.append(2)
    else:
        label_list.append(1)
        
df['category'] = label_list   
df['text'] = GoogleData['Sentiment']

df.set_index('id', inplace=True)

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.category.values, 
                                                  test_size=0.15, 
                                                  random_state=42,
                                                  stratify=df.category.values)

df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'
df.groupby(['category', 'data_type']).count()

In [None]:
df.head()

Unnamed: 0_level_0,category,text,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,Positive,train
1,2,Positive,train
2,2,Positive,train
3,2,Positive,train
4,2,Positive,train


In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)

In [None]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].category.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].category.values)

dataset_train = TensorDataset(input_ids_train, 
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val, 
                            attention_masks_val,
                           labels_val)



In [None]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [None]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = 3,
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )


batch_size = 4

dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    sampler=RandomSampler(dataset_val),
    batch_size=32
)


optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)


epochs = 5

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)


cuda


In [None]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [None]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    
    progress_bar = tqdm(dataloader_train, 
                        desc='Epoch {:1d}'.format(epoch), 
                        leave=False, 
                        disable=False)
    
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        
        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total +=loss.item()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})     
    
    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')
    
    tqdm.write('\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/7953 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.005781827697304348


  0%|          | 0/176 [00:00<?, ?it/s]

Validation loss: 4.375667660125063e-06
F1 Score (weighted): 1.0


Epoch 2:   0%|          | 0/7953 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 1.8729402772768317e-06


  0%|          | 0/176 [00:00<?, ?it/s]

Validation loss: 1.0199533530894846e-07
F1 Score (weighted): 1.0


Epoch 3:   0%|          | 0/7953 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 5.030757285761207e-08


  0%|          | 0/176 [00:00<?, ?it/s]

Validation loss: 0.0
F1 Score (weighted): 1.0


Epoch 4:   0%|          | 0/7953 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 3.747305496207528e-11


  0%|          | 0/176 [00:00<?, ?it/s]

Validation loss: 0.0
F1 Score (weighted): 1.0


Epoch 5:   0%|          | 0/7953 [00:00<?, ?it/s]


Epoch {epoch}
Training loss: 0.0


  0%|          | 0/176 [00:00<?, ?it/s]

Validation loss: 0.0
F1 Score (weighted): 1.0


In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/NLP/Summer Project/sentiment analysis/Bert_weights_sentiment_analysis.pth')

In [None]:
label_dict = {'Negative':0, 'Neutral':1, 'Positive':2}

In [None]:
accuracy_per_class(predictions, true_vals)

# Prediction

In [None]:
import torch
from transformers import BertForSequenceClassification

In [None]:
model = BertForSequenceClassification.from_pretrained(
                                      'bert-base-uncased', 
                                      num_labels = 3,
                                      output_attentions = False,
                                      output_hidden_states = False
                                     )
model.load_state_dict(torch.load('/content/drive/MyDrive/NLP/Summer Project/sentiment analysis/Bert_weights_sentiment_analysis.pth'))
model = model.to('cuda') 
device = 'cuda'

batchsize = 256
batchnum = int(len(GoogleData)/batchsize)+1

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassificatio

In [None]:
p_results = []
prediction_results = []
for i in range(batchnum):
    if i==(batchnum-1):
        n = len(GoogleData)%batchsize
    else:
        n = batchsize
        
    encoded_data_pred = tokenizer.batch_encode_plus(
        GoogleData['Translated_Review'][(i*batchsize):(i*batchsize+n)].values,
        add_special_tokens=True,
        return_attention_mask=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )

    input_ids_pred = encoded_data_pred['input_ids']
    attention_masks_pred = encoded_data_pred['attention_mask']
    
    batch = tuple(b.to(device) for b in encoded_data_pred.values())
    inputs = {
            'input_ids': batch[0],
            'token_type_ids': batch[1],
            'attention_mask': batch[2],
            }

    with torch.no_grad():        
        outputs = model(**inputs)

    logits = outputs[0]
    softmax = torch.nn.Softmax(dim=1)
    p_result = softmax(logits).detach().cpu().numpy()
    p_results = p_results + list(p_result[:,1])

    prediction_result = [np.argmax(c) for c in p_result]
    prediction_results = prediction_results + prediction_result
    



In [None]:
GoogleData["sentiment score"] = p_results
GoogleData["sentiment prediction"] = prediction_results

In [None]:
GoogleData.to_excel('/content/drive/MyDrive/NLP/Summer Project/sentiment analysis/GoogleDataWithSentimentPrediction.xlsx')