# Predicting sentiment of smartphone reviews from Ukrainian eCommerce marketplace

# Build BERT model v2: oversample negative class to remove class imbalance in target variable

## Read clean_data and preprocess data

In [40]:
#Import necessary libraries
from datetime import datetime
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import dateparser
from datetime import timedelta
from datetime import date
import ktrain
from ktrain import text
import advertools as adv
from langdetect import detect
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

%config IPCompleter.greedy=True

#Remove limitations for displaying of tables in pandas:
#show every row and never truncate long cell values (review texts are long).
pd.set_option('display.max_rows', None)
#None is the supported way to disable column-width truncation; the former -1 sentinel
#has been deprecated since pandas 1.0 and is not accepted by newer releases.
pd.set_option('display.max_colwidth', None)

In [41]:
#Load the reviews dataset from data_clean.csv
data_clean = pd.read_csv('./reviews_data/data_clean.csv')
#Preview the first rows; a bare last expression renders the notebook's rich table
#instead of the wrapped plain-text dump produced by print()
data_clean.head()

                                                                                                 comment_link  \
0  https://rozetka.com.ua/ua/samsung_galaxy_s21_phantom_black_sm_g998bzkgsek/p272562736/comments/#id=49190491   
1  https://rozetka.com.ua/ua/samsung_galaxy_s21_phantom_black_sm_g998bzkgsek/p272562736/comments/#id=49299159   
2  https://rozetka.com.ua/ua/samsung_galaxy_s21_phantom_black_sm_g998bzkgsek/p272562736/comments/#id=49228123   
3  https://rozetka.com.ua/ua/samsung_galaxy_s21_phantom_black_sm_g998bzkgsek/p272562736/comments/#id=49212385   
4  https://rozetka.com.ua/ua/samsung_galaxy_s21_phantom_black_sm_g998bzkgsek/p272562736/comments/#id=49285480   

                                                                                                                 product_title  \
0   Мобільний телефон Samsung Galaxy S21 Ultra 12/256 GB Phantom Black (SM-G998BZKGSEK) + Сертифiкат на 4000 грн у подарунок!    
1   Мобільний телефон Samsung Galaxy S21 Ultra 12/256 GB Phan

In [42]:
#Balance the target: keep all positive reviews (review_sentiment == 1) and draw negative
#reviews (review_sentiment == 0) with replacement until both classes have the same size.
#A fixed seed makes the resampled rows identical after every kernel restart.
OVERSAMPLING_SEED = 42
sentiment_columns = ['full_text', 'review_sentiment']
pos_features = data_clean.loc[data_clean['review_sentiment'] == 1, sentiment_columns]
neg_features = data_clean.loc[data_clean['review_sentiment'] == 0, sentiment_columns]
#NOTE(review): rows are duplicated here before the train/test split performed later in this
#notebook, so copies of the same negative review can land in both the training and the test
#set and inflate test metrics; consider splitting first and oversampling only the training part.
neg_features = neg_features.sample(len(pos_features), replace=True, random_state=OVERSAMPLING_SEED)
#Display the size of the oversampled negative class (should equal the number of positive reviews)
len(neg_features)

2925

In [43]:
#Combine all positive reviews with the oversampled negative reviews into one balanced dataset.
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in pandas 2.0.
#Original row labels are kept (no ignore_index), exactly as append did.
resampled_data = pd.concat([
    data_clean[data_clean['review_sentiment']==1][['full_text', 'review_sentiment']],
    neg_features,
])

In [44]:
#One-hot encode review_sentiment into two indicator columns named '0' and '1'
#(label format required for the modeling step).
target = pd.get_dummies(resampled_data['review_sentiment'])
#Copy the indicator columns by position: first dummy column -> '0', second -> '1'
for position, column_name in enumerate(('0', '1')):
    resampled_data[column_name] = target.iloc[:, position]

In [45]:
#Summarize the resampled dataset: number of rows, columns, non-null counts and dtypes
resampled_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5850 entries, 4 to 1387
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   full_text         5850 non-null   object
 1   review_sentiment  5850 non-null   int64 
 2   0                 5850 non-null   uint8 
 3   1                 5850 non-null   uint8 
dtypes: int64(1), object(1), uint8(2)
memory usage: 148.5+ KB


In [46]:
### Preprocessing: train/test split and preprocessing pipeline with Bert mode

For training the model we will use the column full_text, which contains all text from reviews (review_text, product_advantages, product_disadvantages). 

In [47]:
#Split dataset into training and test. Test dataframe can be used later to check model performance on the unseen data.
#stratify keeps the same proportion of examples in each class in both parts as in the resampled dataset.
#random_state pins the shuffle so the same rows end up in train/test on every run (reproducible results).
df_train, df_test = train_test_split(
    resampled_data,
    test_size=0.2,
    stratify=resampled_data['review_sentiment'],
    random_state=42,
)

In [48]:
#Show the share of rows per sentiment class, first for the test split and then for the training split.
#Each table divides per-class counts by the total number of rows in that split, rounded to 2 decimals.
for split_frame in (df_test, df_train):
    rows_in_split = split_frame.shape[0]
    per_class_counts = split_frame.groupby('review_sentiment').count()
    display(round(per_class_counts / rows_in_split, 2))

Unnamed: 0_level_0,full_text,0,1
review_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.5,0.5,0.5
1,0.5,0.5,0.5


Unnamed: 0_level_0,full_text,0,1
review_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.5,0.5,0.5
1,0.5,0.5,0.5


In [49]:
#Data preprocessing with bert mode: turn the review text of df_train into model inputs.
#NOTE(review): only df_train is passed together with val_pct=0.1, so x_test/y_test here are
#presumably a 10% validation split carved out of df_train, not the held-out df_test - confirm.
bert_preprocessing_options = dict(
    label_columns=['0', '1'],   # one-hot encoded sentiment columns
    maxlen=200,
    max_features=100000,
    preprocess_mode='bert',
    val_pct=0.1,
)
train_split, validation_split, preproc = text.texts_from_df(
    df_train,
    'full_text',                # name of column containing review text
    **bert_preprocessing_options,
)
x_train, y_train = train_split
x_test, y_test = validation_split

preprocessing train...
language: ru


Is Multi-Label? False
preprocessing test...
language: ru


## Building Bert model with ktrain

In [50]:
#Initialize the pre-trained Bert classifier for the preprocessed training data,
#passing the preprocessor returned by the preprocessing step.
bert_train_data = (x_train, y_train)
model = text.text_classifier(
    name='bert',
    train_data=bert_train_data,
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 200
done.


In [52]:
#Bundle the model with its training and validation data in a ktrain Learner object;
#batches of 6 examples are used.
training_pair = (x_train, y_train)
validation_pair = (x_test, y_test)
learner = ktrain.get_learner(
    model=model,
    train_data=training_pair,
    val_data=validation_pair,
    batch_size=6,
)

In [53]:
#Set learner to use default weight decay rate (set_weight_decay is called without arguments;
#the notebook text below gives that default as 0.01).
learner.set_weight_decay()
#Display the weight decay rate now in effect. Previously the getter ran before the setter and
#its result was discarded, so the cell showed nothing and could not confirm the setting.
learner.get_weight_decay()



**Weight decay** 
Weight decay is a form of regularization; we use it to reduce overfitting caused by target class imbalance.
By default, ktrain uses no weight decay, but for our model we set the default weight decay rate of 0.01, implemented using the AdamWeightDecay optimizer.

The next two cells are commented out because training takes a lot of time and is computationally expensive. The trained model was saved and can be downloaded from https://files.fm/u/2msz9eexf (file `sentiment_prediction_v2.data-00000-of-00001`).

In [79]:
#NOTE: this whole cell is intentionally commented out - training is slow and computationally
#expensive; the output shown below comes from an earlier run. Uncomment to retrain.

#Create TensorBoard callback to use it later to analyse model performance
#tbCallBack = tf.keras.callbacks.TensorBoard(log_dir='/tmp/Tensor_Board', histogram_freq=0, write_graph=True, write_images=True)

#Training the model with the one-cycle policy: max learning rate 1e-5, 3 epochs
#learner.fit_onecycle(lr=1e-5,
#                     epochs=3,
#                    callbacks=[tbCallBack])



begin training using onecycle policy with max lr of 1e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x18d006f8ac0>

In [107]:
#Save trained model weights under the prefix "sentiment_prediction_v2".
#Intentionally commented out so that re-running the notebook without retraining
#does not overwrite the previously saved weights; uncomment after a fresh training run.
#learner.model.save_weights("sentiment_prediction_v2")