## Installing and importing the necessary libraries


In [None]:
## Installing sentence transformers
!pip install sentence-transformers==2.7.0

Collecting sentence-transformers==2.7.0
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


In [None]:
## Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report, roc_curve, roc_auc_score, auc, make_scorer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, models
from sklearn.metrics import accuracy_score, f1_score
from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets, evaluation
from torch.utils.data import DataLoader
from matplotlib import pyplot as plt
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import GridSearchCV
import random
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import ConfusionMatrixDisplay
import warnings
warnings.filterwarnings("ignore")

## Reading in the data

In [None]:
# Load the test split of the Yelp binary-sentiment dataset into a DataFrame.
# `splits` maps each split name to its CSV file inside the dataset repository.
splits = {'train': 'train.csv', 'test': 'test.csv'}
test_csv_url = "hf://datasets/yassiracharki/Yelp_Reviews_for_Binary_Senti_Analysis/" + splits["test"]
df_test = pd.read_csv(test_csv_url)

In [None]:
# Preview the first five raw rows
df_test.head(n=5)

Unnamed: 0,class_index,review_title,review_text
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


## Classifying the reviews as numeric labels - positive as 1 and negative as 0

In [None]:
# Convert the raw labels in "class_index" to binary targets: 2 (positive) -> 1, anything else (negative) -> 0.
# The conversion only runs while a raw 2 is still present, so re-running this cell is a no-op;
# an unguarded second pass would treat the already-converted 1s as "not 2" and turn every label into 0.
# NOTE(review): assumes the raw split contains at least one positive (2) row - confirm if the data source changes.
is_positive = df_test['class_index'].eq(2)
if is_positive.any():
    df_test['class_index'] = is_positive.astype('int64')

In [None]:
# Preview the first five rows to check the converted labels
df_test.head(n=5)

Unnamed: 0,class_index,review_title,review_text
0,1,Great CD,My lovely Pat has one of the GREAT voices of h...
1,1,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,0,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,1,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,1,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


In [None]:
## Renaming "review_text" to "Text" and "class_index" to "label", then printing a summary of the frame
column_renames = {"review_text": "Text", "class_index": "label"}
df_test = df_test.rename(columns=column_renames)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   label         400000 non-null  int64 
 1   review_title  399976 non-null  object
 2   Text          400000 non-null  object
dtypes: int64(1), object(2)
memory usage: 9.2+ MB


## Data Distribution

In [None]:
# Count the rows per label, smallest class first
df_test['label'].value_counts(ascending=True)

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,200000
0,200000


In [None]:
# NOTE(review): disabled pie chart of the class distribution. It refers to `df1` and a
# "sentiment" column, neither of which is defined in the visible cells (the frame here is
# `df_test` with a "label" column) - adapt the names before re-enabling.
# df1.groupby('sentiment').size().plot(kind='pie',
#                                        y = "sentiment",
#                                        label = "Type",
#                                        autopct='%1.1f%%')

## Preparing the data for encoding

In [None]:
## Separating the review texts from the labels
# X_test keeps the texts as a pandas Series; X_test_list is the same data as a plain Python list
X_test = df_test['Text']
X_test_list = X_test.tolist()
y_test = df_test['label']

In [None]:
## Loading the fine-tuned and trained models from IMDB dataset
from google.colab import drive
import joblib

# Mount Google Drive; force_remount=True refreshes the mount if one already exists
DRIVE_MOUNT = '/content/drive'
drive.mount(DRIVE_MOUNT, force_remount=True)

# File names of the saved artefacts; the load paths below are built from these
# so the names are defined in exactly one place.
filename = 'fine_tuned_roberta_model_cosine.sav'
filename1 = 'svm_model.sav'
model_dir = DRIVE_MOUNT + '/MyDrive'

# NOTE(review): joblib.load unpickles the file and can execute arbitrary code while doing so -
# only load artefacts that you created yourself or otherwise trust.
model = joblib.load(model_dir + '/' + filename)
svm_model = joblib.load(model_dir + '/' + filename1)

Mounted at /content/drive


In [None]:
# Embed every test review with the loaded fine-tuned transformer model (from the IMDB dataset),
# showing a progress bar while the batches are processed
vect_test = model.encode(
    X_test_list,
    show_progress_bar=True,
)

Batches:   0%|          | 0/12500 [00:00<?, ?it/s]

## Running the model fine-tuned and trained on the IMDB dataset for classification

In [None]:
# Classify the encoded test reviews with the loaded SVM model and summarise the results
from sklearn import svm
from sklearn.svm import LinearSVC

# Predicted class for every encoded review
y_pred = svm_model.predict(vect_test)

# Compare the predictions with the true labels
conf_m = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, digits=4)

print("Confusion Matrix - \n",conf_m)
print("Classification report - \n",report)

# AUC computation is currently disabled:
# y_pred_proba = svm_model.predict_proba(vect_test)
# auc = roc_auc_score(y_test, y_pred_proba)
# print("AUC score - \n",auc)

Confusion Matrix - 
 [[17623  1377]
 [  612 18388]]
Classification report - 
               precision    recall  f1-score   support

           0     0.9664    0.9275    0.9466     19000
           1     0.9303    0.9678    0.9487     19000

    accuracy                         0.9477     38000
   macro avg     0.9484    0.9477    0.9476     38000
weighted avg     0.9484    0.9477    0.9476     38000

