#### Importing important libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from googletrans import Translator




#### importing the dataset

In [3]:
dataFrame = pd.read_csv("dioptra_reviews.csv")

In [4]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,author_gender,age_category,book_genre,star_rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas
0,0,review_1638234481,Nineteen eighty-four,English,English,In a post-Atomic War world three large states ...,English,Joseph Sparrow,male,Adult,Literary fiction,4.0,post atomic war world large state emerge story...,0,174,0
1,1,review_2022355251,Nineteen eighty-four,English,English,"1984 is not a book I would choose myself, beca...",English,Lysanne,female,Adult,Literary fiction,1.0,book choose dystopia theme like kind story lik...,0,18,0
2,2,review_1297147342,Nineteen eighty-four,English,English,"4.5. Woooow, es la primera distopía que se gan...",Spanish,L. C. Julia,unknown,Adult,Literary fiction,4.0,distopía ganar estrellar y jajaja humor ironic...,0,17,0
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,5.0,relevant reading understand future look past u...,0,21,0
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,5.0,book read ready read sure piece fall right bel...,0,26,0


## Data Cleaning

In [5]:
dataFrame.dtypes

Unnamed: 0                int64
id                       object
book_title               object
original_language        object
edition_language         object
text                     object
language                 object
author                   object
author_gender            object
age_category             object
book_genre               object
star_rating             float64
tokenised_text           object
mentions_translation      int64
n_tokens                  int64
n_translation_lemmas      int64
dtype: object

In [6]:
dataFrame.shape

(270400, 16)

In [7]:
dataFrame.apply(lambda x: x.count())

Unnamed: 0              270400
id                      270400
book_title              270400
original_language       270400
edition_language        270400
text                    270400
language                270400
author                  270400
author_gender           270400
age_category            270400
book_genre              270400
star_rating             266437
tokenised_text          270400
mentions_translation    270400
n_tokens                270400
n_translation_lemmas    270400
dtype: int64

In [8]:
dataFrame[dataFrame.isnull().any(axis=1)].count()

Unnamed: 0              3963
id                      3963
book_title              3963
original_language       3963
edition_language        3963
text                    3963
language                3963
author                  3963
author_gender           3963
age_category            3963
book_genre              3963
star_rating                0
tokenised_text          3963
mentions_translation    3963
n_tokens                3963
n_translation_lemmas    3963
dtype: int64

#### dropping null values

In [9]:
dataFrame = dataFrame.dropna()

In [10]:
dataFrame['language'].value_counts()

English       200473
Spanish        30918
Italian        11492
Portuguese      6853
French          6565
Dutch           5356
German          4780
Name: language, dtype: int64

#### dropping rows with reviews in languages other than english

In [11]:
dataFrame = dataFrame[dataFrame['language'] == 'English']

In [12]:
dataFrame['language'].value_counts()

English    200473
Name: language, dtype: int64

In [13]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,author_gender,age_category,book_genre,star_rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas
0,0,review_1638234481,Nineteen eighty-four,English,English,In a post-Atomic War world three large states ...,English,Joseph Sparrow,male,Adult,Literary fiction,4.0,post atomic war world large state emerge story...,0,174,0
1,1,review_2022355251,Nineteen eighty-four,English,English,"1984 is not a book I would choose myself, beca...",English,Lysanne,female,Adult,Literary fiction,1.0,book choose dystopia theme like kind story lik...,0,18,0
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,5.0,relevant reading understand future look past u...,0,21,0
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,5.0,book read ready read sure piece fall right bel...,0,26,0
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the `wisdom' of the twenti...,English,Jordan Forster,male,Adult,Literary fiction,3.0,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0


In [14]:
dataFrame['author_gender'].value_counts()

female           98190
unknown          42691
male             41677
mostly_female    10273
mostly_male       5247
andy              2395
Name: author_gender, dtype: int64

#### converting genders mostly_male and mostly_female to male and female respectively

In [15]:
dataFrame['author_gender'] = dataFrame['author_gender'].replace({'mostly_male': 'male', 'mostly_female': 'female'})

In [16]:
dataFrame['author_gender'].value_counts()

female     108463
male        46924
unknown     42691
andy         2395
Name: author_gender, dtype: int64

In [17]:
dataFrame = dataFrame.drop(dataFrame[dataFrame['author_gender'] == 'andy'].index)

In [18]:
dataFrame['author_gender'].value_counts()

female     108463
male        46924
unknown     42691
Name: author_gender, dtype: int64

In [19]:
dataFrame['age_category'].value_counts()

Adult          137057
Children        39736
Young adult     21285
Name: age_category, dtype: int64

In [20]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,author_gender,age_category,book_genre,star_rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas
0,0,review_1638234481,Nineteen eighty-four,English,English,In a post-Atomic War world three large states ...,English,Joseph Sparrow,male,Adult,Literary fiction,4.0,post atomic war world large state emerge story...,0,174,0
1,1,review_2022355251,Nineteen eighty-four,English,English,"1984 is not a book I would choose myself, beca...",English,Lysanne,female,Adult,Literary fiction,1.0,book choose dystopia theme like kind story lik...,0,18,0
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,5.0,relevant reading understand future look past u...,0,21,0
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,5.0,book read ready read sure piece fall right bel...,0,26,0
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the `wisdom' of the twenti...,English,Jordan Forster,male,Adult,Literary fiction,3.0,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0


In [21]:
dataFrame['star_rating'] = dataFrame['star_rating'].apply(lambda x: 1 if x > 3 else 0)
dataFrame = dataFrame.rename(columns={'star_rating': 'rating'})

In [22]:
dataFrame = dataFrame.rename(columns={'author_gender': 'gender'})
# dataFrame = dataFrame.rename(columns={'rating': 'gender'})

In [23]:
print(dataFrame['text'][4])
dataFrame['tokenised_text'][1]

This is one of those books you should not read if you are not ready. I will read it again within a few years, and I'm sure the pieces will fall right where they belong. Meanwhile, I will let it stir in my mind for as long as it takes. The best about this book is the writing. So fine. So real. Technical: I take my hat off to Simon Prebble. Excellent narrator. The quality of the audiobook is superb.


'book choose dystopia theme like kind story like story writing style descriptive difficult read summarise favourite kind book'

In [24]:
dataFrame['text'] = dataFrame['text'].str.replace('[^a-zA-Z\s]', '')
dataFrame['tokenised_text'] = dataFrame['tokenised_text'].str.replace('[^a-zA-Z\s]', '')

  dataFrame['text'] = dataFrame['text'].str.replace('[^a-zA-Z\s]', '')
  dataFrame['tokenised_text'] = dataFrame['tokenised_text'].str.replace('[^a-zA-Z\s]', '')


#### importing BertTokenizer for tokenising to textual reviews

In [25]:
from transformers import BertTokenizer

In [26]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,gender,age_category,book_genre,rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas
0,0,review_1638234481,Nineteen eighty-four,English,English,In a postAtomic War world three large states e...,English,Joseph Sparrow,male,Adult,Literary fiction,1,post atomic war world large state emerge story...,0,174,0
1,1,review_2022355251,Nineteen eighty-four,English,English,is not a book I would choose myself because o...,English,Lysanne,female,Adult,Literary fiction,0,book choose dystopia theme like kind story lik...,0,18,0
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,1,relevant reading understand future look past u...,0,21,0
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,1,book read ready read sure piece fall right bel...,0,26,0
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the wisdom of the twentiet...,English,Jordan Forster,male,Adult,Literary fiction,0,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0


In [27]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [28]:
dataFrame['bert_tokenized_text'] = dataFrame['text'].apply(lambda x: tokenizer.tokenize(x))

In [29]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,gender,age_category,book_genre,rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas,bert_tokenized_text
0,0,review_1638234481,Nineteen eighty-four,English,English,In a postAtomic War world three large states e...,English,Joseph Sparrow,male,Adult,Literary fiction,1,post atomic war world large state emerge story...,0,174,0,"[in, a, post, ##ato, ##mic, war, world, three,..."
1,1,review_2022355251,Nineteen eighty-four,English,English,is not a book I would choose myself because o...,English,Lysanne,female,Adult,Literary fiction,0,book choose dystopia theme like kind story lik...,0,18,0,"[is, not, a, book, i, would, choose, myself, b..."
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,1,relevant reading understand future look past u...,0,21,0,"[relevant, reading, for, today, to, understand..."
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,1,book read ready read sure piece fall right bel...,0,26,0,"[this, is, one, of, those, books, you, should,..."
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the wisdom of the twentiet...,English,Jordan Forster,male,Adult,Literary fiction,0,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0,"[it, is, tempting, with, the, wisdom, of, the,..."


In [30]:
dataFrame['bert_tokenized_text'][0]

['in',
 'a',
 'post',
 '##ato',
 '##mic',
 'war',
 'world',
 'three',
 'large',
 'states',
 'emerge',
 'oceania',
 'east',
 '##asia',
 'and',
 'eu',
 '##rasia',
 'our',
 'story',
 'takes',
 'place',
 'in',
 'london',
 'which',
 'is',
 'ruled',
 'by',
 'oceania',
 'the',
 'government',
 'is',
 'known',
 'as',
 'the',
 'party',
 'which',
 'has',
 'complete',
 'and',
 'utter',
 'control',
 'over',
 'its',
 'citizens',
 'enter',
 'winston',
 'smith',
 'who',
 'is',
 'a',
 'medium',
 'class',
 'citizen',
 'and',
 'works',
 'with',
 'the',
 'party',
 'his',
 'resentment',
 'towards',
 'the',
 'party',
 'continually',
 'grows',
 'as',
 'he',
 'commits',
 'crimes',
 'such',
 'as',
 'having',
 'an',
 'affair',
 'and',
 'actively',
 'avoiding',
 'the',
 'party',
 '##s',
 'security',
 'devices',
 'however',
 'in',
 'the',
 'end',
 'no',
 'one',
 'can',
 'truly',
 'escape',
 'the',
 'party',
 '##s',
 'grasp',
 'the',
 'story',
 'takes',
 'place',
 'in',
 'a',
 'd',
 '##yst',
 '##op',
 '##ic',
 'lo

In [31]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,gender,age_category,book_genre,rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas,bert_tokenized_text
0,0,review_1638234481,Nineteen eighty-four,English,English,In a postAtomic War world three large states e...,English,Joseph Sparrow,male,Adult,Literary fiction,1,post atomic war world large state emerge story...,0,174,0,"[in, a, post, ##ato, ##mic, war, world, three,..."
1,1,review_2022355251,Nineteen eighty-four,English,English,is not a book I would choose myself because o...,English,Lysanne,female,Adult,Literary fiction,0,book choose dystopia theme like kind story lik...,0,18,0,"[is, not, a, book, i, would, choose, myself, b..."
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,1,relevant reading understand future look past u...,0,21,0,"[relevant, reading, for, today, to, understand..."
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,1,book read ready read sure piece fall right bel...,0,26,0,"[this, is, one, of, those, books, you, should,..."
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the wisdom of the twentiet...,English,Jordan Forster,male,Adult,Literary fiction,0,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0,"[it, is, tempting, with, the, wisdom, of, the,..."


In [32]:
from keras.preprocessing.sequence import pad_sequences




In [33]:
# Converting tokenized text to token IDs
dataFrame['bert_tokenized_text_ids'] = dataFrame['bert_tokenized_text'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))

In [34]:
# Padding the tokenized sequences to a length of 512 which is suitable for BERT input
max_length = 512
padded_sequences = pad_sequences(dataFrame['bert_tokenized_text_ids'], maxlen=max_length, padding='post', truncating='post')
dataFrame['padded_sequences'] = list(padded_sequences)

In [35]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,gender,age_category,book_genre,rating,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas,bert_tokenized_text,bert_tokenized_text_ids,padded_sequences
0,0,review_1638234481,Nineteen eighty-four,English,English,In a postAtomic War world three large states e...,English,Joseph Sparrow,male,Adult,Literary fiction,1,post atomic war world large state emerge story...,0,174,0,"[in, a, post, ##ato, ##mic, war, world, three,...","[1999, 1037, 2695, 10610, 7712, 2162, 2088, 20...","[1999, 1037, 2695, 10610, 7712, 2162, 2088, 20..."
1,1,review_2022355251,Nineteen eighty-four,English,English,is not a book I would choose myself because o...,English,Lysanne,female,Adult,Literary fiction,0,book choose dystopia theme like kind story lik...,0,18,0,"[is, not, a, book, i, would, choose, myself, b...","[2003, 2025, 1037, 2338, 1045, 2052, 5454, 287...","[2003, 2025, 1037, 2338, 1045, 2052, 5454, 287..."
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,Literary fiction,1,relevant reading understand future look past u...,0,21,0,"[relevant, reading, for, today, to, understand...","[7882, 3752, 2005, 2651, 2000, 3305, 1996, 292...","[7882, 3752, 2005, 2651, 2000, 3305, 1996, 292..."
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,Literary fiction,1,book read ready read sure piece fall right bel...,0,26,0,"[this, is, one, of, those, books, you, should,...","[2023, 2003, 2028, 1997, 2216, 2808, 2017, 232...","[2023, 2003, 2028, 1997, 2216, 2808, 2017, 232..."
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the wisdom of the twentiet...,English,Jordan Forster,male,Adult,Literary fiction,0,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0,"[it, is, tempting, with, the, wisdom, of, the,...","[2009, 2003, 23421, 2007, 1996, 9866, 1997, 19...","[2009, 2003, 23421, 2007, 1996, 9866, 1997, 19..."


In [36]:
# Adding special tokens [CLS] and [SEP]
padded_sequences_with_special_tokens = []
for seq in dataFrame['padded_sequences']:
    padded_sequences_with_special_tokens.append([tokenizer.cls_token_id] + seq + [tokenizer.sep_token_id])

In [37]:
# Creating segment IDs (all zeros for single-sequence classification)
segment_ids = [[0] * max_length for _ in range(len(dataFrame))]

In [38]:
# Creating attention masks (1 for real tokens, 0 for padding tokens)
attention_masks = []
for seq in padded_sequences_with_special_tokens:
    mask = [1] * len(seq) + [0] * (max_length - len(seq))
    attention_masks.append(mask)

In [39]:
# Updating the DataFrame with the formatted sequences
dataFrame['input_ids'] = padded_sequences_with_special_tokens
dataFrame['segment_ids'] = segment_ids
dataFrame['attention_masks'] = attention_masks

In [40]:
dataFrame.head()

Unnamed: 0.1,Unnamed: 0,id,book_title,original_language,edition_language,text,language,author,gender,age_category,...,tokenised_text,mentions_translation,n_tokens,n_translation_lemmas,bert_tokenized_text,bert_tokenized_text_ids,padded_sequences,input_ids,segment_ids,attention_masks
0,0,review_1638234481,Nineteen eighty-four,English,English,In a postAtomic War world three large states e...,English,Joseph Sparrow,male,Adult,...,post atomic war world large state emerge story...,0,174,0,"[in, a, post, ##ato, ##mic, war, world, three,...","[1999, 1037, 2695, 10610, 7712, 2162, 2088, 20...","[1999, 1037, 2695, 10610, 7712, 2162, 2088, 20...","[2202, 1240, 2898, 10813, 7915, 2365, 2291, 22...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,review_2022355251,Nineteen eighty-four,English,English,is not a book I would choose myself because o...,English,Lysanne,female,Adult,...,book choose dystopia theme like kind story lik...,0,18,0,"[is, not, a, book, i, would, choose, myself, b...","[2003, 2025, 1037, 2338, 1045, 2052, 5454, 287...","[2003, 2025, 1037, 2338, 1045, 2052, 5454, 287...","[2206, 2228, 1240, 2541, 1248, 2255, 5657, 307...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,3,review_2067357280,Nineteen eighty-four,English,English,Relevant reading for today To understand the f...,English,Lori Ghany,female,Adult,...,relevant reading understand future look past u...,0,21,0,"[relevant, reading, for, today, to, understand...","[7882, 3752, 2005, 2651, 2000, 3305, 1996, 292...","[7882, 3752, 2005, 2651, 2000, 3305, 1996, 292...","[8085, 3955, 2208, 2854, 2203, 3508, 2199, 312...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,4,review_3122615894,Nineteen eighty-four,English,English,This is one of those books you should not read...,English,Balthazar Hawke,unknown,Adult,...,book read ready read sure piece fall right bel...,0,26,0,"[this, is, one, of, those, books, you, should,...","[2023, 2003, 2028, 1997, 2216, 2808, 2017, 232...","[2023, 2003, 2028, 1997, 2216, 2808, 2017, 232...","[2226, 2206, 2231, 2200, 2419, 3011, 2220, 252...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
6,6,review_135690649,Nineteen eighty-four,English,English,It is tempting with the wisdom of the twentiet...,English,Jordan Forster,male,Adult,...,tempt wisdom dismiss irrelevant book guilty fa...,0,408,0,"[it, is, tempting, with, the, wisdom, of, the,...","[2009, 2003, 23421, 2007, 1996, 9866, 1997, 19...","[2009, 2003, 23421, 2007, 1996, 9866, 1997, 19...","[2212, 2206, 23624, 2210, 2199, 10069, 2200, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [41]:
from transformers import BertModel, BertTokenizer
import torch

In [42]:
from transformers import TFBertModel, BertTokenizer
model = TFBertModel.from_pretrained('bert-base-uncased')




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [95]:
datadf = dataFrame.head(1000)

In [96]:
datadf.shape

(1000, 22)

In [97]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
import numpy as np
import time

In [98]:
# Split data into train and test sets
train_df, test_df = train_test_split(datadf, test_size=0.2, random_state=42)

In [99]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.df.iloc[idx]['text'],
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(self.df.iloc[idx]['rating'])
        }


In [158]:
# Define the maximum sequence length
max_length = 256

In [159]:
# Create train and test datasets
train_dataset = CustomDataset(train_df, tokenizer, max_length)
eval_dataset = CustomDataset(test_df, tokenizer, max_length)

In [160]:
# Define the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [161]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='steps',
    save_steps=500,
    eval_steps=100,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'
)

In [162]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [163]:
# Define function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    return {
        'accuracy': (predictions == labels).mean(),
        'eval_loss': eval_pred.loss,
        'eval_accuracy': (predictions == labels).mean(),  # Adding eval_accuracy
    }


In [164]:
# # Train the model
# trainer.train()

# # Evaluate the model
# results = trainer.evaluate(eval_dataset)


# Train the model
train_start_time = time.time()
trainer.train()
train_end_time = time.time()

Step,Training Loss,Validation Loss
100,0.6368,0.625693
200,0.4873,0.523401


In [165]:
# Evaluate the model
eval_start_time = time.time()
results = trainer.evaluate(eval_dataset)
eval_end_time = time.time()

In [166]:
# Calculate training time and validation time
training_time = train_end_time - train_start_time
validation_time = eval_end_time - eval_start_time

In [167]:
# Print the keys in the results dictionary
print(results.keys())

dict_keys(['eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'])


In [168]:
from sklearn.metrics import confusion_matrix, classification_report

# Calculate confusion matrix and classification report
predictions = trainer.predict(eval_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
conf_matrix = confusion_matrix(predictions.label_ids, y_pred)
class_report = classification_report(predictions.label_ids, y_pred)

# Print confusion matrix and classification report
print("Confusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Confusion Matrix:
[[ 27  43]
 [  4 126]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.39      0.53        70
           1       0.75      0.97      0.84       130

    accuracy                           0.77       200
   macro avg       0.81      0.68      0.69       200
weighted avg       0.79      0.77      0.73       200



In [169]:
# Save the model
trainer.save_model('./sentiment_model')

In [170]:
# Print metrics
print(f"Training loss: {trainer.state.global_step}, Validation loss: {results['eval_loss']}, Training time: {training_time}, Validation time: {validation_time}")


Training loss: 200, Validation loss: 0.5234012603759766, Training time: 3374.1048407554626, Validation time: 120.71433901786804
