In [1]:
from bs4 import BeautifulSoup
import requests  
import numpy as np
import pandas as pd
from langdetect import detect
import re
import pickle
import torch
from string import punctuation 
import nltk
import nltk.data
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

In [2]:
#importing libraries for models and nlp tasks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
import datasets
from datasets import load_dataset
from datasets import load_metric

from sklearn.model_selection import train_test_split


from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer


from transformers import AutoModelForSequenceClassification


In [4]:
tfidf_vectorizer=pickle.load(open('../tfidfvectors/tfidf_vect_clean.pkl','rb'))


In [5]:
test_model_lr_imb=pickle.load(open('../models/lr_mn_clean.pkl','rb'))
test_model_lr_cw=pickle.load(open('../models/lr_mn_clean_cw.pkl','rb'))

In [6]:
emotion = pd.read_csv('../labels_prediction/emotions.csv')
dic_emotions=emotion.to_dict('series')

print(dic_emotions['emotion'])


0     sadness
1         joy
2        love
3       anger
4        fear
5    surprise
Name: emotion, dtype: object


#### Webscraping goodreads website for getting reviews of a book
##### To get the link for the required book 

In [7]:
data = {'q': "The Razor's Edge"}
book_url = "https://www.goodreads.com/search"
req = requests.get(book_url, params=data)

book_soup = BeautifulSoup(req.text, 'html.parser')

titles=book_soup.find_all('a', class_ = 'bookTitle')
title=[]
link=[]
for bookname in titles:
    title.append(bookname.get_text())
    link.append(bookname['href'])

##### From all the links first link is the most closest search 

In [8]:
rev="http://goodreads.com"+link[0]
rev_url = requests.get(rev)
rev_soup=BeautifulSoup(rev_url.content, 'html.parser')

##### Getting reviews from the web page of the book

In [9]:
rev_list=[]
for x in rev_soup.find_all("section", {"class": "ReviewText"}):
    rev_list.append(x.text)

In [10]:
df=pd.DataFrame(rev_list, columns=['reviews'])
df

Unnamed: 0,reviews
0,\nIn all big cities there are self-contained g...
1,"Books like this, that I’ve read so long ago in..."
2,(Book 570 From 1001 Books) - The Razor’s Edge ...
3,The best novel I've read since joining Goodrea...
4,ASPRO IL CAMMINO VERSO LA SALVEZZAIl primo fil...
5,"Oh, Mr. Maugham, there are moments when I love..."
6,In 1919 war hero Larry (Laurence) Darrell retu...
7,Tracing the intimate lives of representative B...
8,This has to be the most endearing and accessib...
9,"A Timeless, stirring drama, scaling the height..."


##### From all the languages in the reviews, selecting the english language reviews

In [11]:
def detect_en(text):
    try:
        return detect(text) == 'en'
    except:
        return False

In [12]:
df = df[df['reviews'].apply(detect_en)]
df=df.reset_index()
df

Unnamed: 0,index,reviews
0,0,\nIn all big cities there are self-contained g...
1,1,"Books like this, that I’ve read so long ago in..."
2,3,The best novel I've read since joining Goodrea...
3,5,"Oh, Mr. Maugham, there are moments when I love..."
4,6,In 1919 war hero Larry (Laurence) Darrell retu...
5,7,Tracing the intimate lives of representative B...
6,8,This has to be the most endearing and accessib...
7,9,"A Timeless, stirring drama, scaling the height..."
8,10,"In Asian countries, the custom of “home leavin..."
9,11,"It took me a long time to read this book, this..."


In [13]:
dataset=df.copy()

##### Cleaning the text

In [14]:
def text_cleaning(text):
   
    text=re.sub("\(.*?\)","",text)

    text = re.sub(r"[^A-Za-z]", " ", str(text))
    
     #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)

    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # Remove punctuation from text
    text = "".join([c for c in text if c not in punctuation])
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.split()
    text = [w for w in text if not w in stopwords]
    text = " ".join(text)
        
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)
    text=text.lower()
    
    return text 

In [15]:
df['cleaned_review'] = df['reviews'].apply(lambda x: text_cleaning(x))
df = df[df['cleaned_review'].map(len) > 0]


In [16]:
df

Unnamed: 0,index,reviews,cleaned_review
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...


##### Testing the reviews data for emotions using model

In [17]:
test_tfidf = tfidf_vectorizer.transform(df['cleaned_review'])

ytest_pred_imb=test_model_lr_imb.predict(test_tfidf)
ytest_pred_cw=test_model_lr_cw.predict(test_tfidf)

In [18]:
df['predicted_label_imb']=ytest_pred_imb
df['predicted_label_cw']=ytest_pred_cw

In [19]:
df['predicted_emotion_imb'] = df['predicted_label_imb'].map(dic_emotions['emotion'])
df['predicted_emotion_cw'] = df['predicted_label_cw'].map(dic_emotions['emotion'])


In [20]:
percentage_emotions=(df['predicted_emotion_cw'].value_counts(normalize=True)*100).to_dict()
type(percentage_emotions)

dict

In [21]:

percentage_emotions = {k: int(round(v, 0)) for k, v in percentage_emotions.items()}
percentage_emotions


{'joy': 45, 'surprise': 23, 'sadness': 9, 'love': 9, 'anger': 9, 'fear': 5}

BERT fine tuned model

In [22]:
bert_model=AutoModelForSequenceClassification.from_pretrained('../models/bert_finetuned_model2')
trainer=Trainer(bert_model)

tokenizer=AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer

loading configuration file config.json from cache at /Users/phanisingaraju/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/phanisingaraju/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0

PreTrainedTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [23]:
test_dataset = datasets.Dataset.from_dict(dataset)


In [24]:
my_dataset_dict = datasets.DatasetDict({"test":test_dataset})


In [25]:
my_dataset_dict

DatasetDict({
    test: Dataset({
        features: ['index', 'reviews'],
        num_rows: 22
    })
})

In [26]:
def tokenize_data(example):
    return tokenizer(example['reviews'],truncation=True, padding='max_length')

my_dataset_dict = my_dataset_dict.map(tokenize_data, batched=True)
#my_dataset_dict = my_dataset_dict.map(remove_columns='index')
my_dataset_dict

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    test: Dataset({
        features: ['index', 'reviews', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 22
    })
})

In [27]:
predicted_results=trainer.predict(my_dataset_dict['test'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: reviews, index. If reviews, index are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 22
  Batch size = 8


In [28]:
predicted_results.predictions.shape

(22, 6)

In [29]:
predicted_labels = predicted_results.predictions.argmax(-1) # Get the highest probability prediction
predicted_labels = predicted_labels.flatten().tolist() 
predicted_labels

[5, 3, 1, 1, 1, 1, 1, 0, 1, 2, 1, 1, 3, 0, 1, 1, 1, 1, 3, 1, 0, 5]

In [30]:
df['predicted_label_bert']=predicted_labels

In [31]:
df['predicted_emotion_bert'] = df['predicted_label_bert'].map(dic_emotions['emotion'])


In [32]:
df

Unnamed: 0,index,reviews,cleaned_review,predicted_label_imb,predicted_label_cw,predicted_emotion_imb,predicted_emotion_cw,predicted_label_bert,predicted_emotion_bert
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...,1,5,joy,surprise,5,surprise
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...,1,5,joy,surprise,3,anger
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...,1,1,joy,joy,1,joy
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...,4,4,fear,fear,1,joy
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...,1,1,joy,joy,1,joy
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...,1,1,joy,joy,1,joy
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...,1,1,joy,joy,1,joy
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...,0,0,sadness,sadness,0,sadness
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...,1,5,joy,surprise,1,joy
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...,1,2,joy,love,2,love


In [33]:
df.columns

Index(['index', 'reviews', 'cleaned_review', 'predicted_label_imb',
       'predicted_label_cw', 'predicted_emotion_imb', 'predicted_emotion_cw',
       'predicted_label_bert', 'predicted_emotion_bert'],
      dtype='object')

Comparing the predictions from all the three models

In [34]:
compare=df[['reviews','cleaned_review','predicted_emotion_imb', 'predicted_emotion_cw','predicted_emotion_bert']]
compare

Unnamed: 0,reviews,cleaned_review,predicted_emotion_imb,predicted_emotion_cw,predicted_emotion_bert
0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...,joy,surprise,surprise
1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...,joy,surprise,anger
2,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...,joy,joy,joy
3,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...,fear,fear,joy
4,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...,joy,joy,joy
5,Tracing the intimate lives of representative B...,tracing intimate life representative british a...,joy,joy,joy
6,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...,joy,joy,joy
7,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...,sadness,sadness,sadness
8,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...,joy,surprise,joy
9,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...,joy,love,love


In [146]:
df.groupby(['predicted_emotion_imb'])['predicted_emotion_imb'].count() 

predicted_emotion_imb
fear         1
joy         17
sadness      3
surprise     1
Name: predicted_emotion_imb, dtype: int64

In [147]:
df.groupby(['predicted_emotion_cw'])['predicted_emotion_cw'].count() 

predicted_emotion_cw
anger        2
fear         1
joy         10
love         2
sadness      2
surprise     5
Name: predicted_emotion_cw, dtype: int64

In [148]:
df.groupby(['predicted_emotion_bert'])['predicted_emotion_bert'].count() 

predicted_emotion_bert
anger        3
joy         13
love         1
sadness      3
surprise     2
Name: predicted_emotion_bert, dtype: int64

In [35]:
df['reviews'][0]

"\nIn all big cities there are self-contained groups that exist without intercommunication, small worlds within a greater world that lead their lives, their members dependent upon one another for companionship, as though they inhabited islands separated from each other by an unnavigable strait. Of no city, in my experience, is this more true than of Paris.\n4 ½ stars. I liked this book a lot. Much more than Maugham's Of Human Bondage, but not quite as much as The Painted Veil. The first person minor perspective works really well here. Maugham inserts himself into the story, but mostly exists as an observer and messenger, retelling the tales passed to him by Elliott, Larry, Isabel and others. I think this is one of the most interesting narrative structures-- we are still a part of the story, not looking down on it, but we are also an outsider peering in. It seems to be exactly the right amount of proximity and distance to suit me.I also like Maugham best when he is fondly mocking human 

In [36]:
df['reviews'][1]

'Books like this, that I’ve read so long ago in my past, come back even now to haunt me, like the lilting, plaintive refrain of an old Beatles Love song!But I only started it in the mid-seventies. Even back then, working in soulless offices, I needed to replenish my heart in long, lingering draughts. So how did I do that?If you guessed by hanging around bookstores you nailed it! There was a Centretown bookshop of irregular modern architectural design right at the hub of the nearby city - my wonderful Dad used to cuss and call it Confusion Square - a hub which would have been ironically termed the heart of the city.Cause it wasn’t. Postmodern cities are quite heartless, as their great refabricator Henri Lefebvre used to say.No: the heart of the city was its bookstores - this one, Classics (my fave chain back then) - and W.H. Smith, Coles, Prospero and later Chapters, all within that two- or three-block epicentre.Books, as I say, replenish my heart. Always have - ever since that halcyon 

In [37]:
df['reviews'][3]

'Oh, Mr. Maugham, there are moments when I love you so much I could burst. Moments when I wish there were a six star rating, so I could put it into your hands and say "I got that part and it resonated with me." Moments when I want to say, "enough of that, get back to the story", only to find That is the story, That is the heart.This novel made me wish to live in the post WWI twenties and have endless possibilities open to me. It made me examine the life I have lived and wonder if I couldn\'t have gotten more out of it if I had been bolder or less worried.It\'s strange how many people suffer from it (fear). I don\'t mean fear of closed spaces and fear of heights, but fear of death and, what\'s worse, fear of life. Often they\'re people who seem in the best of health, prosperous, without any worry, and yet they\'re tortured by it. I\'ve sometimes thought it was the most besetting humour of men, and I asked myself at one time if it was due to some deep animal instinct that man has inherit