In [1]:
from bs4 import BeautifulSoup
import requests  
import numpy as np
import pandas as pd
from langdetect import detect
import re
import pickle
from string import punctuation 
import nltk
import nltk.data
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

#### Web scraping Goodreads to collect the reviews of a book
##### Search for the book to get the link to its page

In [2]:
# Query the Goodreads search page for the book title.
data = {'q': "The Razor's Edge"}
book_url = "https://www.goodreads.com/search"
# A timeout keeps the cell from hanging forever on a stalled connection.
req = requests.get(book_url, params=data, timeout=30)
# Fail here on an HTTP error instead of silently parsing an error page.
req.raise_for_status()

book_soup = BeautifulSoup(req.text, 'html.parser')

# Each <a class="bookTitle"> element contributes its text and its href.
titles = book_soup.find_all('a', class_='bookTitle')
title = [bookname.get_text() for bookname in titles]
link = [bookname['href'] for bookname in titles]

##### The first link in the search results is the closest match

In [3]:
# The first search hit is used as the book page; stop with a clear message
# when the search produced no links instead of a bare IndexError.
if not link:
    raise ValueError("Goodreads search returned no 'bookTitle' links")
# Same scheme and host as the search request (HTTPS, www.goodreads.com).
rev = "https://www.goodreads.com" + link[0]
rev_url = requests.get(rev, timeout=30)
# Fail on an HTTP error rather than parsing an error page for reviews.
rev_url.raise_for_status()
rev_soup = BeautifulSoup(rev_url.content, 'html.parser')

##### Getting reviews from the web page of the book

In [4]:
# Collect the text of every <section class="ReviewText"> on the book page.
review_sections = rev_soup.find_all("section", {"class": "ReviewText"})
rev_list = [section.text for section in review_sections]

In [5]:
# One row per scraped review, held in a single 'Reviews' column.
df = pd.DataFrame(data=rev_list, columns=['Reviews'])
df

Unnamed: 0,Reviews
0,\nIn all big cities there are self-contained g...
1,"Books like this, that I’ve read so long ago in..."
2,(Book 570 From 1001 Books) - The Razor’s Edge ...
3,The best novel I've read since joining Goodrea...
4,ASPRO IL CAMMINO VERSO LA SALVEZZAIl primo fil...
5,"Oh, Mr. Maugham, there are moments when I love..."
6,In 1919 war hero Larry (Laurence) Darrell retu...
7,Tracing the intimate lives of representative B...
8,This has to be the most endearing and accessib...
9,"A Timeless, stirring drama, scaling the height..."


##### The reviews are written in several languages; keep only the English ones

In [6]:
def detect_en(text):
    """Return True when ``detect`` classifies ``text`` as English ('en').

    Non-string values, empty or whitespace-only strings, and any text on
    which ``detect`` raises all yield False, so the function can be used
    directly as a boolean row filter.
    """
    # Nothing to classify: answer without calling the detector.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'en'
    except Exception:
        # Best-effort filter: a detection failure means "not English".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate so the cell can still be interrupted.
        return False

In [7]:
# Keep the rows flagged as English; reset_index() renumbers the rows and
# stores the previous row labels in an 'index' column.
is_english = df['Reviews'].apply(detect_en)
df = df[is_english].reset_index()
df

Unnamed: 0,index,Reviews
0,0,\nIn all big cities there are self-contained g...
1,1,"Books like this, that I’ve read so long ago in..."
2,3,The best novel I've read since joining Goodrea...
3,5,"Oh, Mr. Maugham, there are moments when I love..."
4,6,In 1919 war hero Larry (Laurence) Darrell retu...
5,7,Tracing the intimate lives of representative B...
6,8,This has to be the most endearing and accessib...
7,9,"A Timeless, stirring drama, scaling the height..."
8,10,"In Asian countries, the custom of “home leavin..."
9,11,"It took me a long time to read this book, this..."


In [41]:
#df.to_csv("razorsedge.csv",index=False,header=False)

##### Cleaning the text

In [8]:
# Raw-string patterns, compiled once. The non-raw "\(" form used previously
# relies on an invalid escape sequence.
_PARENTHETICAL_RE = re.compile(r"\(.*?\)")
_NON_LETTER_RE = re.compile(r"[^A-Za-z]")
_ENGLISH_STOPWORDS = None  # populated on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_cleaning(text, lowercase_first=False):
    """Normalise a review into a lower-case, space-separated string of lemmas.

    Steps, in order:
      1. coerce the input to ``str``;
      2. drop parenthesised spans (non-greedy, within a single line);
      3. replace every character that is not an ASCII letter with a space;
      4. split on whitespace and drop NLTK English stop words;
      5. lemmatise each remaining word with ``WordNetLemmatizer``;
      6. join with single spaces and lower-case the result.

    Parameters
    ----------
    text : object
        The raw review; converted with ``str()`` before any processing.
    lowercase_first : bool, default False
        When False the result matches the previous implementation for string
        input: stop words are matched case-sensitively and words are
        lemmatised in their original case. When True the text is lower-cased
        before steps 4-5.
        NOTE(review): with the default, capitalised stop words survive and
        capitalised words appear not to be lemmatised (the notebook output
        keeps "the", "in", "books"). Confirm how the training data for the
        pickled models was cleaned before enabling this flag.

    Returns
    -------
    str
        The cleaned text.
    """
    # Coerce first so non-string values do not fail inside the first regex.
    text = str(text)
    text = _PARENTHETICAL_RE.sub("", text)
    # After this substitution only ASCII letters and spaces remain, so no
    # separate tag, digit or punctuation removal is needed.
    text = _NON_LETTER_RE.sub(" ", text)
    if lowercase_first:
        text = text.lower()

    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return " ".join(lemmas).lower()

In [9]:
# Clean every review; the function is passed directly, no lambda wrapper needed.
df['cleaned_review'] = df['Reviews'].apply(text_cleaning)


In [10]:
# Display the frame with the new 'cleaned_review' column beside the raw reviews.
df

Unnamed: 0,index,Reviews,cleaned_review
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...


##### Predicting the emotion of each review with the trained models

In [11]:
#importing libraries for models and nlp tasks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

##### Emotions model (class imbalance in training data)

In [12]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once the object has been loaded.
with open('../models/tfidf_vect.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer1 = pickle.load(vectorizer_file)

In [13]:
# Vectorise the cleaned reviews with the first loaded vectoriser.
cleaned_reviews = df['cleaned_review']
test_tfidf1 = tfidf_vectorizer1.transform(cleaned_reviews)

In [14]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once the object has been loaded.
with open('../models/lr_mn.pkl', 'rb') as model_file:
    test_model_lr1 = pickle.load(model_file)

In [15]:
# Predict one label per review; the bare name on the last line displays the array.
ytest_pred1 = test_model_lr1.predict(test_tfidf1)
ytest_pred1

array([1, 1, 1, 4, 1, 1, 1, 0, 1, 2, 1, 1, 3, 0, 1, 1, 1, 5, 0, 1, 0, 1])

In [16]:
# Append the first model's predicted labels as a new column.
df = df.assign(Predicted_label_imbalance=ytest_pred1)

In [17]:
# Table of emotion names used to decode the first model's labels.
emotion1 = pd.read_csv(filepath_or_buffer='../models/emotions.csv')


In [18]:
# Column name -> Series mapping; display the 'Emotion' Series (row label -> emotion name).
dic_emotions1 = emotion1.to_dict(orient='series')
dic_emotions1['Emotion']

0     sadness
1         joy
2        love
3       anger
4        fear
5    surprise
Name: Emotion, dtype: object

In [19]:
# Translate each predicted label into its emotion name via the 'Emotion' Series.
emotion_by_label1 = dic_emotions1['Emotion']
df['Predicted_emotion_imbalance'] = df['Predicted_label_imbalance'].map(emotion_by_label1)


##### Undersampling model

In [20]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once the object has been loaded.
with open('../models/tfidf_vect_undersampling.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer2 = pickle.load(vectorizer_file)

In [21]:
# Vectorise the cleaned reviews with the second loaded vectoriser.
cleaned_reviews = df['cleaned_review']
test_tfidf2 = tfidf_vectorizer2.transform(cleaned_reviews)

In [22]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle once the object has been loaded.
with open('../models/lr_neutral.pkl', 'rb') as model_file:
    test_model_lr2 = pickle.load(model_file)

In [23]:
# Predict one label per review; the bare name on the last line displays the array.
ytest_pred2 = test_model_lr2.predict(test_tfidf2)
ytest_pred2

array([6, 6, 1, 4, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 6, 6, 5, 6, 6, 6, 6])

In [24]:
# Append the second model's predicted labels as a new column.
df = df.assign(Predicted_label_undersampling=ytest_pred2)

In [25]:
# Table of emotion names used to decode the second model's labels.
emotion2 = pd.read_csv(filepath_or_buffer='../models/emotions_neutral.csv')


In [26]:
# Column name -> Series mapping; display the 'Emotion' Series (row label -> emotion name).
dic_emotions2 = emotion2.to_dict(orient='series')
dic_emotions2['Emotion']

0     sadness
1         joy
2        love
3       anger
4        fear
5    surprise
6     neutral
Name: Emotion, dtype: object

In [27]:
# Translate each predicted label into its emotion name via the 'Emotion' Series.
emotion_by_label2 = dic_emotions2['Emotion']
df['Predicted_emotion_undersampling'] = df['Predicted_label_undersampling'].map(emotion_by_label2)


In [28]:
# Display the reviews together with the labels and emotions predicted by both models.
df

Unnamed: 0,index,Reviews,cleaned_review,Predicted_label_imbalance,Predicted_emotion_imbalance,Predicted_label_undersampling,Predicted_emotion_undersampling
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...,1,joy,6,neutral
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...,1,joy,6,neutral
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...,1,joy,1,joy
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...,4,fear,4,fear
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...,1,joy,6,neutral
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...,1,joy,6,neutral
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...,1,joy,6,neutral
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...,0,sadness,6,neutral
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...,1,joy,5,surprise
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...,2,love,6,neutral


In [38]:
# Full text of the review at row label 7 (single label-based lookup, no chained indexing).
df.loc[7, 'Reviews']

'A Timeless, stirring drama, scaling the heights of ecstasy to the dregs of utter despair."The sharp edge of a razor is difficult to pass over; thus the wise say the path to Salvation is hard." (Paraphrased from the Katha Upanishad)Larry Darrell is a likable fellow, engaged to young socialite, Isabel Bradley. Larry goes off to war, but returns a changed man. He breaks his engagement to Isabel and leaves his former life behind, and sets off on a series of spiritual quests. (My teen self fell in love with the ideal that was Larry Darrell!)Larry Darrell was as close to Nirvana as a human could be, according to the narrator in this story. I just saw Larry as a simple, decent person who took life as it came and made the best of things, refusing to be sucked into the sham that was success and social status.There were a few somewhat decent movie adaptations of this book, but I was totally upset when comedian Bill Murray was cast in the part of the luminous Larry Darrell. Talk about miscasting

In [39]:
# Full text of the review at row label 9 (single label-based lookup, no chained indexing).
df.loc[9, 'Reviews']

'It took me a long time to read this book, this beautiful book, this excellent book. I took time because every sentence deserves to would read carefully. It is indeed serving by subtle prose, sought after in its simplicity.The stories are complete. They demonstrate how each life carries a greater or lesser share of tragedy and ridicule; happiness cannot be an exact science. However, it is happy that each has its definition: it can hide in futility like the Absolute. They also allow you to position yourself facing each of the characters described in this beautiful book, this lovely book.'

In [40]:
# Full text of the review at row label 12 (single label-based lookup, no chained indexing).
df.loc[12, 'Reviews']

"I didn't love it as much as I expected. The premise that Eastern philosophy has something to offer us in the West just isn't as novel as when this book was originally published. Maugham's description of upper crust society in Paris is bitchy and wonderfully astute at times. But, like most authors, he found it easier to describe the sinners than the saints. Larry Darrell, the saint of this book, just doesn't seem human or interesting. He and his quest for enlightenment and/or belief in God are one big yawnfest. All Maugham can do is describe Larry's scintillating eyes and his smile over and over and over again and by the end of the book, even Maugham is apologizing for that. Also, Maugham allows himself to be the first person narrator and, as such, does more than his fair share of self-aggrandizing in the book. He befriends prostitutes down on their luck, flies to the deathbed of people he's mildly acquainted with and even pays the funeral expenses of heroin-addicted nymphomaniacs. Wha