In [1]:
from bs4 import BeautifulSoup
import requests  
import numpy as np
import pandas as pd
from langdetect import detect
import re
import pickle
from string import punctuation 
import nltk
import nltk.data
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

In [2]:
#importing libraries for models and nlp tasks
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [3]:
tfidf_vectorizer=pickle.load(open('../tfidfvectors/tfidf_vect_clean.pkl','rb'))


In [4]:
test_model_lr_imb=pickle.load(open('../models/lr_mn_clean.pkl','rb'))
test_model_lr_cw=pickle.load(open('../models/lr_mn_clean_cw.pkl','rb'))

In [5]:
emotion = pd.read_csv('../labels_prediction/emotions.csv')
dic_emotions=emotion.to_dict('series')

print(dic_emotions['emotion'])


0     sadness
1         joy
2        love
3       anger
4        fear
5    surprise
Name: emotion, dtype: object


#### Webscraping goodreads website for getting reviews of a book
##### To get the link for the required book 

In [6]:
data = {'q': "The Razor's Edge"}
book_url = "https://www.goodreads.com/search"
req = requests.get(book_url, params=data)

book_soup = BeautifulSoup(req.text, 'html.parser')

titles=book_soup.find_all('a', class_ = 'bookTitle')
title=[]
link=[]
for bookname in titles:
    title.append(bookname.get_text())
    link.append(bookname['href'])

##### From all the links first link is the most closest search 

In [7]:
rev="http://goodreads.com"+link[0]
rev_url = requests.get(rev)
rev_soup=BeautifulSoup(rev_url.content, 'html.parser')

##### Getting reviews from the web page of the book

In [8]:
rev_list=[]
for x in rev_soup.find_all("section", {"class": "ReviewText"}):
    rev_list.append(x.text)

In [9]:
df=pd.DataFrame(rev_list, columns=['reviews'])
df

Unnamed: 0,reviews
0,\nIn all big cities there are self-contained g...
1,"Books like this, that I’ve read so long ago in..."
2,(Book 570 From 1001 Books) - The Razor’s Edge ...
3,The best novel I've read since joining Goodrea...
4,ASPRO IL CAMMINO VERSO LA SALVEZZAIl primo fil...
5,"Oh, Mr. Maugham, there are moments when I love..."
6,In 1919 war hero Larry (Laurence) Darrell retu...
7,Tracing the intimate lives of representative B...
8,This has to be the most endearing and accessib...
9,"A Timeless, stirring drama, scaling the height..."


##### From all the languages in the reviews, selecting the english language reviews

In [10]:
def detect_en(text):
    try:
        return detect(text) == 'en'
    except:
        return False

In [11]:
df = df[df['reviews'].apply(detect_en)]
df=df.reset_index()
df

Unnamed: 0,index,reviews
0,0,\nIn all big cities there are self-contained g...
1,1,"Books like this, that I’ve read so long ago in..."
2,3,The best novel I've read since joining Goodrea...
3,5,"Oh, Mr. Maugham, there are moments when I love..."
4,6,In 1919 war hero Larry (Laurence) Darrell retu...
5,7,Tracing the intimate lives of representative B...
6,8,This has to be the most endearing and accessib...
7,9,"A Timeless, stirring drama, scaling the height..."
8,10,"In Asian countries, the custom of “home leavin..."
9,11,"It took me a long time to read this book, this..."


In [12]:
#df.to_csv("razorsedge.csv",index=False,header=False)

##### Cleaning the text

In [13]:
def text_cleaning(text):
   
    text=re.sub("\(.*?\)","",text)

    text = re.sub(r"[^A-Za-z]", " ", str(text))
    
     #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)

    
    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    # Remove punctuation from text
    text = "".join([c for c in text if c not in punctuation])
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.split()
    text = [w for w in text if not w in stopwords]
    text = " ".join(text)
        
    text = text.split()
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
    text = " ".join(lemmatized_words)
    text=text.lower()
    
    return text 

In [14]:
df['cleaned_review'] = df['reviews'].apply(lambda x: text_cleaning(x))
df = df[df['cleaned_review'].map(len) > 0]


In [15]:
df

Unnamed: 0,index,reviews,cleaned_review
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...


##### Testing the reviews data for emotions using model

In [16]:
test_tfidf = tfidf_vectorizer.transform(df['cleaned_review'])

ytest_pred_imb=test_model_lr_imb.predict(test_tfidf)
ytest_pred_cw=test_model_lr_cw.predict(test_tfidf)

In [17]:
df['predicted_label_imb']=ytest_pred_imb
df['predicted_label_cw']=ytest_pred_cw

In [18]:
df['predicted_emotion_imb'] = df['predicted_label_imb'].map(dic_emotions['emotion'])
df['predicted_emotion_cw'] = df['predicted_label_cw'].map(dic_emotions['emotion'])


In [19]:
df

Unnamed: 0,index,reviews,cleaned_review,predicted_label_imb,predicted_label_cw,predicted_emotion_imb,predicted_emotion_cw
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...,1,5,joy,surprise
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...,1,5,joy,surprise
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...,1,1,joy,joy
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...,4,4,fear,fear
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...,1,1,joy,joy
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...,1,1,joy,joy
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...,1,1,joy,joy
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...,0,0,sadness,sadness
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...,1,5,joy,surprise
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...,1,2,joy,love


In [20]:
df.groupby(['predicted_emotion_imb']).count() 

Unnamed: 0_level_0,index,reviews,cleaned_review,predicted_label_imb,predicted_label_cw,predicted_emotion_cw
predicted_emotion_imb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fear,1,1,1,1,1,1
joy,17,17,17,17,17,17
sadness,3,3,3,3,3,3
surprise,1,1,1,1,1,1


In [21]:
df.groupby(['predicted_emotion_cw']).count() 

Unnamed: 0_level_0,index,reviews,cleaned_review,predicted_label_imb,predicted_label_cw,predicted_emotion_imb
predicted_emotion_cw,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
anger,2,2,2,2,2,2
fear,1,1,1,1,1,1
joy,10,10,10,10,10,10
love,2,2,2,2,2,2
sadness,2,2,2,2,2,2
surprise,5,5,5,5,5,5


In [20]:
df['cleaned_review'][0]

'in big city self contained group exist without intercommunication small world within greater world lead life member dependent upon one another companionship though inhabited island separated unnavigable strait of city experience true paris star i liked book lot much maugham of human bondage quite much the painted veil the first person minor perspective work really well maugham insert story mostly exists observer messenger retelling tale passed elliott larry isabel others i think one interesting narrative structure still part story looking also outsider peering it seems exactly right amount proximity distance suit i also like maugham best fondly mocking human nature elliott lovable snob i ever pleasure encountering it introduces narrator young couple larry isabel shy view impending marriage in france civilized country elliott say isabel would sense marry gray instead take larry lover who could deny elliott arch snob also kindest considerate generous men the story journey character purs

In [21]:
df['reviews'][0]

"\nIn all big cities there are self-contained groups that exist without intercommunication, small worlds within a greater world that lead their lives, their members dependent upon one another for companionship, as though they inhabited islands separated from each other by an unnavigable strait. Of no city, in my experience, is this more true than of Paris.\n4 ½ stars. I liked this book a lot. Much more than Maugham's Of Human Bondage, but not quite as much as The Painted Veil. The first person minor perspective works really well here. Maugham inserts himself into the story, but mostly exists as an observer and messenger, retelling the tales passed to him by Elliott, Larry, Isabel and others. I think this is one of the most interesting narrative structures-- we are still a part of the story, not looking down on it, but we are also an outsider peering in. It seems to be exactly the right amount of proximity and distance to suit me.I also like Maugham best when he is fondly mocking human 

In [22]:
df['reviews'][6]

"This has to be the most endearing and accessible of Maugham's books. With the right smattering of philosophy and literary techniques to keep one challenged too.It has been one of the defining books in my life."

In [23]:
df['cleaned_review'][6]

'this endearing accessible maugham book with right smattering philosophy literary technique keep one challenged it one defining book life'

Predict probabilities

In [16]:
test_tfidf = tfidf_vectorizer.transform(df['cleaned_review'])


In [17]:
ytest_pred_imb_prob=test_model_lr_imb.predict_proba(test_tfidf)
ytest_pred_cw_prob=test_model_lr_cw.predict_proba(test_tfidf)

In [18]:
df_imb = pd.DataFrame(ytest_pred_imb_prob, columns = ['sadness','joy','love','anger','fear','surprise'])
df_cw = pd.DataFrame(ytest_pred_cw_prob, columns = ['sadness','joy','love','anger','fear','surprise'])


In [21]:
df_comb_imb=pd.concat([df,df_imb],axis=1)
df_comb_imb

Unnamed: 0,index,reviews,cleaned_review,sadness,joy,love,anger,fear,surprise
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...,0.093085,0.355562,0.20197,0.141933,0.091016,0.116434
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...,0.088649,0.436857,0.141517,0.125145,0.096049,0.111784
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...,0.055531,0.785239,0.037972,0.065887,0.043057,0.012314
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...,0.111603,0.194754,0.06501,0.149288,0.404117,0.075227
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...,0.121553,0.619313,0.070738,0.123667,0.049137,0.015592
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...,0.208649,0.517423,0.056266,0.109441,0.081576,0.026644
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...,0.222168,0.369935,0.075385,0.207225,0.099622,0.025665
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...,0.630375,0.116501,0.084407,0.10215,0.049827,0.01674
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...,0.142796,0.486595,0.100293,0.062969,0.065414,0.141933
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...,0.087596,0.370621,0.347861,0.091776,0.071031,0.031113


In [22]:
df_comb_cw=pd.concat([df,df_cw],axis=1)
df_comb_cw

Unnamed: 0,index,reviews,cleaned_review,sadness,joy,love,anger,fear,surprise
0,0,\nIn all big cities there are self-contained g...,in big city self contained group exist without...,0.053233,0.16635,0.250255,0.126977,0.070517,0.332668
1,1,"Books like this, that I’ve read so long ago in...",books like i read long ago past come back even...,0.049869,0.178518,0.195295,0.12218,0.074999,0.379138
2,3,The best novel I've read since joining Goodrea...,the best novel i read since joining goodreads ...,0.069219,0.579881,0.090615,0.137005,0.076924,0.046356
3,5,"Oh, Mr. Maugham, there are moments when I love...",oh mr maugham moment i love much i could burst...,0.070113,0.097173,0.062138,0.146124,0.355723,0.26873
4,6,In 1919 war hero Larry (Laurence) Darrell retu...,in war hero larry darrell return hometown chic...,0.118741,0.417005,0.137913,0.213583,0.069407,0.043352
5,7,Tracing the intimate lives of representative B...,tracing intimate life representative british a...,0.178984,0.385279,0.09999,0.150495,0.107107,0.078145
6,8,This has to be the most endearing and accessib...,this endearing accessible maugham book with ri...,0.17376,0.293422,0.09782,0.266126,0.129699,0.039173
7,9,"A Timeless, stirring drama, scaling the height...",a timeless stirring drama scaling height ecsta...,0.450096,0.109288,0.175768,0.146483,0.06497,0.053394
8,10,"In Asian countries, the custom of “home leavin...",in asian country custom home leaving common us...,0.084623,0.209328,0.119233,0.062703,0.058027,0.466086
9,11,"It took me a long time to read this book, this...",it took long time read book beautiful book exc...,0.060124,0.197267,0.522186,0.097397,0.072099,0.050928
