# Excerpts Extraction 

In [1]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

In [2]:
import pandas as pd
import numpy as np
import nltk

In [3]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

0.12.1


In [4]:
reviews_and_ratings_df = pd.read_pickle('../data/interim/001_pre_processed_reviews+and_ratings.p')
reviews_and_ratings_df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall
0,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,5.0
1,AF7CSSGV93RXN,000100039X,I first read The Prophet by Kahlil Gibran over...,5.0
2,A1NPNGWBVD9AK3,000100039X,This is one of the first (literary) books I re...,5.0
3,A3IS4WGMFR4X65,000100039X,The Prophet is Kahlil Gibran's best known work...,5.0
4,AWLFVCT9128JV,000100039X,Gibran Khalil Gibran was born in 1883 in what ...,5.0


In [5]:
reviews_vs_feature_opinion_pairs = pd.read_pickle("../data/interim/006_pairs_per_review.p")

In [6]:
reviews_vs_feature_opinion_pairs.head()

Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns,pairs,num_of_pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]",2
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t...",4
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]",1
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,"[(souls, profits), (wordofmouth, twentysix), (...",3
13,A2ZZHMT58ZMVCZ,000100039X,"[(prophet, NN), ( waited, VBD), ( twelve, CD),...","[kneads, profits, preachers, territory, exile,...",26,"[(bear, departs), (others, pillars), (similar,...",4


In [7]:
df00 = reviews_vs_feature_opinion_pairs[['userId','asin','pairs']]
df00.columns = ['reviewerID','asin','pairs']
df00.head()

Unnamed: 0,reviewerID,asin,pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]"
2,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t..."
4,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]"
5,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (..."
13,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,..."


In [8]:
df01 = df00.merge(reviews_and_ratings_df, left_on=['reviewerID','asin'], right_on=['reviewerID','asin'], how='inner')
df01[0:31]

Unnamed: 0,reviewerID,asin,pairs,reviewText,overall
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]",A timeless classic. It is a very demanding an...,5.0
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",This is one of the first (literary) books I re...,5.0
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",Gibran Khalil Gibran was born in 1883 in what ...,5.0
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","These days, Kahlil Gibran's ""The Prophet"" ofte...",5.0
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",A prophet has waited twelve years in a coastal...,5.0
5,ADIDQRLLR4KBQ,000100039X,"[(beautiful, metaphors), (live, prophets)]","Being an Atheist, it may seem strange to some ...",5.0
6,A281NPSIMI1C2R,000100039X,"[(pain, waves), (separate, almustafa)]","I am alive like you, and I am standing beside ...",5.0
7,A2R64CR74I98K3,000100039X,"[(religious, texts)]",This is a very usefull book that can be used a...,5.0
8,AF4QKY2R2TD3U,000100039X,"[(rich, metaphors)]","""Say not, 'I have found the truth,' but rather...",5.0
9,A3SMT15X2QVUR8,000100039X,"[(orphalese, metaphor)]",The Prophet Almustafa waits in the city of Orp...,5.0


### Break reviews to their composing sentences

In [9]:
from nltk.tokenize import sent_tokenize
df01['reviewText'] = df01['reviewText'].progress_apply(lambda review: sent_tokenize(review))
df01.head()

Progress:: 100%|██████████| 249871/249871 [02:07<00:00, 1957.10it/s]


Unnamed: 0,reviewerID,asin,pairs,reviewText,overall
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]","[A timeless classic., It is a very demanding a...",5.0
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",[This is one of the first (literary) books I r...,5.0
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",[Gibran Khalil Gibran was born in 1883 in what...,5.0
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","[These days, Kahlil Gibran's ""The Prophet"" oft...",5.0
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",[A prophet has waited twelve years in a coasta...,5.0


After identifying the distinct sentences, next we need to apply the same normalisation process we employed at the beggining of this project, but this time on each sentence rather than on reviews.

In [10]:
# Word Tokenize
import re
import string
import inflect
from nltk.corpus import wordnet
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import regexp_tokenize
tokenizer=RegexpTokenizer('[\'\w\-]+',gaps=False)

# Convert to Lowercase
def convert_to_lowercase(sentence):

    for i in range(len(sentence)):
        sentence[i] = sentence[i].lower()
    return sentence

# Eliminate Punctuation
def eliminate_punctuation(sentence, regex):
    new_sentence = []
    for token in sentence:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_sentence.append(new_token)
    return new_sentence

r1 = re.compile("([a-zA-Z]+)([0-9]+)")
r2 = re.compile("([0-9]+)([a-zA-Z]+)")
r3 = re.compile("([a-zA-Z]+)([0-9]+)([a-zA-Z]+)")
r4 = re.compile("([0-9]+)([a-zA-Z]+)([0-9]+)")

# Split words into numbers
def split_words_and_nums(sentence):
    new_sentence = []
    for token in sentence:
        firstRegexIsTrue = r1.match(token)
        secondRegexIsTrue = r2.match(token)
        thirdRegexIsTrue = r3.match(token)
        fourthRegexIsTrue = r4.match(token)
    
        if(firstRegexIsTrue):
            new_sentence.append(firstRegexIsTrue.group(0))
            new_sentence.append(firstRegexIsTrue.group(1))
        elif(firstRegexIsTrue):
            new_sentence.append(secondRegexIsTrue.group(0))
            new_sentence.append(secondRegexIsTrue.group(1))
        elif(thirdRegexIsTrue):
            new_sentence.append(thirdRegexIsTrue.group(0))
            new_sentence.append(thirdRegexIsTrue.group(1))
            new_sentence.append(thirdRegexIsTrue.group(2))
        elif(fourthRegexIsTrue):
            new_sentence.append(fourthRegexIsTrue.group(0))
            new_sentence.append(fourthRegexIsTrue.group(1))
            new_sentence.append(fourthRegexIsTrue.group(2))
        else:
            new_sentence.append(token)
    return new_sentence

## Convert Numbers to Words
def numStringToWord(sentence, p):        
    for i in range(len(sentence)):
        if(sentence[i].isdigit()):
            if(len(sentence[i])<10):
                sentence[i] = p.number_to_words(sentence[i])
    return sentence

# Replace negatives with antonyms 
class AntonymReplacer(object):
    def replace(self, token, pos=None):
        antonyms = set()
        for syn in wordnet.synsets(token, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None
        
    def replace_negations(self, sentence):
        i, l = 0, len(sentence)
        tokens = []
        while i<l:
            token = sentence[i]
            if token == 'not' and i+1 <l:
                ant = self.replace(sentence[i+1])
                if ant:
                    tokens.append(ant)
                    i += 2
                    continue
            tokens.append(token)
            i += 1

        return tokens

In [11]:
replacer = AntonymReplacer()
regex=re.compile('[%s]' % re.escape(string.punctuation))
p = inflect.engine()
def normalise_and_tokenize_sentences(review):
    new_review = []
    for sentence in review:
        step_0 = tokenizer.tokenize(sentence)
        step_1 = convert_to_lowercase(step_0)
        step_2 = eliminate_punctuation(step_1, regex)
        step_3 = split_words_and_nums(step_2)
        step_4 = numStringToWord(step_3, p)
        step_5 = replacer.replace_negations(step_4)
        new_review.append(step_5)
    
    return new_review

In [12]:
df2 = df01.assign(norm_sentences = df01['reviewText'].progress_apply(lambda reviewText:normalise_and_tokenize_sentences(reviewText)))
df2.head()

Progress:: 100%|██████████| 249871/249871 [04:22<00:00, 950.77it/s] 


Unnamed: 0,reviewerID,asin,pairs,reviewText,overall,norm_sentences
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]","[A timeless classic., It is a very demanding a...",5.0,"[[a, timeless, classic], [it, is, a, very, dem..."
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",[This is one of the first (literary) books I r...,5.0,"[[this, is, one, of, the, first, literary, boo..."
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",[Gibran Khalil Gibran was born in 1883 in what...,5.0,"[[gibran, khalil, gibran, was, born, in, one t..."
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","[These days, Kahlil Gibran's ""The Prophet"" oft...",5.0,"[[these, days, kahlil, gibrans, the, prophet, ..."
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",[A prophet has waited twelve years in a coasta...,5.0,"[[a, prophet, has, waited, twelve, years, in, ..."


In [13]:
df2.to_pickle('../data/interim/007_pre_processed_dataset_for_excerpts_extraction.p')

## Begin Excerpt Extraction

In [14]:
matrix_m01 = df2.as_matrix()

In [15]:
matrix_m02 = np.append(matrix_m01,np.zeros([len(matrix_m01),1]),1)
sample = pd.DataFrame(matrix_m02[0:10])
sample

Unnamed: 0,0,1,2,3,4,5,6
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]","[A timeless classic., It is a very demanding a...",5,"[[a, timeless, classic], [it, is, a, very, dem...",0
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",[This is one of the first (literary) books I r...,5,"[[this, is, one, of, the, first, literary, boo...",0
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",[Gibran Khalil Gibran was born in 1883 in what...,5,"[[gibran, khalil, gibran, was, born, in, one t...",0
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","[These days, Kahlil Gibran's ""The Prophet"" oft...",5,"[[these, days, kahlil, gibrans, the, prophet, ...",0
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",[A prophet has waited twelve years in a coasta...,5,"[[a, prophet, has, waited, twelve, years, in, ...",0
5,ADIDQRLLR4KBQ,000100039X,"[(beautiful, metaphors), (live, prophets)]","[Being an Atheist, it may seem strange to some...",5,"[[being, an, atheist, it, may, seem, strange, ...",0
6,A281NPSIMI1C2R,000100039X,"[(pain, waves), (separate, almustafa)]","[I am alive like you, and I am standing beside...",5,"[[i, am, alive, like, you, and, i, am, standin...",0
7,A2R64CR74I98K3,000100039X,"[(religious, texts)]",[This is a very usefull book that can be used ...,5,"[[this, is, a, very, usefull, book, that, can,...",0
8,AF4QKY2R2TD3U,000100039X,"[(rich, metaphors)]","[""Say not, 'I have found the truth,' but rathe...",5,"[[say, not, i, have, found, the, truth, but, r...",0
9,A3SMT15X2QVUR8,000100039X,"[(orphalese, metaphor)]",[The Prophet Almustafa waits in the city of Or...,5,"[[the, prophet, almustafa, waits, in, the, cit...",0


In [16]:
def identify_excerpt_index_for(review_sentences, pair):
    index = None
    for i in range(len(review_sentences)):
        sentence = review_sentences[i]
        if pair[0] in sentence:
            if pair[1] in sentence:
                index = i
                break
    return index

In [17]:
from tqdm import tqdm

with tqdm(total=len(matrix_m02)) as pbar:
    for i in range(len(matrix_m02)):
        excerpt_indices = []
        actual_sentences = matrix_m02[i][3]
        review_sentences = matrix_m02[i][5]
        pairs = matrix_m02[i][2]
        
        for pair in pairs:
            index_of_sentence_with_pair = identify_excerpt_index_for(review_sentences,pair)
            
            if index_of_sentence_with_pair is not None and index_of_sentence_with_pair not in excerpt_indices:
                excerpt_indices.append(index_of_sentence_with_pair)
    
        excerpts = []
        for index in excerpt_indices:
            excerpts.append(actual_sentences[index])
    
        matrix_m02[i][6] = excerpts
    
        pbar.update(1)

100%|██████████| 249871/249871 [00:06<00:00, 37508.66it/s]


In [18]:
df20 = pd.DataFrame(matrix_m02)
df20.columns = ['reviewerID','asin','pairs','reviewText','overall','norm_sentences','excerpts']
df20.head()

Unnamed: 0,reviewerID,asin,pairs,reviewText,overall,norm_sentences,excerpts
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]","[A timeless classic., It is a very demanding a...",5,"[[a, timeless, classic], [it, is, a, very, dem...","[There is much that hints at his birth place, ..."
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",[This is one of the first (literary) books I r...,5,"[[this, is, one, of, the, first, literary, boo...",[I believe that was my first taste of spiritua...
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",[Gibran Khalil Gibran was born in 1883 in what...,5,"[[gibran, khalil, gibran, was, born, in, one t...",[He died of cancer in a New York hospital at t...
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","[These days, Kahlil Gibran's ""The Prophet"" oft...",5,"[[these, days, kahlil, gibrans, the, prophet, ...","[There is no political, religious, or commerci..."
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",[A prophet has waited twelve years in a coasta...,5,"[[a, prophet, has, waited, twelve, years, in, ...",[A local seeress who knows him best asks him t...


In [19]:
df30 = df20[['reviewerID','asin','overall','excerpts']]
df30.head()

Unnamed: 0,reviewerID,asin,overall,excerpts
0,A2XQ5LZHTD4AFT,000100039X,5,"[There is much that hints at his birth place, ..."
1,A1NPNGWBVD9AK3,000100039X,5,[I believe that was my first taste of spiritua...
2,AWLFVCT9128JV,000100039X,5,[He died of cancer in a New York hospital at t...
3,AFY0BT42DDYZV,000100039X,5,"[There is no political, religious, or commerci..."
4,A2ZZHMT58ZMVCZ,000100039X,5,[A local seeress who knows him best asks him t...


In [20]:
len(df30)

249871

In [21]:
df31 = df30[df30['excerpts'].map(lambda excerpts: len(excerpts)) > 0]
len(df31)

231936

In [25]:
231936/249871

0.9282229630489333

In [26]:
249871 - 231936

17935

## Get Polarity of Excerpts

In [22]:
import numpy as np
from textblob import TextBlob

def get_overal_polarity(excerpts):
    text = ''.join(excerpts)
    blob = TextBlob(text)
    
    polarity = []
    for sentence in blob.sentences:
        polarity.append(sentence.sentiment.polarity)

    return np.mean(polarity)

In [23]:
df40 = df31.assign(polarity = df31['excerpts'].progress_apply(lambda excerpts:get_overal_polarity(excerpts)))
df40.head()

Progress:: 100%|██████████| 231936/231936 [03:40<00:00, 1052.48it/s]


Unnamed: 0,reviewerID,asin,overall,excerpts,polarity
0,A2XQ5LZHTD4AFT,000100039X,5,"[There is much that hints at his birth place, ...",0.332292
1,A1NPNGWBVD9AK3,000100039X,5,[I believe that was my first taste of spiritua...,0.425
2,AWLFVCT9128JV,000100039X,5,[He died of cancer in a New York hospital at t...,0.133182
3,AFY0BT42DDYZV,000100039X,5,"[There is no political, religious, or commerci...",0.155729
4,A2ZZHMT58ZMVCZ,000100039X,5,[A local seeress who knows him best asks him t...,0.09658


In [24]:
df40.to_pickle('../data/interim/007_excerpts_with_polarity.p')

## Produce Summaries

In [27]:
def merge_list(summariesList):
    summary = []
    for excerpt in summariesList:
        summary = summary + excerpt
    return summary

In [31]:
df_book_summaries = pd.DataFrame(df40.groupby(['asin'])['excerpts'].progress_apply(list)).reset_index()
df_book_summaries.head()

Progress:: 100%|█████████▉| 48693/48694 [00:01<00:00, 28126.36it/s]


Unnamed: 0,asin,excerpts
0,000100039X,"[[There is much that hints at his birth place,..."
1,0002051850,"[[However, as the story progresses, Hemingway'..."
2,0002113570,[[That an English woman scientist would journe...
3,0002117088,"[[We adopted &quot;Renoir, My Father&quot; as ..."
4,000215725X,[[William and Olivia stayed in the Fraser resi...


In [32]:
df_book_summaries['excerpts'] = df_book_summaries['excerpts'].progress_apply(lambda summariesList: merge_list(summariesList))
df_book_summaries.head()

Progress:: 100%|██████████| 48693/48693 [00:00<00:00, 388636.69it/s]


Unnamed: 0,asin,excerpts
0,000100039X,"[There is much that hints at his birth place, ..."
1,0002051850,"[However, as the story progresses, Hemingway's..."
2,0002113570,[That an English woman scientist would journey...
3,0002117088,"[We adopted &quot;Renoir, My Father&quot; as b..."
4,000215725X,[William and Olivia stayed in the Fraser resid...


## Evaluation

In [37]:
df40["overall"] = pd.to_numeric(df40["overall"], errors='coerce')
df40["polarity"] = pd.to_numeric(df40["polarity"], errors='coerce')
df40.head()

Unnamed: 0,reviewerID,asin,overall,excerpts,polarity
0,A2XQ5LZHTD4AFT,000100039X,5.0,"[There is much that hints at his birth place, ...",0.332292
1,A1NPNGWBVD9AK3,000100039X,5.0,[I believe that was my first taste of spiritua...,0.425
2,AWLFVCT9128JV,000100039X,5.0,[He died of cancer in a New York hospital at t...,0.133182
3,AFY0BT42DDYZV,000100039X,5.0,"[There is no political, religious, or commerci...",0.155729
4,A2ZZHMT58ZMVCZ,000100039X,5.0,[A local seeress who knows him best asks him t...,0.09658


In [62]:
mean_rating_vs_polarity_per_book = pd.DataFrame(df40.groupby(['asin'])[["overall","polarity"]].mean()).reset_index()
mean_rating_vs_polarity_per_book.head()

Unnamed: 0,asin,overall,polarity
0,000100039X,5.0,0.217668
1,0002051850,4.357143,0.094471
2,0002113570,5.0,0.142857
3,0002117088,5.0,0.2375
4,000215725X,4.666667,0.19003


In [83]:
### Normalise polarity values to match 
def normalise(polarity):
    
    positeiv_polarity = polarity + 1
    normalised_polarity = (4 * positeiv_polarity)/2
    
    return normalised_polarity

mean_rating_vs_polarity_per_book = mean_rating_vs_polarity_per_book.assign(norm_polarity = mean_rating_vs_polarity_per_book['polarity'].progress_apply(lambda polarity:normalise(polarity)))
mean_rating_vs_polarity_per_book.head()


Progress:: 100%|██████████| 48693/48693 [00:00<00:00, 1190302.22it/s]


Unnamed: 0,asin,overall,polarity,norm_polarity
0,000100039X,5.0,0.217668,2.435336
1,0002051850,4.357143,0.094471,2.188942
2,0002113570,5.0,0.142857,2.285714
3,0002117088,5.0,0.2375,2.475
4,000215725X,4.666667,0.19003,2.38006


In [84]:
mean_rating_vs_polarity_per_book = mean_rating_vs_polarity_per_book.assign(norm_overall = mean_rating_vs_polarity_per_book['overall'].progress_apply(lambda overall:overall - 1))
mean_rating_vs_polarity_per_book.head()

Progress:: 100%|██████████| 48693/48693 [00:00<00:00, 1401930.56it/s]


Unnamed: 0,asin,overall,polarity,norm_polarity,norm_overall
0,000100039X,5.0,0.217668,2.435336,4.0
1,0002051850,4.357143,0.094471,2.188942,3.357143
2,0002113570,5.0,0.142857,2.285714,4.0
3,0002117088,5.0,0.2375,2.475,4.0
4,000215725X,4.666667,0.19003,2.38006,3.666667


In [85]:
import itertools
import numpy as np

x_ratings = np.asarray(list(itertools.chain(*mean_rating_vs_polarity_per_book.as_matrix(columns=mean_rating_vs_polarity_per_book.columns[4:5]))))
y_polarity = np.asarray(list(itertools.chain(*mean_rating_vs_polarity_per_book.as_matrix(columns=mean_rating_vs_polarity_per_book.columns[3:4]))))


In [97]:
import plotly.plotly as py
import plotly.graph_objs as go

trace1 = go.Scatter(x=x_ratings, y=y_polarity, 
                    mode='markers',
                    name='ROC curve (area = %0.2f)' % roc_auc[2]
                   )

trace2 = go.Scatter(x=[0, 4], y=[0, 4], 
                    mode='markers', 
                    line=dict(color='red', width=lw, dash='dash'),
                    showlegend=False)

layout = go.Layout(title='Receiver Operating Characteristic Function',
                   xaxis=dict(title='False Positive Rate'),
                   yaxis=dict(title='True Positive Rate'))

fig = go.Figure(data=[trace1, trace2], layout=layout)

In [96]:
py.iplot(fig)


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




The draw time for this plot will be slow for clients without much RAM.



Estimated Draw Time Slow



In [99]:
# Create a trace
trace = go.Scatter(
    x = x_ratings,
    y = y_polarity,
    mode = 'markers'
)

layout = go.Layout(title='Correlation between Polarity and Rating',
                   xaxis=dict(title='Ratings'),
                   yaxis=dict(title='Polarity'))

fig = go.Figure(data=[trace], layout=layout)

In [100]:
# Plot and embed in ipython notebook!
py.iplot(fig)


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




The draw time for this plot will be slow for clients without much RAM.



Estimated Draw Time Slow



In [101]:
mean_rating_vs_polarity_per_book['norm_overall'].corr(mean_rating_vs_polarity_per_book['norm_polarity'])

0.1662440592733814

In [127]:
df_book_summaries.to_csv("../data/processed/007_book_summaries.csv", sep="\t")
df_book_summaries.to_pickle("../data/processed/007_book_summaries.p")

### Some Example Summaries

In [112]:

df_book_summaries['asin'][0:1][0]

'000100039X'

![The Prophet](https://images-na.ssl-images-amazon.com/images/I/41S7%2BXDos3L._SX318_BO1,204,203,200_.jpg)

In [111]:
print(df_book_summaries['excerpts'][0:1][0])

['There is much that hints at his birth place, Lebanon where many of the old prophets walked the Earth and where this book project first germinated most likely.Probably becuase it was written in English originally, the writing flows, it is pleasant to read, and the charcoal drawings of the author decorating the pages is a plus.', 'I believe that was my first taste of spirituality and seemed  at the time more relevant than what I was being force-fed by nuns in  catechism class.', "True wisdom comes from within.The prophet's teaching on love is  particularly relevant to me at this stage of my life:&quot;For even as  love crowns you so shall he crucify you.", 'Even as he ascends to your height and caresses your  tenderest branches that quiver in the sun, So shall he descend to your  roots and shake them in their clinging to the earth.', 'He died of cancer in a New York hospital at the very young age of 48.The Prophet is a story about Almustafa (The Prophet) who after living 12 years in Or

In [119]:
df_book_summaries['asin'][1:2][1]

'0002051850'

![For Whome the Bell Tolls](https://images-na.ssl-images-amazon.com/images/I/51q-i52PejL._SX316_BO1,204,203,200_.jpg)

In [121]:
print(df_book_summaries['excerpts'][1:2][1])

['However, as the story progresses, Hemingway\'s usage of the King James-style "Thee" and "Thou" to indicate that a more formal Spanish dialect is being used becomes distractingly gimmicky and wore quite thin by the end of the book.', "What little is left is a cause whose means and ends don't seem to differ from the alternative, and an appeal to virtues of loyalty to the band, or one's responsibility to follow his duty.", "The trouble is, these appeals are made among characters who Jordan - as Hemingway's voice - often considers untrustworthy, repugnant and treacherous.", 'Another consistent theme found in Hemingway is courage under fire or dire circumstances, whether it is in the bull ring, behind enemy lines, or hunting man-eaters in the green hills of Africa.', 'The chief protagonist is an American named Robert Jordon who has been tasked to blow up a bridge behind enemy lines in the Spanish mountains.', 'Some say that Maria represents Spain and her gang rape represent the despoilage

In [122]:
df_book_summaries['asin'][2:3][2]

'0002113570'

![In the Shadow of a Man](https://images-na.ssl-images-amazon.com/images/I/51m1J%2BSclnL._SX304_BO1,204,203,200_.jpg)

In [123]:
df_book_summaries['excerpts'][2:3][2]

['That an English woman scientist would journey to Tanzania to engage in this type of research is unusual and certainly puts her at "the top of her class".She follows the lives and behavior patterns of her subjects until her research sounds like a Michener novel with its generational emphasis and timelines of family heritage.',
 'The squabbles and fighting behavior could be that of any large Homo Sapien family.',
 'Jane Goodall deserves every accolade she gets for bringing us a lens through which to observe another geneological line of a species that has developed from our common ancestors.Her work suggests that we should rethink our medical research toward more humane treatment of these animals whose behavior is  too similar to ours to ignore.']

In [124]:
df_book_summaries['asin'][3:4][3]

'0002117088'

![Renoir, My Father](https://images-na.ssl-images-amazon.com/images/I/41isN3EczFL._SX373_BO1,204,203,200_.jpg)

In [126]:
df_book_summaries['excerpts'][3:4][3]

["We adopted &quot;Renoir, My Father&quot; as bedside reading while my wife was recovering from hip surgery, and (aside, perhaps, from &quot;Goodnight, Moon,&quot;) I can't imagine better therapy.",
 'None of the rough edges have been smoothed off which, come to think of it, is just as Claude would have wanted: Jean speaks with his own voice.',
 'There is even an index of sorts (I assume from the original translator) but it is patchy and incomplete.',
 "That last is a shortcoming, but forgivable in light of the book's other virtues."]

In [132]:
df_book_summaries['asin'][6:7][6]

'000222383X'

![The Mauritius Command](https://images-na.ssl-images-amazon.com/images/I/51S6XET6X0L._AC_US436_QL65_.jpg)

In [131]:
df_book_summaries['excerpts'][6:7][6]

["The Patrick O'Brian naval series of books are an acquired taste.",
 'While the books appear on the outside to be simple naval adventure tales, they are really deep studies in character development of a British naval officer and his best friend/ship surgeon/intelligence operative.The Mauritius Command is one of the best books in the series.',
 "As is usually the case, despite great achievements in the past, Jack is shackled and insufficiently rewarded by his superiors in the admiralty, and his supposed connections, through his father in the Parliament, are of little help.O'Brian seems to assume a good bit of nautical knowledge by the reader, and this landlubber sometimes got a little lost in the naval warfare scenes.",
 "The most engaging aspects of the novel seemed to me the differences in character, and the seething one-upsmanship among the various ship captains under Jack's overall command including Captains Pym, Clonfert and Corbett.",
 "The problem was, just when the author whets

In [120]:
## END_OF_FILE