# Excerpts Extraction 

In [1]:
# Progress bars for long-running pandas operations.
from tqdm import tqdm, tqdm_pandas

# Disable tqdm's monitor thread to avoid
# "RuntimeError: Set changed size during iteration".
tqdm.monitor_interval = 0

# Register the `progress_apply` / `progress_map` methods on pandas objects
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.).
# tqdm appends ": " to `desc` itself, so the label carries no trailing colon
# (with "Progress:" the bar was rendered as "Progress::").
tqdm.pandas(desc="Progress")

# From here on, use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`.
# This also works on groupby objects, e.g.:
# df.groupby(0).progress_apply(lambda x: x**2)

In [3]:
import pandas as pd
import numpy as np
import nltk

In [4]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Show which cufflinks version produced this run.
print(cf.__version__)
# Configure cufflinks: online (non-offline) mode, world-readable plots, 'pearl' theme.
# NOTE(review): offline=False together with world_readable=True presumably
# publishes figures publicly to the plotly service - confirm this is intended.
cf.set_config_file(
    theme='pearl',
    world_readable=True,
    offline=False,
)

0.12.1


In [9]:
# Load the pre-processed reviews with their ratings from the interim data folder.
# NOTE(review): pickle files execute code on load - only read trusted files.
reviews_and_ratings_df = pd.read_pickle(
    '../data/interim/001_pre_processed_reviews+and_ratings.p'
)
# Preview the first rows.
reviews_and_ratings_df.head()

Unnamed: 0,reviewerID,asin,reviewText,overall
0,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,5.0
1,AF7CSSGV93RXN,000100039X,I first read The Prophet by Kahlil Gibran over...,5.0
2,A1NPNGWBVD9AK3,000100039X,This is one of the first (literary) books I re...,5.0
3,A3IS4WGMFR4X65,000100039X,The Prophet is Kahlil Gibran's best known work...,5.0
4,AWLFVCT9128JV,000100039X,Gibran Khalil Gibran was born in 1883 in what ...,5.0


In [10]:
# Load the per-review pairs produced by an earlier pipeline step.
# NOTE(review): pickle files execute code on load - only read trusted files.
reviews_vs_feature_opinion_pairs = pd.read_pickle(
    "../data/interim/006_pairs_per_review.p"
)

In [11]:
# Preview the first five rows of the loaded pairs frame.
reviews_vs_feature_opinion_pairs.head(n=5)

Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns,pairs,num_of_pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]",2
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t...",4
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]",1
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,"[(souls, profits), (wordofmouth, twentysix), (...",3
13,A2ZZHMT58ZMVCZ,000100039X,"[(prophet, NN), ( waited, VBD), ( twelve, CD),...","[kneads, profits, preachers, territory, exile,...",26,"[(bear, departs), (others, pillars), (similar,...",4


In [13]:
# Keep only the review keys and the extracted pairs, renaming `userId` to
# `reviewerID` so the key columns line up with `reviews_and_ratings_df`.
df00 = (
    reviews_vs_feature_opinion_pairs[['userId', 'asin', 'pairs']]
    .rename(columns={'userId': 'reviewerID'})
)
df00.head()

Unnamed: 0,reviewerID,asin,pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]"
2,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t..."
4,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]"
5,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (..."
13,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,..."


In [14]:
# Inner-join the pairs with the review text and rating on the
# (reviewerID, asin) key that both frames share.
df01 = df00.merge(
    reviews_and_ratings_df,
    on=['reviewerID', 'asin'],
    how='inner',
)
# Show the first 31 merged rows.
df01.iloc[0:31]

Unnamed: 0,reviewerID,asin,pairs,reviewText,overall
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]",A timeless classic. It is a very demanding an...,5.0
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",This is one of the first (literary) books I re...,5.0
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",Gibran Khalil Gibran was born in 1883 in what ...,5.0
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","These days, Kahlil Gibran's ""The Prophet"" ofte...",5.0
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",A prophet has waited twelve years in a coastal...,5.0
5,ADIDQRLLR4KBQ,000100039X,"[(beautiful, metaphors), (live, prophets)]","Being an Atheist, it may seem strange to some ...",5.0
6,A281NPSIMI1C2R,000100039X,"[(pain, waves), (separate, almustafa)]","I am alive like you, and I am standing beside ...",5.0
7,A2R64CR74I98K3,000100039X,"[(religious, texts)]",This is a very usefull book that can be used a...,5.0
8,AF4QKY2R2TD3U,000100039X,"[(rich, metaphors)]","""Say not, 'I have found the truth,' but rather...",5.0
9,A3SMT15X2QVUR8,000100039X,"[(orphalese, metaphor)]",The Prophet Almustafa waits in the city of Orp...,5.0


### Break reviews into their component sentences

In [19]:
# Split every review into a list of sentences, with a tqdm progress bar.
# `sent_tokenize` is reached through the already-imported `nltk` package, so
# the cell does not rely on a separate `from nltk.tokenize import ...` having
# been run; the function is passed directly instead of via a wrapper lambda.
# NOTE: this overwrites `reviewText` (string -> list of sentences), so the
# cell is not idempotent - re-run the merge cell before re-running this one.
df01['reviewText'] = df01['reviewText'].progress_apply(nltk.sent_tokenize)
df01.head()

Progress:: 100%|██████████| 249871/249871 [02:08<00:00, 1944.77it/s]


Unnamed: 0,reviewerID,asin,pairs,reviewText,overall
0,A2XQ5LZHTD4AFT,000100039X,"[(birth, prophets), (book, flows)]","[A timeless classic., It is a very demanding a...",5.0
1,A1NPNGWBVD9AK3,000100039X,"[(relevant, catechism), (within, prophets), (t...",[This is one of the first (literary) books I r...,5.0
2,AWLFVCT9128JV,000100039X,"[(forty-eight, almustafa)]",[Gibran Khalil Gibran was born in 1883 in what...,5.0
3,AFY0BT42DDYZV,000100039X,"[(souls, profits), (wordofmouth, twentysix), (...","[These days, Kahlil Gibran's ""The Prophet"" oft...",5.0
4,A2ZZHMT58ZMVCZ,000100039X,"[(bear, departs), (others, pillars), (similar,...",[A prophet has waited twelve years in a coasta...,5.0
