# Feature-Opinion Pairing

In [2]:
import pandas as pd

In [3]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

## Load important nouns

In [10]:
df00 = pd.read_pickle('../data/interim/005_important_nouns.p')

df00.head()

In [12]:
len(df00)

59324

In [13]:
df01 = df00.assign(num_of_imp_nouns = df00['imp_nns'].progress_apply(lambda imp_nouns:len(imp_nouns)))
df02 = df01.loc[df01['num_of_imp_nouns'] != 0]
len(df02)

Progress:: 100%|██████████| 59324/59324 [00:00<00:00, 1123729.33it/s]


48939

In [14]:
df02.head()

Unnamed: 0,asin,imp_nns,num_of_imp_nouns
0,000100039X,"[kneads, profits, preachers, territory, exile,...",26
1,0002051850,"[declarations, towns, smaller, threatens, desi...",73
2,0002113570,"[humane, homo, ancestors, michener]",4
3,0002117088,"[surgery, sorts, goodnight, virtues, translato...",7
4,000215725X,"[treachery, fort, emperors, 17th, uk, mundane,...",39


## Load book tagged reviews

In [16]:
df10 = pd.read_pickle('../data/interim/002_pos_tagged_keyed_reviews.p')

In [17]:
df10.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ..."
1,AF7CSSGV93RXN##000100039X,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ..."
2,A1NPNGWBVD9AK3##000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ..."
3,A3IS4WGMFR4X65##000100039X,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)..."
4,AWLFVCT9128JV##000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),..."


In [18]:
len(df10)

582711

In [21]:
df11 = pd.DataFrame(df10.uniqueKey.str.split('##',1).tolist(),columns = ['userId','asin'])
df11.head()

Unnamed: 0,userId,asin
0,A2XQ5LZHTD4AFT,000100039X
1,AF7CSSGV93RXN,000100039X
2,A1NPNGWBVD9AK3,000100039X
3,A3IS4WGMFR4X65,000100039X
4,AWLFVCT9128JV,000100039X


In [22]:
df_12 = pd.DataFrame(df10['reviewText'])
df_12.head()

Unnamed: 0,reviewText
0,"[(timeless, NN), ( classic, JJ), ( demanding, ..."
1,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ..."
2,"[(one, CD), ( first, NNP), ( literary, JJ), ( ..."
3,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)..."
4,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),..."


In [19]:
df_13 = pd.concat([df11, df_12], axis=1)
df_13.head()

Unnamed: 0,userId,asin,reviewText
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ..."
1,AF7CSSGV93RXN,000100039X,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ..."
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ..."
3,A3IS4WGMFR4X65,000100039X,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)..."
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),..."


## Join reviews with important nouns

In [20]:
df_joined = df_13.merge(df02, left_on='asin', right_on='asin', how='inner')
df_joined[0:31]

Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26
1,AF7CSSGV93RXN,000100039X,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ...","[kneads, profits, preachers, territory, exile,...",26
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26
3,A3IS4WGMFR4X65,000100039X,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)...","[kneads, profits, preachers, territory, exile,...",26
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26
6,A25P6DY6ARTCGZ,000100039X,"[(book, NN), ( almost, RBS), ( kahlil, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26
7,A1SP45I55GQIIE,000100039X,"[(certainly, RB), ( words, NNS), ( kahlil, NNP...","[kneads, profits, preachers, territory, exile,...",26
8,A2E71VWXO59342,000100039X,"[(prophet, NN), ( dispenses, NNS), ( ultimate,...","[kneads, profits, preachers, territory, exile,...",26
9,A2OP1HD9RGX5OW,000100039X,"[(book, NN), ( poetic, JJ), ( myth, NNP), ( wo...","[kneads, profits, preachers, territory, exile,...",26


In [26]:
df_joined.describe()

Unnamed: 0,num_of_imp_nouns
count,511364.0
mean,27.590157
std,25.774587
min,4.0
25%,10.0
50%,19.0
75%,36.0
max,226.0


In [21]:
1 - 511364/582711

0.12243976859884231

In [22]:
582711-511364

71347

In [92]:
import numpy as np
matrix_m01 = df_joined.as_matrix()
len(matrix_m01)

511364

In [96]:
matrix_m02 = np.append(matrix_m01,np.zeros([len(matrix_m01),1]),1)
sample = pd.DataFrame(matrix_m02[0:10])
sample

Unnamed: 0,0,1,2,3,4,5
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,0
1,AF7CSSGV93RXN,000100039X,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ...","[kneads, profits, preachers, territory, exile,...",26,0
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,0
3,A3IS4WGMFR4X65,000100039X,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)...","[kneads, profits, preachers, territory, exile,...",26,0
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,0
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,0
6,A25P6DY6ARTCGZ,000100039X,"[(book, NN), ( almost, RBS), ( kahlil, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,0
7,A1SP45I55GQIIE,000100039X,"[(certainly, RB), ( words, NNS), ( kahlil, NNP...","[kneads, profits, preachers, territory, exile,...",26,0
8,A2E71VWXO59342,000100039X,"[(prophet, NN), ( dispenses, NNS), ( ultimate,...","[kneads, profits, preachers, territory, exile,...",26,0
9,A2OP1HD9RGX5OW,000100039X,"[(book, NN), ( poetic, JJ), ( myth, NNP), ( wo...","[kneads, profits, preachers, territory, exile,...",26,0


In [86]:
def get_pair(index, tagged_review):
    
    possible_pairs_dictionary = {}
    
    # left window
    counter = 0
    left_index = index - 1
    while((left_index!=-1) and (counter<10)):
        if tagged_review[left_index][1] in {'JJ', 'JJR', 'JJS'}:
            distance = index - left_index 
            possible_pairs_dictionary.update({tagged_review[left_index][0]:distance})
        left_index -= 1
        counter += 1

    # right window
    counter = 0
    right_index = index + 1
    while((right_index!=len(tagged_review)) and (counter<10)):
        if tagged_review[right_index][1] in {'JJ', 'JJR', 'JJS'}:
            distance = right_index - index 
            possible_pairs_dictionary.update({tagged_review[left_index][0]:distance})
        right_index += 1
        counter += 1
    
    # get shortest adj with shortest distance if multiple are found
    if(len(possible_pairs_dictionary)>1):
        return (min(possible_pairs_dictionary, key=lambda k: possible_pairs_dictionary[k]), tagged_review[index][0])
    elif(len(possible_pairs_dictionary)==1):
        return (possible_pairs_dictionary.get(0),tagged_review[index][0])
    else:
        return (None, tagged_review[index][0])

In [101]:
from tqdm import tqdm

with tqdm(total=len(matrix_m02)) as pbar:
    for i in range(len(matrix_m02)):
        pairs = []
        tagged_review = matrix_m02[i][2]
        imp_nns = matrix_m02[i][3]
        index = 0
        for(word, tag) in tagged_review:
            if tag in {'NN', 'NNS', 'NNP', 'NNPS'}:
                if word.strip() in imp_nns:
                    (adj,nn) = get_pair(index, tagged_review)
                    if adj is not None:
                        pairs.append((adj.strip(),nn.strip()))
            index += 1
        matrix_m02[i][5] = pairs
        pbar.update(1)
    

100%|██████████| 511364/511364 [00:36<00:00, 14185.95it/s]


In [102]:
sample = pd.DataFrame(matrix_m02[0:100])
sample

Unnamed: 0,0,1,2,3,4,5
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]"
1,AF7CSSGV93RXN,000100039X,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ...","[kneads, profits, preachers, territory, exile,...",26,[]
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t..."
3,A3IS4WGMFR4X65,000100039X,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)...","[kneads, profits, preachers, territory, exile,...",26,[]
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]"
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,"[(souls, profits), (wordofmouth, twentysix), (..."
6,A25P6DY6ARTCGZ,000100039X,"[(book, NN), ( almost, RBS), ( kahlil, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,[]
7,A1SP45I55GQIIE,000100039X,"[(certainly, RB), ( words, NNS), ( kahlil, NNP...","[kneads, profits, preachers, territory, exile,...",26,[]
8,A2E71VWXO59342,000100039X,"[(prophet, NN), ( dispenses, NNS), ( ultimate,...","[kneads, profits, preachers, territory, exile,...",26,[]
9,A2OP1HD9RGX5OW,000100039X,"[(book, NN), ( poetic, JJ), ( myth, NNP), ( wo...","[kneads, profits, preachers, territory, exile,...",26,[]


In [107]:
df20 = pd.DataFrame(matrix_m02)
df20.columns = ['userId','asin','reviewText','imp_nns','num_of_imp_nouns','pairs']
df20.head()

Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns,pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]"
1,AF7CSSGV93RXN,000100039X,"[(first, RB), ( read, JJ), ( prophet, NNP), ( ...","[kneads, profits, preachers, territory, exile,...",26,[]
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t..."
3,A3IS4WGMFR4X65,000100039X,"[(prophet, NN), ( kahlil, NN), ( gibrans, VBZ)...","[kneads, profits, preachers, territory, exile,...",26,[]
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]"


In [108]:
len(df20)

511364

In [110]:
reviews_vs_feature_opinion_pairs = df20[df20['pairs'].map(lambda pairs: len(pairs)) > 0]
len(reviews_vs_feature_opinion_pairs)

249871

In [112]:
reviews_vs_feature_opinion_pairs[0:100]

Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns,pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]"
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t..."
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]"
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,"[(souls, profits), (wordofmouth, twentysix), (..."
13,A2ZZHMT58ZMVCZ,000100039X,"[(prophet, NN), ( waited, VBD), ( twelve, CD),...","[kneads, profits, preachers, territory, exile,...",26,"[(bear, departs), (others, pillars), (similar,..."
16,ADIDQRLLR4KBQ,000100039X,"[(atheist, NN), ( may, NNP), ( seem, NNP), ( s...","[kneads, profits, preachers, territory, exile,...",26,"[(beautiful, metaphors), (live, prophets)]"
22,A281NPSIMI1C2R,000100039X,"[(alive, JJ), ( like, NN), ( standing, VBG), (...","[kneads, profits, preachers, territory, exile,...",26,"[(pain, waves), (separate, almustafa)]"
24,A2R64CR74I98K3,000100039X,"[(usefull, JJ), ( book, NN), ( used, VBD), ( s...","[kneads, profits, preachers, territory, exile,...",26,"[(religious, texts)]"
26,AF4QKY2R2TD3U,000100039X,"[(say, VB), ( found, IN), ( truth, NNP), ( rat...","[kneads, profits, preachers, territory, exile,...",26,"[(rich, metaphors)]"
27,A3SMT15X2QVUR8,000100039X,"[(prophet, NN), ( almustafa, CC), ( waits, NNS...","[kneads, profits, preachers, territory, exile,...",26,"[(orphalese, metaphor)]"


In [113]:
249871/511364

0.48863627474753796

In [114]:
reviews_vs_feature_opinion_pairs = reviews_vs_feature_opinion_pairs.assign(num_of_pairs = reviews_vs_feature_opinion_pairs['pairs'].progress_apply(lambda pairs:len(pairs)))
reviews_vs_feature_opinion_pairs.head()

Progress:: 100%|██████████| 249871/249871 [00:00<00:00, 1138209.59it/s]


Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns,pairs,num_of_pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]",2
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t...",4
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]",1
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,"[(souls, profits), (wordofmouth, twentysix), (...",3
13,A2ZZHMT58ZMVCZ,000100039X,"[(prophet, NN), ( waited, VBD), ( twelve, CD),...","[kneads, profits, preachers, territory, exile,...",26,"[(bear, departs), (others, pillars), (similar,...",4


In [118]:
reviews_vs_feature_opinion_pairs[0:100]

Unnamed: 0,userId,asin,reviewText,imp_nns,num_of_imp_nouns,pairs,num_of_pairs
0,A2XQ5LZHTD4AFT,000100039X,"[(timeless, NN), ( classic, JJ), ( demanding, ...","[kneads, profits, preachers, territory, exile,...",26,"[(birth, prophets), (book, flows)]",2
2,A1NPNGWBVD9AK3,000100039X,"[(one, CD), ( first, NNP), ( literary, JJ), ( ...","[kneads, profits, preachers, territory, exile,...",26,"[(relevant, catechism), (within, prophets), (t...",4
4,AWLFVCT9128JV,000100039X,"[(gibran, NN), ( khalil, NNP), ( gibran, NNP),...","[kneads, profits, preachers, territory, exile,...",26,"[(forty-eight, almustafa)]",1
5,AFY0BT42DDYZV,000100039X,"[(days, NNS), ( kahlil, VBP), ( gibrans, NNS),...","[kneads, profits, preachers, territory, exile,...",26,"[(souls, profits), (wordofmouth, twentysix), (...",3
13,A2ZZHMT58ZMVCZ,000100039X,"[(prophet, NN), ( waited, VBD), ( twelve, CD),...","[kneads, profits, preachers, territory, exile,...",26,"[(bear, departs), (others, pillars), (similar,...",4
16,ADIDQRLLR4KBQ,000100039X,"[(atheist, NN), ( may, NNP), ( seem, NNP), ( s...","[kneads, profits, preachers, territory, exile,...",26,"[(beautiful, metaphors), (live, prophets)]",2
22,A281NPSIMI1C2R,000100039X,"[(alive, JJ), ( like, NN), ( standing, VBG), (...","[kneads, profits, preachers, territory, exile,...",26,"[(pain, waves), (separate, almustafa)]",2
24,A2R64CR74I98K3,000100039X,"[(usefull, JJ), ( book, NN), ( used, VBD), ( s...","[kneads, profits, preachers, territory, exile,...",26,"[(religious, texts)]",1
26,AF4QKY2R2TD3U,000100039X,"[(say, VB), ( found, IN), ( truth, NNP), ( rat...","[kneads, profits, preachers, territory, exile,...",26,"[(rich, metaphors)]",1
27,A3SMT15X2QVUR8,000100039X,"[(prophet, NN), ( almustafa, CC), ( waits, NNS...","[kneads, profits, preachers, territory, exile,...",26,"[(orphalese, metaphor)]",1


In [123]:
pairs_per_book = reviews_vs_feature_opinion_pairs.groupby(['asin'])[["num_of_pairs"]].sum()
pairs_per_book = pairs_per_book.reset_index()
pairs_per_book.head()

Unnamed: 0,asin,num_of_pairs
0,000100039X,22
1,0002051850,92
2,0002113570,4
3,0002117088,7
4,000215725X,46


In [124]:
len(pairs_per_book)

48853

In [125]:
48939 - 48853

86

In [126]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

0.12.1


In [127]:
pairs_per_book['num_of_pairs'].iplot(kind='histogram', bins=100, xTitle='Number of Pairs', yTitle='Number of Books')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




In [129]:
# Save data
pairs_per_book.to_pickle("../data/interim/006_pairs_per_book.p")


In [130]:
reviews_vs_feature_opinion_pairs.to_pickle("../data/interim/006_pairs_per_review.p")

In [131]:
## END_OF_FILE