# Filtering nouns

In [1]:
import pandas as pd

In [2]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

In [3]:
# df0 = pd.read_pickle('../data/interim/004_synonyms_grouped_1k.p')
df0 = pd.read_pickle('../data/interim/002_keyed_nouns.p')

In [4]:
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN##000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3##000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [5]:
dictionary_df00 = pd.read_pickle('../data/interim/003_dictionary.p')

In [6]:
len(dictionary_df00)

822604

In [7]:
dictionary_df00.head()

Unnamed: 0,word,frequency
0,book,1502803
1,one,639620
2,read,467228
3,like,386404
4,story,365799


### The idea
Words that only appear once cannot be frequent words even in their own context; so they will be filtered out. Then lets calculate the average frequency for the remaining words--remember; this dictionary does not only concern nouns.

<span style="color:red"> Notice: grouping of noun synonyms done in `004_grouping_domain_synonyms` is repeated here once filtering out nouns is applied, since it will take far less time to be applied on the whole dataset once the latter is filter (`004_grouping_domain_synonyms` was aplied only on 1k reviews)  </span>

In [8]:
dictionary_df00.loc[dictionary_df00['frequency'] > 5].describe()

Unnamed: 0,frequency
count,155054.0
mean,539.497
std,6586.737
min,6.0
25%,10.0
50%,22.0
75%,91.0
max,1502803.0


In [9]:
dictionary_df00['word'].loc[dictionary_df00['frequency'] > 4].count()

172284

In [10]:
gt4_dictionary_df01 = dictionary_df00.loc[dictionary_df00['frequency'] > 4]

In [11]:
dictionary_df00['frequency'].loc[dictionary_df00['frequency'] > 4].describe()

count    1.722840e+05
mean     4.860424e+02
std      6.250750e+03
min      5.000000e+00
25%      8.000000e+00
50%      1.800000e+01
75%      7.400000e+01
max      1.502803e+06
Name: frequency, dtype: float64

In [12]:
# Use threshold for first quantile
final_dic = gt4_dictionary_df01.loc[dictionary_df00['frequency'] < 8]
len(final_dic)

39890

In [13]:
final_dic_df01 = final_dic.assign(normalised = final_dic['frequency'].progress_apply(lambda frequency:frequency/486))
final_dic_df01.head()

Progress:: 100%|██████████| 39890/39890 [00:00<00:00, 1326705.15it/s]


Unnamed: 0,word,frequency,normalised
132394,wordlessness,7,0.014403
132395,ciasponsored,7,0.014403
132396,sophieannes,7,0.014403
132397,traster,7,0.014403
132398,tedlock,7,0.014403


### Begin noun filtering

In [14]:
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN##000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3##000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [15]:
df1 = pd.DataFrame(df0.uniqueKey.str.split('##',1).tolist(),columns = ['userId','asin'])
df1.head()

Unnamed: 0,userId,asin
0,A2XQ5LZHTD4AFT,000100039X
1,AF7CSSGV93RXN,000100039X
2,A1NPNGWBVD9AK3,000100039X
3,A3IS4WGMFR4X65,000100039X
4,AWLFVCT9128JV,000100039X


In [16]:
df_reviewText = pd.DataFrame(df0['reviewText'])
df_reviewText.head()

Unnamed: 0,reviewText
0,"[timeless, gibran, backs, content, means, ..."
1,"[ prophet, kahlil, gibran, thirty, years, ..."
2,"[ first, books, recall, collection, gibran..."
3,"[prophet, kahlil, work, world, million, c..."
4,"[gibran, khalil, gibran, born, one thousan..."


In [17]:
df_new = pd.concat([df1, df_reviewText], axis=1)
df_new.head()

Unnamed: 0,userId,asin,reviewText
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [18]:
df_new_01 = df_new.assign(wordCountBefore = df_new['reviewText'].progress_apply(lambda review:len(review)))
df_new_01.head()

Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 1217178.22it/s]


Unnamed: 0,userId,asin,reviewText,wordCountBefore
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ...",49
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ...",19
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran...",76
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c...",142
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan...",48


In [19]:
final_dic_df01['word'] = final_dic_df01['word'].progress_apply(lambda word: word.replace(" ",""))
final_dic_df01 = final_dic_df01.reset_index()
final_dic_df01.head()

Progress:: 100%|██████████| 39890/39890 [00:00<00:00, 1211063.08it/s]


Unnamed: 0,index,word,frequency,normalised
0,132394,wordlessness,7,0.014403
1,132395,ciasponsored,7,0.014403
2,132396,sophieannes,7,0.014403
3,132397,traster,7,0.014403
4,132398,tedlock,7,0.014403


In [20]:
filtered_dict = final_dic_df01['word'].to_dict()
inv_filtered_dict = {v: k for k, v in filtered_dict.items()}
inv_filtered_dict

{'wordlessness': 0,
 'ciasponsored': 1,
 'sophieannes': 2,
 'traster': 3,
 'tedlock': 4,
 'pestiferous': 5,
 'himselfas': 6,
 'shigeko': 7,
 'poe': 8,
 'aureus': 9,
 'easiertoread': 10,
 'joyrides': 11,
 'simmis': 12,
 '2014genres': 13,
 'pigafetta': 14,
 'wyss': 15,
 'psychodelic': 16,
 'schoool': 17,
 'hjelms': 18,
 'boadt': 19,
 'savona': 20,
 'bettany': 21,
 'teached': 22,
 'pageandahalf': 23,
 'pinch': 24,
 'policyby': 25,
 'usagainstthem': 26,
 'oompaloompas': 27,
 'vitually': 28,
 'buhle': 29,
 'lims': 30,
 'welltitled': 31,
 'costcos': 32,
 'rabbithole': 33,
 'whalens': 34,
 'infomration': 35,
 'rizzolli': 36,
 'laughingdog': 37,
 'gloomies': 38,
 'mugwort': 39,
 'lovescenes': 40,
 'throughit': 41,
 'agress': 42,
 'wellsubstantiated': 43,
 'esbat': 44,
 'sothat': 45,
 'celierian': 46,
 'harlequinjunkie': 47,
 'wisconsinmadison': 48,
 'mandatory': 49,
 'rezzians': 50,
 'sheks': 51,
 'spearman': 52,
 'latisha': 53,
 'pssst': 54,
 'meiss': 55,
 'flutterings': 56,
 'sympton': 57,
 

In [21]:
def filter_words(review):
    new_review = []
    for word in review:
        word = word.strip()
        if word in inv_filtered_dict:
            new_review.append(word)
    return new_review

In [22]:
df_new_02 = df_new_01.assign(filteredText = df_new_01['reviewText'].progress_apply(lambda review:filter_words(review)))

Progress:: 100%|██████████| 582711/582711 [00:10<00:00, 57569.25it/s]


In [23]:
df_new_03 = df_new_02.assign(wordCountAfter = df_new_02['filteredText'].progress_apply(lambda review:len(review)))
df_new_03[0:20]

Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 1287602.10it/s]


Unnamed: 0,userId,asin,reviewText,wordCountBefore,filteredText,wordCountAfter
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ...",49,"[messege, sermon, prophets, flows]",4
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ...",19,[],0
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran...",76,"[catechism, texts, siddhartha, preachers, prop...",8
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c...",142,"[claude, mastery, biographers]",3
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan...",48,[almustafa],1
5,AFY0BT42DDYZV,000100039X,"[days, gibrans, gets, literature, yet, bo...",177,"[profits, twentysix, sage, metaphors]",4
6,A25P6DY6ARTCGZ,000100039X,"[book, gibran, took, millions, encapsulate...",29,[],0
7,A1SP45I55GQIIE,000100039X,"[ words, kahlil, gibran, divine, wisdom, ...",35,[meanings],1
8,A2E71VWXO59342,000100039X,"[prophet, dispenses, wisdom, ones, bids, ...",29,[],0
9,A2OP1HD9RGX5OW,000100039X,"[book, myth, work, beauty, whose, every, ...",42,"[simplicity, relies]",2


In [24]:
remaining = 1 - df_new_03['wordCountAfter'].sum() / df_new_03['wordCountBefore'].sum()

In [25]:
print("Average noun reduction achieved:" + str(remaining*100) + "%")

Average noun reduction achieved:95.95373520483005%


## Association Rules Mining Filtering

In [26]:
df_books_bigReviews = pd.DataFrame(df_new_03[['asin','filteredText']].groupby(['asin'])['filteredText'].progress_apply(list))
df_books_bigReviews = df_books_bigReviews.reset_index()
df_books_bigReviews = df_books_bigReviews.assign(transactions = df_books_bigReviews['filteredText'].progress_apply(lambda reviews_lis:len(reviews_lis)))
df_books_bigReviews.head()

Progress:: 100%|█████████▉| 59324/59325 [00:02<00:00, 27837.71it/s]
Progress:: 100%|██████████| 59324/59324 [00:00<00:00, 1308829.53it/s]


Unnamed: 0,asin,filteredText,transactions
0,000100039X,"[[messege, sermon, prophets, flows], [], [cate...",30
1,0002051850,"[[periods, progresses, usage, thee, virtues, a...",31
2,0002113570,"[[], [continues, usfor, continues], [behavior]...",7
3,0002117088,"[[surgery, goodnight, claude, claude, sorts, t...",5
4,000215725X,"[[], [], [fraser, fraser, perpetual, fraser, f...",11


In [27]:
from apyori import apriori

# Support
# Support is an indication of how frequently the itemset appears in the dataset.
# Confidence
# Confidence is an indication of how often the rule has been found to be true.
# Lift
# The ratio of the observed support to that expected if X and Y were independent.
def apply_arm(transactions):
    return list(apriori(transactions, min_support = 1/len(transactions), min_confidence = 1, min_lift = len(transactions), max_length = 4))

In [28]:
books_with_arm = df_books_bigReviews.assign(arm = df_books_bigReviews['filteredText'].progress_apply(lambda list_of_reviews:apply_arm(list_of_reviews)))
books_with_arm.head()

Progress:: 100%|██████████| 59324/59324 [5:25:02<00:00,  3.04it/s]     


Unnamed: 0,asin,filteredText,transactions,arm
0,000100039X,"[[messege, sermon, prophets, flows], [], [cate...",30,"[((speaker, arabic), 0.03333333333333333, [Ord..."
1,0002051850,"[[periods, progresses, usage, thee, virtues, a...",31,"[((19yearolds, muck), 0.03225806451612903, [Or..."
2,0002113570,"[[], [continues, usfor, continues], [behavior]...",7,"[((homo, ancestors), 0.14285714285714285, [Ord..."
3,0002117088,"[[surgery, goodnight, claude, claude, sorts, t...",5,"[((goodnight, claude), 0.2, [OrderedStatistic(..."
4,000215725X,"[[], [], [fraser, fraser, perpetual, fraser, f...",11,"[((17th, colony), 0.09090909090909091, [Ordere..."


In [29]:
def get_important_nouns(arms):
    imp_nns = []
    if "items" in pd.DataFrame(arms).keys():
        results = list(pd.DataFrame(arms)['items'])
        for result in results:
            if len(list(result)) > 4:
                imp_nns = imp_nns + list(list(result))
        if(len(imp_nns)==0):
            for result in results:
                if len(list(result)) > 3:
                    imp_nns = imp_nns + list(list(result))            
        return list(set(imp_nns))
    return list(set(imp_nns))

In [30]:
imp_nns_df = books_with_arm.assign(imp_nns = books_with_arm['arm']
                                   .progress_apply(lambda arms:get_important_nouns(arms)))
imp_nns_df.head()

Progress:: 100%|██████████| 59324/59324 [13:34:44<00:00,  1.21it/s]      


Unnamed: 0,asin,filteredText,transactions,arm,imp_nns
0,000100039X,"[[messege, sermon, prophets, flows], [], [cate...",30,"[((speaker, arabic), 0.03333333333333333, [Ord...","[kneads, profits, preachers, territory, exile,..."
1,0002051850,"[[periods, progresses, usage, thee, virtues, a...",31,"[((19yearolds, muck), 0.03225806451612903, [Or...","[declarations, towns, smaller, threatens, desi..."
2,0002113570,"[[], [continues, usfor, continues], [behavior]...",7,"[((homo, ancestors), 0.14285714285714285, [Ord...","[humane, homo, ancestors, michener]"
3,0002117088,"[[surgery, goodnight, claude, claude, sorts, t...",5,"[((goodnight, claude), 0.2, [OrderedStatistic(...","[surgery, sorts, goodnight, virtues, translato..."
4,000215725X,"[[], [], [fraser, fraser, perpetual, fraser, f...",11,"[((17th, colony), 0.09090909090909091, [Ordere...","[treachery, fort, emperors, 17th, uk, mundane,..."


In [31]:
imp_nns_df = imp_nns_df[['asin','imp_nns']]
imp_nns_df.head()

Unnamed: 0,asin,imp_nns
0,000100039X,"[kneads, profits, preachers, territory, exile,..."
1,0002051850,"[declarations, towns, smaller, threatens, desi..."
2,0002113570,"[humane, homo, ancestors, michener]"
3,0002117088,"[surgery, sorts, goodnight, virtues, translato..."
4,000215725X,"[treachery, fort, emperors, 17th, uk, mundane,..."


In [32]:
imp_nns_df.to_pickle("../data/interim/005_important_nouns.p")

In [33]:
imp_nns_df = imp_nns_df.assign(num_of_imp_nouns = imp_nns_df['imp_nns'].progress_apply(lambda imp_nouns:len(imp_nouns)))
imp_nns_df.head()

Progress:: 100%|██████████| 59324/59324 [00:00<00:00, 1183158.14it/s]


Unnamed: 0,asin,imp_nns,num_of_imp_nouns
0,000100039X,"[kneads, profits, preachers, territory, exile,...",26
1,0002051850,"[declarations, towns, smaller, threatens, desi...",73
2,0002113570,"[humane, homo, ancestors, michener]",4
3,0002117088,"[surgery, sorts, goodnight, virtues, translato...",7
4,000215725X,"[treachery, fort, emperors, 17th, uk, mundane,...",39


## Some more stats

In [34]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

0.12.1


In [36]:
# Filter out synonyms again

In [38]:
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] == 0]
len(booksWithNoImportantNouns)

10385

In [39]:
booksWithNoImportantNouns = imp_nns_df.loc[imp_nns_df['num_of_imp_nouns'] != 0]
len(booksWithNoImportantNouns)

48939

In [41]:
booksWithNoImportantNouns[0:20]

Unnamed: 0,asin,imp_nns,num_of_imp_nouns
0,000100039X,"[kneads, profits, preachers, territory, exile,...",26
1,0002051850,"[declarations, towns, smaller, threatens, desi...",73
2,0002113570,"[humane, homo, ancestors, michener]",4
3,0002117088,"[surgery, sorts, goodnight, virtues, translato...",7
4,000215725X,"[treachery, fort, emperors, 17th, uk, mundane,...",39
5,0002219417,"[humanlevel, smaller, conversion, periods, lic...",32
6,000222383X,"[treasons, construct, expansion, captains, fav...",11
7,0002226618,"[coward, towering, territory, papers, macdonal...",23
8,000224053X,"[fundamentalists, coast, pioneer, inconsistenc...",81
9,0002242052,"[stretches, authorities, ludlum, drugdealers, ...",14


In [42]:
booksWithNoImportantNouns['num_of_imp_nouns'].iplot(kind='histogram', bins=100, xTitle='Number of Important Nouns', yTitle='Number of Books')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points


