# Filtering nouns

In [2]:
import pandas as pd

In [3]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# Workaround for tqdm raising
# "RuntimeError: Set changed size during iteration".
tqdm.monitor_interval = 0

# Register the `progress_apply` / `progress_map` methods on pandas objects.
# (`tqdm_gui`, `tqdm_notebook` and optional kwargs can be used as well.)
# tqdm appends its own ": " to `desc`, which is why the bars rendered in this
# notebook read "Progress::".
tqdm.pandas(desc="Progress:")

# From here on `progress_apply` can replace `apply`
# and `progress_map` can replace `map`.
# It also works after a groupby, e.g.:
# df.groupby(0).progress_apply(lambda x: x**2)

In [4]:
# Load the keyed reviews (columns: uniqueKey, reviewText).
# Alternative input kept for reference (synonym-grouped 1k sample):
#   '../data/interim/004_synonyms_grouped_1k.p'
# NOTE(review): pickle files should only be loaded from trusted sources.
df0 = pd.read_pickle('../data/interim/002_keyed_nouns.p')

In [5]:
# Preview the first rows of the loaded reviews.
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN##000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3##000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [6]:
# Load the word-frequency dictionary (columns: word, frequency).
dictionary_df00 = pd.read_pickle('../data/interim/003_dictionary.p')

In [7]:
# Number of rows (one per word) in the dictionary.
len(dictionary_df00)

822604

In [8]:
# Preview the first rows of the dictionary.
dictionary_df00.head()

Unnamed: 0,word,frequency
0,book,1502803
1,one,639620
2,read,467228
3,like,386404
4,story,365799


### The idea
Words that appear only once cannot be frequent words, even in their own context, so they will be filtered out. Then let's calculate the average frequency of the remaining words. Remember: this dictionary does not contain only nouns.

<span style="color:red"> Notice: the grouping of noun synonyms done in `004_grouping_domain_synonyms` is repeated here once the noun filtering has been applied, since it will take far less time to run on the whole dataset once the latter is filtered (`004_grouping_domain_synonyms` was applied to only 1k reviews). </span>

In [9]:
# Summary statistics for words with frequency above 5.
# NOTE(review): this cell uses `> 5`, whereas the following cells use `> 4`.
dictionary_df00.loc[dictionary_df00['frequency'] > 5].describe()

Unnamed: 0,frequency
count,155054.0
mean,539.497
std,6586.737
min,6.0
25%,10.0
50%,22.0
75%,91.0
max,1502803.0


In [10]:
# Number of (non-null) words whose frequency is above 4.
dictionary_df00['word'].loc[dictionary_df00['frequency'] > 4].count()

172284

In [11]:
# Keep only the dictionary rows whose frequency exceeds 4.
gt4_dictionary_df01 = dictionary_df00.loc[dictionary_df00['frequency'].gt(4)]

In [12]:
# Summary statistics of `frequency` for words with frequency above 4.
# NOTE(review): the mean reported here (~486) appears to be the divisor that is
# hard-coded for the `normalised` column further down — confirm.
dictionary_df00['frequency'].loc[dictionary_df00['frequency'] > 4].describe()

count    1.722840e+05
mean     4.860424e+02
std      6.250750e+03
min      5.000000e+00
25%      8.000000e+00
50%      1.800000e+01
75%      7.400000e+01
max      1.502803e+06
Name: frequency, dtype: float64

In [13]:
# Use the first quartile of the `frequency > 4` distribution as threshold
# (25% == 8 in the describe() output above), i.e. keep frequencies 5..8.
# The mask is built from `gt4_dictionary_df01` itself so that it shares the
# index of the frame being filtered, instead of relying on implicit alignment
# with the larger `dictionary_df00`.
# NOTE(review): this keeps the *least* frequent of the surviving words —
# confirm that is the intended direction of the filter.
final_dic = gt4_dictionary_df01.loc[gt4_dictionary_df01['frequency'] < 9]
len(final_dic)

47864

In [14]:
# Add a `normalised` column: every frequency divided by the constant 486.
final_dic_df01 = final_dic.assign(
    normalised=final_dic['frequency'].progress_apply(lambda count: count / 486)
)
final_dic_df01.head()

Progress:: 100%|██████████| 47864/47864 [00:00<00:00, 1165060.10it/s]


Unnamed: 0,word,frequency,normalised
124420,culturebound,8,0.016461
124421,gilded,8,0.016461
124422,adlibbing,8,0.016461
124423,autoread,8,0.016461
124424,thenardier,8,0.016461


### Begin noun filtering

In [15]:
# Preview the reviews again before splitting the key column.
df0.head()

Unnamed: 0,uniqueKey,reviewText
0,A2XQ5LZHTD4AFT##000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN##000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3##000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65##000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV##000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [16]:
# Split each "<userId>##<asin>" key at the first '##' into two columns.
# `n` is passed by keyword because recent pandas releases no longer accept it
# positionally, and `expand=True` returns a DataFrame directly while keeping
# df0's index, so the result stays row-aligned with df0 when concatenated.
df1 = df0['uniqueKey'].str.split('##', n=1, expand=True)
df1.columns = ['userId', 'asin']
df1.head()

Unnamed: 0,userId,asin
0,A2XQ5LZHTD4AFT,000100039X
1,AF7CSSGV93RXN,000100039X
2,A1NPNGWBVD9AK3,000100039X
3,A3IS4WGMFR4X65,000100039X
4,AWLFVCT9128JV,000100039X


In [17]:
# Single-column frame holding only the review tokens.
df_reviewText = df0['reviewText'].to_frame()
df_reviewText.head()

Unnamed: 0,reviewText
0,"[timeless, gibran, backs, content, means, ..."
1,"[ prophet, kahlil, gibran, thirty, years, ..."
2,"[ first, books, recall, collection, gibran..."
3,"[prophet, kahlil, work, world, million, c..."
4,"[gibran, khalil, gibran, born, one thousan..."


In [18]:
# Put the split key columns and the review tokens side by side.
df_new = pd.concat((df1, df_reviewText), axis='columns')
df_new.head()

Unnamed: 0,userId,asin,reviewText
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ..."
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ..."
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran..."
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c..."
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan..."


In [19]:
# Record how many tokens each review holds before filtering.
df_new_01 = df_new.assign(
    wordCountBefore=df_new['reviewText'].progress_apply(len)
)
df_new_01.head()

Progress:: 100%|██████████| 582711/582711 [00:00<00:00, 1218345.00it/s]


Unnamed: 0,userId,asin,reviewText,wordCountBefore
0,A2XQ5LZHTD4AFT,000100039X,"[timeless, gibran, backs, content, means, ...",49
1,AF7CSSGV93RXN,000100039X,"[ prophet, kahlil, gibran, thirty, years, ...",19
2,A1NPNGWBVD9AK3,000100039X,"[ first, books, recall, collection, gibran...",76
3,A3IS4WGMFR4X65,000100039X,"[prophet, kahlil, work, world, million, c...",142
4,AWLFVCT9128JV,000100039X,"[gibran, khalil, gibran, born, one thousan...",48


In [20]:
# Remove spaces from every dictionary word, then move the old index into an
# `index` column so that rows are numbered from 0.
final_dic_df01 = final_dic_df01.assign(
    word=final_dic_df01['word'].progress_apply(lambda token: token.replace(" ", ""))
).reset_index()
final_dic_df01.head()

Progress:: 100%|██████████| 47864/47864 [00:00<00:00, 1160869.72it/s]


Unnamed: 0,index,word,frequency,normalised
0,124420,culturebound,8,0.016461
1,124421,gilded,8,0.016461
2,124422,adlibbing,8,0.016461
3,124423,autoread,8,0.016461
4,124424,thenardier,8,0.016461


In [21]:
# Row position -> word, and its inverse word -> row position.
# The inverse is the mapping used for membership checks when filtering reviews.
filtered_dict = final_dic_df01['word'].to_dict()
inv_filtered_dict = dict(
    (word, position) for position, word in filtered_dict.items()
)
inv_filtered_dict

{'culturebound': 0,
 'gilded': 1,
 'adlibbing': 2,
 'autoread': 3,
 'thenardier': 4,
 'schuhart': 5,
 'swich': 6,
 'allignment': 7,
 'sullenbergers': 8,
 'joyed': 9,
 'loveatfirst': 10,
 'bamiyan': 11,
 'mft': 12,
 'huh': 13,
 'sandford': 14,
 'nathansons': 15,
 'sairas': 16,
 'balduccis': 17,
 'showpieces': 18,
 'richesons': 19,
 'kimelman': 20,
 'mopup': 21,
 'innis': 22,
 'shimamoto': 23,
 'yolande': 24,
 'stewpot': 25,
 'tristates': 26,
 'halfbillion': 27,
 'prejudicially': 28,
 'trashbin': 29,
 'bci': 30,
 'virgie': 31,
 'richeson': 32,
 'ismaes': 33,
 'massage': 34,
 'andechs': 35,
 'haberdashers': 36,
 'wanta': 37,
 'brickwork': 38,
 'otherthe': 39,
 'bf4': 40,
 'birnes': 41,
 'phosphates': 42,
 'corners': 43,
 'forsworn': 44,
 'expertize': 45,
 'asaph': 46,
 'lky': 47,
 'dorfmans': 48,
 'ranes': 49,
 'panarab': 50,
 'bassackwards': 51,
 'doubleminded': 52,
 'milisant': 53,
 'financialneeds': 54,
 'doiron': 55,
 'eyestones': 56,
 'belov': 57,
 'aleut': 58,
 'lothrop': 59,
 'warh

In [None]:
def filter_words(review, vocabulary=None):
    """Return the words of ``review`` that belong to ``vocabulary``.

    Each word is stripped of surrounding whitespace before the membership
    test, and the stripped form is what ends up in the result. Order and
    duplicates are preserved.

    Parameters
    ----------
    review : iterable of str
        Tokens of a single review.
    vocabulary : container of str, optional
        Anything supporting ``in`` (dict, set, ...). Defaults to the
        notebook-level ``inv_filtered_dict`` so existing one-argument calls
        keep working.

    Returns
    -------
    list of str
        The stripped words that were found in ``vocabulary``.
    """
    if vocabulary is None:
        # Resolved at call time so the function follows the notebook's
        # current dictionary when no explicit vocabulary is given.
        vocabulary = inv_filtered_dict
    stripped_words = (word.strip() for word in review)
    return [word for word in stripped_words if word in vocabulary]

In [None]:
# Keep, per review, only the tokens accepted by `filter_words`.
df_new_02 = df_new_01.assign(
    filteredText=df_new_01['reviewText'].progress_apply(filter_words)
)

Progress::  87%|████████▋ | 506106/582711 [00:09<00:01, 60786.18it/s]

In [None]:
# Record how many tokens each review holds after filtering.
df_new_03 = df_new_02.assign(
    wordCountAfter=df_new_02['filteredText'].progress_apply(len)
)
# Show a preview only, like the other cells, instead of rendering the full frame.
df_new_03.head()

In [None]:
# Share of tokens removed by the filter: 1 - (tokens after / tokens before),
# computed over the whole dataset. Despite its name, `remaining` therefore
# holds the removed fraction (it is printed as the "reduction" below).
remaining = 1 - df_new_03['wordCountAfter'].sum() / df_new_03['wordCountBefore'].sum()

In [None]:
# Report the removed share as a percentage.
print("Average noun reduction achieved:", str(remaining * 100), "%", sep="")

## Association Rules Mining Filtering

In [None]:
# One row per book (`asin`): the list of its filtered reviews, plus the number
# of reviews (`transactions`) in that list.
df_books_bigReviews = pd.DataFrame(
    df_new_03[['asin', 'filteredText']]
    .groupby(['asin'])['filteredText']
    .progress_apply(list)
).reset_index()
df_books_bigReviews = df_books_bigReviews.assign(
    transactions=df_books_bigReviews['filteredText'].progress_apply(len)
)
df_books_bigReviews.head()

In [None]:
from apyori import apriori

# Support
# Support is an indication of how frequently the itemset appears in the dataset.
# Confidence
# Confidence is an indication of how often the rule has been found to be true.
# Lift
# The ratio of the observed support to that expected if X and Y were independent.
def apply_arm(transactions):
    """Run apriori over ``transactions`` and return the results as a list.

    The thresholds are derived from the number of transactions ``n``:
    ``min_support = 1/n`` (an itemset only has to occur in one transaction),
    ``min_confidence = 1``, ``min_lift = n`` and itemsets of at most 4 items.

    Parameters
    ----------
    transactions : sized collection of collections of items
        One inner collection per transaction. Must support ``len``.

    Returns
    -------
    list
        The records yielded by ``apriori``; an empty list when there are no
        transactions.
    """
    n_transactions = len(transactions)
    if n_transactions == 0:
        # 1 / n_transactions would raise ZeroDivisionError, and there is
        # nothing to mine anyway.
        return []
    return list(
        apriori(
            transactions,
            min_support=1 / n_transactions,
            min_confidence=1,
            min_lift=n_transactions,
            max_length=4,
        )
    )

In [None]:
# Mine association rules for the first 1000 books only.
books_with_arm = df_books_bigReviews.iloc[:1000].assign(
    arm=df_books_bigReviews['filteredText'].iloc[:1000].progress_apply(apply_arm)
)
books_with_arm.head()

In [None]:
def get_important_nouns(results):
    """Flatten every itemset that holds more than three items.

    Parameters
    ----------
    results : iterable of iterables
        Itemsets, e.g. the ``items`` of each apriori record.

    Returns
    -------
    list
        The members of all itemsets with more than three items, concatenated
        in the order the itemsets are given. Duplicates are kept.
    """
    imp_nns = []
    for result in results:
        # Materialise once: serves both the size check and the copy.
        items = list(result)
        if len(items) > 3:
            # extend() appends in place, avoiding a new list per itemset.
            imp_nns.extend(items)
    return imp_nns

In [None]:
# Collect, per book, the nouns of every mined itemset with more than 3 items.
# The `items` field is read straight from each apriori record: going through
# pd.DataFrame(arms)['items'] raises KeyError for books with no mined rules,
# because an empty frame has no `items` column.
imp_nns_df = books_with_arm.assign(
    imp_nns=books_with_arm['arm'].progress_apply(
        lambda arms: get_important_nouns([record.items for record in arms])
    )
)
imp_nns_df.head()

In [None]:
# Keep only the book id and its important nouns.
imp_nns_df = imp_nns_df.loc[:, ['asin', 'imp_nns']]
imp_nns_df.head()

In [None]:
# Persist the per-book important nouns (columns: asin, imp_nns).
imp_nns_df.to_pickle("../data/interim/005_important_nouns.p")

In [None]:
# END OF FILE