In [1]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
print(cf.__version__)
# Configure cufflings 
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

0.12.1


In [2]:
import nltk
# nltk.download()
# !pip install autocorrect
from autocorrect import spell

In [3]:
import pandas as pd
import numpy as np
import gzip

In [4]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# To avoid RuntimeError: Set changed size during iteration
tqdm.monitor_interval = 0

# Register `pandas.progress_apply` and `pandas.Series.map_apply` with `tqdm`
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

### Data Loading

In [5]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

In [6]:
def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [7]:
df = getDF('/Users/falehalrashidi/Downloads/reviews_Books_5.json.gz')

In [8]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A10000012B7CGYKOMPQ4L,000100039X,Adam,"[0, 0]",Spiritually and mentally inspiring! A book tha...,5.0,Wonderful!,1355616000,"12 16, 2012"
1,A2S166WSCFIFP5,000100039X,"adead_poet@hotmail.com ""adead_poet@hotmail.com""","[0, 2]",This is one my must have books. It is a master...,5.0,close to god,1071100800,"12 11, 2003"
2,A1BM81XB4QHOA3,000100039X,"Ahoro Blethends ""Seriously""","[0, 0]",This book provides a reflection that you can a...,5.0,Must Read for Life Afficianados,1390003200,"01 18, 2014"
3,A1MOSTXNIO5MPJ,000100039X,Alan Krug,"[0, 0]",I first read THE PROPHET in college back in th...,5.0,Timeless for every good and bad time in your l...,1317081600,"09 27, 2011"
4,A2XQ5LZHTD4AFT,000100039X,Alaturka,"[7, 9]",A timeless classic. It is a very demanding an...,5.0,A Modern Rumi,1033948800,"10 7, 2002"


In [9]:
df1 = df[['reviewerID','asin','reviewText','helpful']]

In [10]:
df1.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,"[0, 0]"
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,"[0, 2]"
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,"[0, 0]"
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,"[0, 0]"
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]"


In [11]:
len(df1)

8898041

In [12]:
# Create new Column for the denominator
df2 = df1.assign(denom = df1['helpful'].progress_apply(lambda enum_denom:enum_denom[1]))

Progress:: 100%|██████████| 8898041/8898041 [00:06<00:00, 1405510.05it/s]


In [13]:
df2.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful,denom
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,"[0, 0]",0
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,"[0, 2]",2
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,"[0, 0]",0
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,"[0, 0]",0
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]",9


In [14]:
# Create a Uniquekey Column
df3 = df2.assign(uniqueKey = df['reviewerID'].str.cat(df['asin'].values.astype(str), sep='##'))

In [15]:
df3.head()

Unnamed: 0,reviewerID,asin,reviewText,helpful,denom,uniqueKey
0,A10000012B7CGYKOMPQ4L,000100039X,Spiritually and mentally inspiring! A book tha...,"[0, 0]",0,A10000012B7CGYKOMPQ4L##000100039X
1,A2S166WSCFIFP5,000100039X,This is one my must have books. It is a master...,"[0, 2]",2,A2S166WSCFIFP5##000100039X
2,A1BM81XB4QHOA3,000100039X,This book provides a reflection that you can a...,"[0, 0]",0,A1BM81XB4QHOA3##000100039X
3,A1MOSTXNIO5MPJ,000100039X,I first read THE PROPHET in college back in th...,"[0, 0]",0,A1MOSTXNIO5MPJ##000100039X
4,A2XQ5LZHTD4AFT,000100039X,A timeless classic. It is a very demanding an...,"[7, 9]",9,A2XQ5LZHTD4AFT##000100039X


In [16]:
# Keep only the columns necessary for the normalisation
df4 = df3[['uniqueKey', 'reviewText']]
df4.head()

Unnamed: 0,uniqueKey,reviewText
0,A10000012B7CGYKOMPQ4L##000100039X,Spiritually and mentally inspiring! A book tha...
1,A2S166WSCFIFP5##000100039X,This is one my must have books. It is a master...
2,A1BM81XB4QHOA3##000100039X,This book provides a reflection that you can a...
3,A1MOSTXNIO5MPJ##000100039X,I first read THE PROPHET in college back in th...
4,A2XQ5LZHTD4AFT##000100039X,A timeless classic. It is a very demanding an...


The next step was necessary due to weird keyErrors that followed after trying to process the reviewText as a `pandas.DataFrame` and not as `pandas.Series`. After experimenting with both, I found that `pandas.Series.apply` is faster than `pandas.DataFrame.apply` and so I will hence work with `pandas.Series`. 

The assumption I require to make at this point before I follow is that `pandas` will not change the index of the reviews as those are being processed by my code and that in the end of my processing I will be able to re-associate those reviews with their **uniqueKey**. 

In [17]:
uniqueKey_series_df = df4[['uniqueKey']]
uniqueKey_series_df.head()

Unnamed: 0,uniqueKey
0,A10000012B7CGYKOMPQ4L##000100039X
1,A2S166WSCFIFP5##000100039X
2,A1BM81XB4QHOA3##000100039X
3,A1MOSTXNIO5MPJ##000100039X
4,A2XQ5LZHTD4AFT##000100039X


In [18]:
reviews_df = pd.DataFrame(df4['reviewText'].progress_apply(lambda review: review.split("\n")[0]))
reviews_df.head()

Progress:: 100%|██████████| 8898041/8898041 [00:09<00:00, 985818.68it/s] 


Unnamed: 0,reviewText
0,Spiritually and mentally inspiring! A book tha...
1,This is one my must have books. It is a master...
2,This book provides a reflection that you can a...
3,I first read THE PROPHET in college back in th...
4,A timeless classic. It is a very demanding an...


### Data Normalisation
<span style="color:red">**NOTICE:** As I am in a hurry to complete the first pipeline, I will only work with the first 100k reviews.</span>

* Tokenization <span style="color:blue"> DONE </span>
* Convert All Tokens to Lowercase <span style="color:blue"> DONE </span>
* Eliminate Punctuation <span style="color:blue"> DONE </span>
* Remove Stop Words <span style="color:blue"> DONE </span>
* Changing Numbers into Words <span style="color:blue"> DONE </span>
* Expand Abbreviations <span style="color:red"> NOT AS EASY AS I THOUGHT AND DOES NOT ADD MUCH VALUE (Ask Stasha's opinion)</span> 
* Correct Spelling <span style="color:red"> TOO SLOW (10h for 100k reviews)-->SO WONT DO</span>
* Substituting Tokens with Synonyms <span style="color:green"> TO DO</span>
* Semantical Marking of Negatives <span style="color:blue"> DONE (Ask Stasha's opinion) </span>

### Tokenization

In [19]:
step_0_df = reviews_df['reviewText'][0:99999].progress_apply(lambda review: nltk.word_tokenize(review))
step_0_df.head()

Progress:: 100%|██████████| 99999/99999 [01:18<00:00, 1274.79it/s]


0    [Spiritually, and, mentally, inspiring, !, A, ...
1    [This, is, one, my, must, have, books, ., It, ...
2    [This, book, provides, a, reflection, that, yo...
3    [I, first, read, THE, PROPHET, in, college, ba...
4    [A, timeless, classic, ., It, is, a, very, dem...
Name: reviewText, dtype: object

### Convert Tokens to Lowercase

In [20]:
import re
import string

def convert_to_lowercase(review):

    for i in range(len(review)):
        review[i] = review[i].lower()
    return review

In [21]:
step_1_df = step_0_df.progress_apply(lambda review: convert_to_lowercase(review))
step_1_df.head()

Progress:: 100%|██████████| 99999/99999 [00:01<00:00, 51196.29it/s]


0    [spiritually, and, mentally, inspiring, !, a, ...
1    [this, is, one, my, must, have, books, ., it, ...
2    [this, book, provides, a, reflection, that, yo...
3    [i, first, read, the, prophet, in, college, ba...
4    [a, timeless, classic, ., it, is, a, very, dem...
Name: reviewText, dtype: object

### Eliminate Punctuation

In [22]:
import re
import string

def eliminate_punctuation(review, regex):
    new_review = []
    for token in review:
        new_token = regex.sub(u'', token)
        if not new_token == u'':
            new_review.append(new_token)
    return new_review

In [23]:
regex=re.compile('[%s]' % re.escape(string.punctuation))

step_2_df = step_1_df.progress_apply(lambda review: eliminate_punctuation(review, regex))
step_2_df.head()

Progress:: 100%|██████████| 99999/99999 [00:05<00:00, 17621.35it/s]


0    [spiritually, and, mentally, inspiring, a, boo...
1    [this, is, one, my, must, have, books, it, is,...
2    [this, book, provides, a, reflection, that, yo...
3    [i, first, read, the, prophet, in, college, ba...
4    [a, timeless, classic, it, is, a, very, demand...
Name: reviewText, dtype: object

In [32]:
### Remove Stop Words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(review):
    return [token for token in review if not token in stop_words]

step_3_df = step_2_df.progress_apply(lambda review: remove_stopwords(review))
step_3_df.head()


Progress::   0%|          | 0/99999 [00:00<?, ?it/s][A
Progress::   8%|▊         | 8147/99999 [00:00<00:01, 81468.33it/s][A
Progress::  16%|█▌        | 15553/99999 [00:00<00:01, 79093.80it/s][A
Progress::  23%|██▎       | 23455/99999 [00:00<00:00, 79071.16it/s][A
Progress::  33%|███▎      | 33062/99999 [00:00<00:00, 83502.64it/s][A
Progress::  44%|████▎     | 43562/99999 [00:00<00:00, 88967.06it/s][A
Progress::  51%|█████▏    | 51313/99999 [00:00<00:00, 85187.76it/s][A
Progress::  59%|█████▉    | 59013/99999 [00:00<00:00, 80181.36it/s][A
Progress::  67%|██████▋   | 66611/99999 [00:00<00:00, 78865.55it/s][A
Progress::  74%|███████▍  | 74144/99999 [00:00<00:00, 76100.16it/s][A
Progress::  83%|████████▎ | 82817/99999 [00:01<00:00, 79004.64it/s][A
Progress::  91%|█████████ | 90587/99999 [00:01<00:00, 77353.79it/s][A
Progress::  98%|█████████▊| 98239/99999 [00:01<00:00, 76796.10it/s][A
Progress:: 100%|██████████| 99999/99999 [00:01<00:00, 80127.21it/s][A

0    [spiritually, mentally, inspiring, book, allow...
1    [one, must, books, masterpiece, spirituality, ...
2    [book, provides, reflection, apply, lifeand, w...
3    [first, read, prophet, college, back, 60, book...
4    [timeless, classic, demanding, assuming, title...
Name: reviewText, dtype: object

### Changing Numbers into Words

In [34]:
import inflect
p = inflect.engine()

def numStringToWord(review, p):

    for i in range(len(review)):
        if(review[i].isdigit()):
            review[i] = p.number_to_words(review[i])
    return review

In [35]:
step_4_df = step_3_df.progress_apply(lambda review: numStringToWord(review, p))
step_4_df.head()


Progress::   0%|          | 0/99999 [00:00<?, ?it/s][A
Progress::   6%|▌         | 5623/99999 [00:00<00:01, 56228.85it/s][A
Progress::  11%|█         | 11007/99999 [00:00<00:01, 55488.85it/s][A
Progress::  16%|█▌        | 15943/99999 [00:00<00:01, 53495.60it/s][A
Progress::  21%|██▏       | 21294/99999 [00:00<00:01, 53497.37it/s][A
Progress::  27%|██▋       | 27253/99999 [00:00<00:01, 55190.19it/s][A
Progress::  34%|███▎      | 33695/99999 [00:00<00:01, 57666.60it/s][A
Progress::  40%|███▉      | 39520/99999 [00:00<00:01, 57839.75it/s][A
Progress::  45%|████▍     | 44872/99999 [00:00<00:01, 50725.04it/s][A
Progress::  50%|████▉     | 49801/99999 [00:00<00:01, 50064.28it/s][A
Progress::  55%|█████▍    | 54709/99999 [00:01<00:00, 47386.23it/s][A
Progress::  60%|█████▉    | 59761/99999 [00:01<00:00, 48276.33it/s][A
Progress::  65%|██████▍   | 64566/99999 [00:01<00:00, 47776.47it/s][A
Progress::  69%|██████▉   | 69329/99999 [00:01<00:00, 46729.23it/s][A
Progress::  74%|█████

0    [spiritually, mentally, inspiring, book, allow...
1    [one, must, books, masterpiece, spirituality, ...
2    [book, provides, reflection, apply, lifeand, w...
3    [first, read, prophet, college, back, sixty, b...
4    [timeless, classic, demanding, assuming, title...
Name: reviewText, dtype: object

### Correct Spelling

In [None]:
# from autocorrect import spell

# def spellCheck(review):

#     for i in range(len(review)):
#         review[i] = spell(review[i])
#     return review

In [None]:
# step_5_df = step_4_df.progress_apply(lambda review: spellCheck(review))
# step_5_df.head()

### Substituting Tokens with Synonyms

In [None]:
#TODO

### Semantical Marking of Negatives

In [36]:
def negation_tokenizer(review):

    # regex to match negation tokens
    negation_re = re.compile("""(?x)(?:^(?:never|no|nothing|nowhere|noone|none|not|havent|
            hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|
            doesnt|didnt|isnt|arent|aint)$)|n't""")

    alter_re = re.compile("""(?x)(?:^(?:but|however|nevertheless|still|though|tho|yet)$)""")

    neg_review_tokens = []
    append_neg = False  # stores whether to add "_NEG"

    for token in review:

        # If append_neg is False
        if append_neg == False:

            # Check if the current token is a negation
            if negation_re.match(token):
                append_neg = True

        # but if a negation has been previously identified, check if this is an  alteration
        elif alter_re.match(token):
            append_neg = False

        # or if another negation appears
        elif negation_re.match(token):
            append_neg = False

        # and if not then append the suffix
        else:
            token += "_NEG"

        # append the new token in the return list
        neg_review_tokens.append(token)

    return neg_review_tokens

In [37]:
step_5_df = step_4_df.progress_apply(lambda review: negation_tokenizer(review))
step_5_df.head()


Progress::   0%|          | 0/99999 [00:00<?, ?it/s][A
Progress::   4%|▎         | 3716/99999 [00:00<00:02, 37158.53it/s][A
Progress::   6%|▋         | 6337/99999 [00:00<00:02, 33018.73it/s][A
Progress::   9%|▉         | 9143/99999 [00:00<00:02, 31349.95it/s][A
Progress::  12%|█▏        | 11741/99999 [00:00<00:02, 29519.40it/s][A
Progress::  14%|█▍        | 14309/99999 [00:00<00:03, 28235.97it/s][A
Progress::  17%|█▋        | 16933/99999 [00:00<00:03, 27601.99it/s][A
Progress::  20%|█▉        | 19896/99999 [00:00<00:02, 28177.42it/s][A
Progress::  23%|██▎       | 22581/99999 [00:00<00:02, 27765.38it/s][A
Progress::  26%|██▌       | 25953/99999 [00:00<00:02, 29318.12it/s][A
Progress::  30%|██▉       | 29760/99999 [00:01<00:02, 31485.19it/s][A
Progress::  33%|███▎      | 32879/99999 [00:01<00:02, 30185.61it/s][A
Progress::  38%|███▊      | 37692/99999 [00:01<00:01, 33985.66it/s][A
Progress::  41%|████      | 41242/99999 [00:01<00:01, 33831.64it/s][A
Progress::  45%|████▍  

0    [spiritually, mentally, inspiring, book, allow...
1    [one, must, books, masterpiece, spirituality, ...
2    [book, provides, reflection, apply, lifeand, w...
3    [first, read, prophet, college, back, sixty, b...
4    [timeless, classic, demanding, assuming, title...
Name: reviewText, dtype: object

In [38]:
step_5_df[1000]

['delightful',
 'read',
 'water',
 'elephants',
 'got',
 'one',
 'best',
 'reads',
 'anyone',
 'likes',
 'animals',
 'circuses',
 'wonderfully',
 'flowing',
 'story',
 'working',
 'circus',
 'especially',
 'past',
 'years',
 'one',
 'grueling',
 'tough',
 'jobs',
 'tackle',
 'sara',
 'gruen',
 'makes',
 'reader',
 'smell',
 'circus',
 'smells',
 'taste',
 'midway',
 'foods',
 'ring',
 'animals',
 'entertain',
 'circus',
 'since',
 'teenager',
 'many',
 'years',
 'ago',
 'water',
 'elephants',
 'took',
 'back',
 'days',
 'reminding',
 'things',
 'saw',
 'smelledhis',
 'family',
 'placed',
 'jacob',
 'jankowski',
 'home',
 'old',
 'folks',
 'none',
 'wanted_NEG',
 'take_NEG',
 'care_NEG',
 'sad_NEG',
 'true_NEG',
 'jacob_NEG',
 'sure_NEG',
 'ninety_NEG',
 'ninetythree_NEG',
 'years_NEG',
 'age_NEG',
 'knows_NEG',
 'age_NEG',
 'somewhere_NEG',
 'around_NEG',
 'range_NEG',
 'jacob_NEG',
 'hates_NEG',
 'old_NEG',
 'people_NEG',
 'home_NEG',
 'hates_NEG',
 'food_NEG',
 'hates_NEG',
 'nurses_