In [None]:
import plotly 
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
# Show which cufflinks version is in use
print(cf.__version__)
# Configure cufflinks (offline=False, world-readable, 'pearl' theme)
cf.set_config_file(offline=False, world_readable=True, theme='pearl')

In [None]:
import nltk
# nltk.download()
# !pip install autocorrect
from autocorrect import spell

In [None]:
import pandas as pd
import numpy as np
import gzip

In [None]:
# For monitoring duration of pandas processes
from tqdm import tqdm, tqdm_pandas

# Setting the monitor interval to 0 is the workaround used here to avoid
# "RuntimeError: Set changed size during iteration"
tqdm.monitor_interval = 0

# Register the `progress_apply` / `progress_map` methods on pandas objects
# (can use `tqdm_gui`, `tqdm_notebook`, optional kwargs, etc.)
tqdm.pandas(desc="Progress:")

# Now you can use `progress_apply` instead of `apply`
# and `progress_map` instead of `map`
# can also groupby:
# df.groupby(0).progress_apply(lambda x: x**2)

### Data Loading

In [None]:
def parse(path):
  """Yield one object per line of a gzip-compressed file.

  Each line is evaluated as a Python expression and the result is yielded,
  so the file is streamed lazily instead of being loaded in one go.

  path -- location of the gzip file to read.
  """
  # The context manager closes the file when the generator is exhausted or
  # closed; previously the handle was never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY NOTE(review): eval() executes whatever code the line
      # contains. Only run this on trusted files; consider
      # ast.literal_eval / json.loads if the line format allows it.
      yield eval(l)

In [None]:
def getDF(path):
  """Collect every record yielded by `parse(path)` into a DataFrame.

  Records are keyed by their 0-based position in the stream; those positions
  become the index of the returned frame (one row per record).
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

In [None]:
# Load the gzip-compressed review file into a DataFrame via getDF.
# NOTE(review): absolute, user-specific path -- this cell only runs on a
# machine that has the file at exactly this location.
df = getDF('/Users/falehalrashidi/Downloads/reviews_Books_5.json.gz')

In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Keep only the four columns that the cells below rely on
df1 = df[['reviewerID','asin','reviewText','helpful']]

In [None]:
# Preview the reduced frame
df1.head()

In [None]:
# Number of rows in the reduced frame
len(df1)

In [None]:
# Create a new column `denom` holding element [1] of each `helpful` entry
# (presumably `helpful` is a [numerator, denominator] pair -- TODO confirm)
df2 = df1.assign(denom = df1['helpful'].progress_apply(lambda enum_denom:enum_denom[1]))

In [None]:
# Preview the frame with the new `denom` column
df2.head()

In [None]:
# Create a `uniqueKey` column: reviewerID and asin joined with '##'.
# NOTE(review): the key is built from `df` rather than `df2`; this relies on
# both frames sharing the same index -- confirm that is intended.
df3 = df2.assign(uniqueKey = df['reviewerID'].str.cat(df['asin'].values.astype(str), sep='##'))

In [None]:
# Preview the frame with the new `uniqueKey` column
df3.head()

In [None]:
# Keep only the columns necessary for the normalisation:
# the key identifying each review and the review text itself
df4 = df3[['uniqueKey', 'reviewText']]
df4.head()

The next step was necessary because of unexpected `KeyError`s raised when processing the review text as a `pandas.DataFrame` rather than as a `pandas.Series`. After experimenting with both, I found that `pandas.Series.apply` is faster than `pandas.DataFrame.apply`, so from here on I will work with `pandas.Series`.

The assumption I need to make at this point is that `pandas` will not change the index of the reviews while they are being processed by my code, so that at the end of the processing I will be able to re-associate each review with its **uniqueKey**.

In [None]:
# Set the keys aside so they can be re-attached after processing.
# Double brackets select a one-column DataFrame (not a Series, despite the name).
uniqueKey_series_df = df4[['uniqueKey']]
uniqueKey_series_df.head()

In [None]:
# Keep only the text before the first "\n" of each review, then wrap the
# resulting Series back into a one-column DataFrame.
reviews_df = pd.DataFrame(df4['reviewText'].progress_apply(lambda review: review.split("\n")[0]))
reviews_df.head()

### Data Normalisation
<span style="color:red">**NOTICE:** As I am in a hurry to complete the first pipeline, I will only work with the first 100k reviews.</span>

* Tokenization <span style="color:blue"> DONE </span>
* Convert All Tokens to Lowercase <span style="color:blue"> DONE </span>
* Eliminate Punctuation <span style="color:blue"> DONE </span>
* Remove Stop Words <span style="color:blue"> DONE </span>
* Changing Numbers into Words <span style="color:blue"> DONE </span>
* Expand Abbreviations <span style="color:red"> NOT AS EASY AS I THOUGHT AND DOES NOT ADD MUCH VALUE (Ask Stasha's opinion)</span> 
* Correct Spelling <span style="color:red"> TOO SLOW (10h for 100k reviews)-->SO WONT DO</span>
* Substituting Tokens with Synonyms <span style="color:green"> TO DO</span>
* Semantical Marking of Negatives <span style="color:blue"> DONE (Ask Stasha's opinion) </span>

### Tokenization

In [None]:
# Number of reviews pushed through the pipeline (the notice above says 100k).
N_REVIEWS = 100000

# `.iloc[:N_REVIEWS]` is explicitly positional and end-exclusive, so it yields
# exactly N_REVIEWS rows; the previous `[0:99999]` slice stopped one review short.
step_0_df = reviews_df['reviewText'].iloc[:N_REVIEWS].progress_apply(lambda review: nltk.word_tokenize(review))
step_0_df.head()

### Convert Tokens to Lowercase

In [None]:
import re
import string

def convert_to_lowercase(review):
    """Return a new list with every token of `review` lower-cased.

    review -- iterable of string tokens.

    The input is left untouched. The previous version overwrote the caller's
    list in place, so the Series that was passed to `progress_apply` silently
    changed as well.
    """
    return [token.lower() for token in review]

In [None]:
# Lower-case the tokens of every tokenized review
step_1_df = step_0_df.progress_apply(convert_to_lowercase)
step_1_df.head()

### Eliminate Punctuation

In [None]:
import re
import string

def eliminate_punctuation(review, regex):
    """Strip everything `regex` matches from each token of `review`.

    review -- iterable of string tokens.
    regex  -- compiled pattern whose matches are removed from every token.

    Returns a new list; tokens that end up empty are dropped.
    """
    stripped_tokens = (regex.sub(u'', token) for token in review)
    return [token for token in stripped_tokens if token != u'']

In [None]:
# Character class matching any single character of string.punctuation;
# compiled once and reused for every review
regex=re.compile('[%s]' % re.escape(string.punctuation))

# Remove punctuation from every token; tokens left empty are dropped
step_2_df = step_1_df.progress_apply(lambda review: eliminate_punctuation(review, regex))
step_2_df.head()

In [None]:
### Remove Stop Words
from nltk.corpus import stopwords
# English stop words held in a set for membership tests in remove_stopwords.
# NOTE(review): if this list contains negation words such as 'no' / 'not',
# they are dropped here before the negation-marking step further down ever
# sees them -- confirm against the NLTK list.
stop_words = set(stopwords.words('english'))

def remove_stopwords(review):
    """Return the tokens of `review` that are absent from the module-level
    `stop_words` set, in their original order."""
    kept_tokens = []
    for token in review:
        if token not in stop_words:
            kept_tokens.append(token)
    return kept_tokens

# Drop the stop words from every review
step_3_df = step_2_df.progress_apply(remove_stopwords)
step_3_df.head()

### Changing Numbers into Words

In [None]:
import inflect
# Single inflect engine, passed to numStringToWord when step_4_df is built
p = inflect.engine()

def numStringToWord(review, p):
    """Return a new token list with digit-only tokens spelled out as words.

    review -- iterable of string tokens.
    p      -- object exposing `number_to_words(token)`; the notebook passes
              an `inflect` engine.

    Tokens for which `str.isdigit()` is false are passed through unchanged.
    The input is left untouched. The previous version overwrote the caller's
    list in place, so the Series that was passed to `progress_apply` silently
    changed as well.
    """
    return [p.number_to_words(token) if token.isdigit() else token
            for token in review]

In [None]:
# Spell out digit-only tokens in every review using the inflect engine `p`
step_4_df = step_3_df.progress_apply(lambda review: numStringToWord(review, p))
step_4_df.head()

### Correct Spelling

In [None]:
# from autocorrect import spell

# def spellCheck(review):

#     for i in range(len(review)):
#         review[i] = spell(review[i])
#     return review

In [None]:
# step_5_df = step_4_df.progress_apply(lambda review: spellCheck(review))
# step_5_df.head()

### Substituting Tokens with Synonyms

In [None]:
#TODO

### Semantical Marking of Negatives

In [None]:
# Compiled once at definition time instead of on every call.
# Matches a token that is exactly one of the listed negation words, or that
# starts with the contraction suffix "n't". The exact token "nt" is included
# because the punctuation-elimination step earlier in this notebook strips the
# apostrophe, so the raw "n't" form never reaches this function.
_NEGATION_RE = re.compile(r"""(?x)
    (?:^(?:never|no|nothing|nowhere|noone|none|not|havent|
           hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|
           doesnt|didnt|isnt|arent|aint|nt)$)
    | n't""")

# Matches a token that is exactly one of the words that end a negated span.
_ALTER_RE = re.compile(r"""(?x)(?:^(?:but|however|nevertheless|still|though|tho|yet)$)""")


def negation_tokenizer(review):
    """Append "_NEG" to the tokens that follow a negation word.

    review -- iterable of string tokens.

    The tokens are scanned left to right with a single on/off flag:
      * flag off: a negation token switches the flag on;
      * flag on:  an alteration token ("but", "however", ...) or a second
                  negation token switches it off again;
      * flag on, any other token: the token gets the "_NEG" suffix.
    The tokens that switch the flag are themselves emitted unchanged.

    Returns a new list; the input is not modified.

    NOTE(review): the stop-word step runs before this one; if its list
    contains 'no' / 'not', those negations never reach this function --
    confirm the intended step order.
    """
    neg_review_tokens = []
    append_neg = False  # True while inside a negated span

    for token in review:
        if not append_neg:
            # Outside a negated span: only a negation word changes state.
            if _NEGATION_RE.match(token):
                append_neg = True
        elif _ALTER_RE.match(token) or _NEGATION_RE.match(token):
            # An alteration word or a second negation closes the span.
            append_neg = False
        else:
            token += "_NEG"

        neg_review_tokens.append(token)

    return neg_review_tokens

In [None]:
# Mark the tokens that follow a negation in every review
step_5_df = step_4_df.progress_apply(negation_tokenizer)
step_5_df.head()

In [None]:
# Spot-check the fully processed tokens of the entry at index 1000
step_5_df[1000]