In [41]:
%run lib.ipynb import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sshre35\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is

In [4]:
# Directory prefix for every CSV read or written below. Paths are built by plain
# string concatenation (DATA_DIR + domain + suffix), so the trailing slash matters.
DATA_DIR = "../data/"
# Domain keys: each names a "<domain>.csv" input file under DATA_DIR and is reused
# as the dictionary key for that domain's data in later cells.
domains = ["ride", "investing", "health"]

In [36]:
# 3. preprocess - TOKENIZE, REMOVE STOPWORDS, REMOVE (PUNCTUATION, DOUBLE QUOTES)
from collections import Counter

def remove_lower_tf_tokens(reviews_word_arr, min_freq = 3):
    """Flatten reviews into one row per sentence and drop stop words and rare tokens.

    Meant to be applied to review words before lemmatization.

    Args:
        reviews_word_arr: iterable of ``(raw_review, sent_words)`` pairs, where
            ``sent_words`` is a list of ``(raw_sentence, words)`` pairs and
            ``words`` is a flat list of tokens.
        min_freq: exclusive frequency threshold. A token is kept only when its
            count across every sentence passed in (taken after stop-word
            removal) is strictly greater than ``min_freq``; with the default of
            3 a token has to occur at least 4 times.

    Returns:
        A list of ``(raw_review, raw_sentence, kept_words)`` tuples, one per
        input sentence, in input order. A sentence whose tokens are all removed
        is still returned, with an empty word list.
    """
    # Pass 1: strip stop words sentence by sentence and count what survives.
    # Counting incrementally avoids materialising one big list of every token.
    word_counts = Counter()
    sentence_rows = []
    for review, sent_words in reviews_word_arr:
        for raw_sent, words in sent_words:
            valid_words = [word for word in words if word not in custom_stop_words]
            word_counts.update(valid_words)
            sentence_rows.append((review, raw_sent, valid_words))

    # Pass 2: keep only sufficiently frequent tokens. Stop words were already
    # removed in pass 1, so they do not need to be tested for again here.
    return [
        (raw_review, raw_sent, [word for word in words if word_counts[word] > min_freq])
        for raw_review, raw_sent, words in sentence_rows
    ]

def extract_sent_words(text, remove_numbers=True):
    """Split a review into sentences and tokenize each sentence.

    Sentence splitting runs on the raw ``text`` (``clean_text`` is not applied),
    so each returned sentence is exactly what ``nltk.sent_tokenize`` produced
    from the input.

    Args:
        text: raw review text.
        remove_numbers: currently unused; kept so existing callers keep working.

    Returns:
        A list of ``(raw_sentence, words)`` tuples, one per sentence that still
        has at least one word after filtering. ``words`` is the result of
        lower-casing the sentence, tokenizing it with ``nltk_tokenize``,
        removing ``custom_stop_words`` and passing the tokens through
        ``clean_sent_words(tokens, 2)``.
    """
    sent_tokens = []
    for sent in nltk.sent_tokenize(text):
        tokens = nltk_tokenize(sent.lower())
        tokens = nltk_remove_stopwords(tokens, custom_stop_words)
        # NOTE(review): the original comment says this keeps words of size more
        # than 2; clean_sent_words is defined outside this notebook - confirm.
        filtered_words = clean_sent_words(tokens, 2)
        # Skip sentences that have no words left after filtering.
        if filtered_words:
            sent_tokens.append((sent, filtered_words))
    return sent_tokens

def break_sent_into_words(review_text):
    """Pair a raw review with its per-sentence tokens.

    Returns a ``(review_text, sentences)`` tuple, where ``sentences`` is the
    value ``extract_sent_words`` returns for this review.
    """
    return review_text, extract_sent_words(review_text)

def break_sentences_into_words(reviews_arr):
    """Run ``break_sent_into_words`` on every review, preserving input order.

    Returns a list with one ``break_sent_into_words`` result per review.
    """
    tokenized = []
    for review_text in reviews_arr:
        tokenized.append(break_sent_into_words(review_text))
    return tokenized

In [37]:
def preprocess_reviews_arr(reviews_arr, lemma = True):
    """Tokenize reviews into sentence rows, drop rare tokens, optionally lemmatize.

    Args:
        reviews_arr: iterable of raw review strings.
        lemma: when true, each row's word list is replaced by the result of
            ``nltk_lemmatize_post_tag_rev_words`` applied to it.

    Returns:
        A list of ``(raw_review, raw_sentence, words)`` tuples as produced by
        ``remove_lower_tf_tokens``, with the third field lemmatized when
        ``lemma`` is true.
    """
    sentence_rows = remove_lower_tf_tokens(break_sentences_into_words(reviews_arr))
    if not lemma:
        return sentence_rows
    return [
        (review, sentence, nltk_lemmatize_post_tag_rev_words(words))
        for review, sentence, words in sentence_rows
    ]

In [38]:
def load_raw_reviews(input_csv):
    """Load a reviews CSV and strip non-ASCII characters from the review text.

    Args:
        input_csv: path (or anything else ``pd.read_csv`` accepts) of a CSV that
            has the columns Domain, Name, Title, Date, UserName, Review and
            Rating. Any other columns are ignored.

    Returns:
        A DataFrame restricted to those seven columns. Every string in
        ``Review`` has its non-ASCII characters removed; non-string values
        (e.g. NaN for an empty field) are left unchanged.
    """
    def _to_ascii(value):
        # An empty CSV field is read as float NaN, which has no .encode();
        # pass non-strings through instead of raising AttributeError.
        if not isinstance(value, str):
            return value
        return value.encode("ascii", errors="ignore").decode()

    reviews_df = pd.read_csv(input_csv, usecols=["Domain", "Name", "Title", "Date", "UserName", "Review", "Rating"])
    reviews_df["Review"] = reviews_df["Review"].apply(_to_ascii)
    return reviews_df

In [42]:
def preprocess(df, lemmatize=False):
    """Run the review preprocessing pipeline on a DataFrame's ``Review`` column.

    Args:
        df: DataFrame with a ``Review`` column.
        lemmatize: forwarded to ``preprocess_reviews_arr`` as its ``lemma`` flag.

    Returns:
        Whatever ``preprocess_reviews_arr`` returns for the list of reviews.
    """
    return preprocess_reviews_arr(df["Review"].tolist(), lemmatize)

### Load raw reviews

In [25]:
# Raw reviews per domain: maps each domain key to the DataFrame that
# load_raw_reviews returns for "<DATA_DIR><domain>.csv".
domain_raw_reviews = {}

for domain in domains:
    input_csv_file = DATA_DIR + domain + ".csv"
    _df = load_raw_reviews(input_csv_file)
    domain_raw_reviews[domain] = _df

Columns:  Index(['Domain', 'Name', 'Title', 'Date', 'UserName', 'Review', 'Rating'], dtype='object')
Columns:  Index(['Domain', 'Name', 'Title', 'Date', 'UserName', 'Review', 'Rating'], dtype='object')
Columns:  Index(['Domain', 'Name', 'Title', 'Date', 'UserName', 'Review', 'Rating'], dtype='object')


In [26]:
# Preview the first rows of the raw reviews loaded for the "ride" domain.
domain_raw_reviews["ride"].head()

Unnamed: 0,Domain,Name,Title,Date,UserName,Review,Rating
0,RideHailing,99-private-drivers-and-taxi,Awful,12/14/20 0:00,veronica in new york,someone made an account on this app using my e...,1
1,RideHailing,99-private-drivers-and-taxi,Lost money on many transactions,7/8/21 15:05,nick22485,if you can use uber eats. this app has been la...,1
2,RideHailing,99-private-drivers-and-taxi,Let me turn off your annoying notifications,5/15/21 21:53,voska,"any other app, i would just disable notificati...",1
3,RideHailing,99-private-drivers-and-taxi,Can't retrieve my email,11/6/21 18:27,PNC.406,your security is pathetic. you allow users to ...,1
4,RideHailing,99-private-drivers-and-taxi,I want to set the addresses. Do not change it,8/26/21 11:35,rcv,when i want to call a ride for someone else th...,1


In [27]:
# Preview the first rows of the raw reviews loaded for the "investing" domain.
domain_raw_reviews["investing"].head()

Unnamed: 0,Domain,Name,Title,Date,UserName,Review,Rating
0,Investing,acorns-invest-spare-change,Great for Investing - Spend is garbage,7/7/22 22:25,Alex Beckett,"i like investing on here. yeah, savings are ma...",3
1,Investing,acorns-invest-spare-change,Unnecessarily Complicated,7/24/22 7:13,TeaCup Velvet,i am not going to pull any punches here. if yo...,2
2,Investing,acorns-invest-spare-change,Poor customer service,12/28/21 15:23,Jfishllc,i'm rating this app 1 star because of poor cus...,1
3,Investing,acorns-invest-spare-change,Transaction Dispute,8/28/22 20:38,victimofacorns,a fraudulent company accessed my checking acco...,1
4,Investing,acorns-invest-spare-change,"Great concept, but needs serious improvement",7/9/21 17:48,144278990,make unlinking accounts actually unlink accoun...,2


In [28]:
# Preview the first rows of the raw reviews loaded for the "health" domain.
domain_raw_reviews["health"].head()

Unnamed: 0,Domain,Name,Title,Date,UserName,Review,Rating
0,Mental Health,aura-meditation-sleep,Finally able to fall asleep!,3/25/22 18:16,2005Phoenix,"i've tried many sleep apps the last few years,...",5
1,Mental Health,aura-meditation-sleep,Best Sleep Since 10+ Years,10/22/21 12:59,Marissa Lee B,i haven't been sleeping well honestly in these...,5
2,Mental Health,aura-meditation-sleep,Perfectly curated blend of options to sleep by,1/7/22 9:44,Mybellegirls,and reliable for its intended use xoxo! five g...,5
3,Mental Health,aura-meditation-sleep,Warning: canceling is extremely difficult,2/6/22 2:05,ndmel3,though the couple of items i listened to on au...,2
4,Mental Health,aura-meditation-sleep,"If I can, You certainly can...",2/21/21 20:41,AppFixation,i've never been a person that could stop doing...,5


### Tokenize review sentences

In [44]:
# Sentence-level token rows per domain. preprocess is called with its default
# lemmatize=False, so the rows stored here are not lemmatized.
domain_tokenized_reviews = {}

for domain in domains:
    _df = domain_raw_reviews[domain]
    tokenized_reviews = preprocess(_df)
    domain_tokenized_reviews[domain] = tokenized_reviews

removing lower tf tokens reviews:  ("someone made an account on this app using my email address. i get all of their receipts, trip info, and customer service responses. and it's not me. they don't even have remotely the same name as me\n\ni have emailed them several times to ask them to stop emailing me. they had me describe myself to prove it's incorrect, and they're still emailing me. they should at least have some sort of email verification to prove their emailing a valid email address. \n\ni downloaded this app just to give it a negative review. maybe someone will finally listen and delete my email address because this is pathetic.", [('someone made an account on this app using my email address.', ['someone', 'made', 'account', 'app', 'using', 'email', 'address']), ('i get all of their receipts, trip info, and customer service responses.', ['get', 'receipts', 'trip', 'info', 'customer', 'service', 'responses']), ("they don't even have remotely the same name as me\n\ni have emailed 

cleaned sent reviews:  ("i like investing on here. yeah, savings are mainly through round-ups, but i've saved a lot. the spend account, however, neither makes sense nor connects to anything within acorns. you would think you could just move money between your other accounts but it really is just a one way bank account. so i have no idea what the point is. most people signed up using another bank. if you withdraw funds they go to that bank. you can't even move invest funds to spend. it seems like they will send you plenty of notifications about your unused spend card but not explain that it is a separate bank account that cannot be linked to your other accounts in any seemingly useful way.\n\nedit: i did talk to customer service again and figured out what was making spend more difficult. it might be a decent primary checking account, but i won't be using it for that. \none thing i do like is that round-ups are easy. the other thing, which is easy to forget, is that acorns will alert you

In [50]:
# Write one "<domain>_tokenized.csv" per domain with a per-sentence token count.
for domain in domains:
    _tokenized = domain_tokenized_reviews[domain]
    _df = pd.DataFrame(_tokenized, columns=["review", "sent", "tokenized"])
    # "tokenized" holds a list of words per row, so its length is the word count.
    _df["word count"] = _df["tokenized"].apply(len)
    tokenized_file = DATA_DIR + domain + "_tokenized.csv"
    # The list column is written as-is, i.e. as its Python list representation.
    _df.to_csv(tokenized_file, header=True, index=False)

In [51]:
# e.g. tokenized reviews in the ride domain, read back from the CSV written above
pd.read_csv(DATA_DIR + "ride_tokenized.csv").head()

Unnamed: 0,review,sent,tokenized,word count
0,someone made an account on this app using my e...,someone made an account on this app using my e...,"['someone', 'made', 'account', 'app', 'using',...",7
1,someone made an account on this app using my e...,"i get all of their receipts, trip info, and cu...","['get', 'receipts', 'trip', 'info', 'customer'...",7
2,someone made an account on this app using my e...,they don't even have remotely the same name as...,"['even', 'remotely', 'name', 'emailed', 'sever...",9
3,someone made an account on this app using my e...,they had me describe myself to prove it's inco...,"['describe', 'prove', 'incorrect', 'still', 'e...",5
4,someone made an account on this app using my e...,they should at least have some sort of email v...,"['least', 'sort', 'email', 'verification', 'pr...",9


### Lemmatize sentence tokens

In [52]:
# Lemmatized rows per domain: every tokenized row gains a fourth field holding
# the result of nltk_lemmatize_post_tag_rev_words for that sentence's tokens.
domain_lemmatized_reviews = {}

for domain in domains:
    tokenized_reviews = domain_tokenized_reviews[domain]
    lemmatized_reviews = [
        (review, sentence, words, nltk_lemmatize_post_tag_rev_words(words))
        for review, sentence, words in tokenized_reviews
    ]
    domain_lemmatized_reviews[domain] = lemmatized_reviews

In [53]:
# Write one "<domain>_lemmatized.csv" per domain.
for domain in domains:
    lemmatized_reviews = domain_lemmatized_reviews[domain]
    _df = pd.DataFrame(lemmatized_reviews, columns=["review", "sent", "tokenized", "lemmatized"])
    # Store the token list as a single comma-separated string per sentence.
    _df["tokenized"] = _df["tokenized"].apply(",".join)
    # "lemmatized" is split on commas to count words; an empty value counts as 0.
    _df["word count"] = _df["lemmatized"].apply(
        lambda joined: 0 if not joined else len(joined.split(","))
    )
    lemmatized_file = DATA_DIR + domain + "_lemmatized.csv"
    _df.to_csv(lemmatized_file, header=True, index=False)

In [54]:
# e.g. lemmatized reviews in the ride domain, read back from the CSV written above
pd.read_csv(DATA_DIR + "ride_lemmatized.csv").head()

Unnamed: 0,review,sent,tokenized,lemmatized,word count
0,someone made an account on this app using my e...,someone made an account on this app using my e...,"someone,made,account,app,using,email,address","someone,make,account,app,use,email,address",7
1,someone made an account on this app using my e...,"i get all of their receipts, trip info, and cu...","get,receipts,trip,info,customer,service,responses","get,receipt,trip,info,customer,service,response",7
2,someone made an account on this app using my e...,they don't even have remotely the same name as...,"even,remotely,name,emailed,several,times,ask,s...","even,remotely,name,email,several,time,ask,stop...",9
3,someone made an account on this app using my e...,they had me describe myself to prove it's inco...,"describe,prove,incorrect,still,emailing","describe,prove,incorrect,still,email",5
4,someone made an account on this app using my e...,they should at least have some sort of email v...,"least,sort,email,verification,prove,emailing,v...","least,sort,email,verification,prove,email,vali...",9
