In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import demoji
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#Download the NLTK data used further down, so the notebook also runs on a fresh environment:
#  wordnet           -> WordNetLemmatizer
#  stopwords         -> stopwords.words('english')
#  punkt / punkt_tab -> nltk.word_tokenize (which of the two is needed depends on the NLTK release)
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

#Download the emoji code table used by demoji.findall
#NOTE(review): recent demoji releases appear to bundle this data and deprecate download_codes() - confirm against the installed version
demoji.download_codes()

#Create NLTK lemmatizer
lemmatizer = WordNetLemmatizer()

#Store stop words
stop_words = stopwords.words('english')


#URL for mobile electronic Amazon reviews
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Mobile_Electronics_v1_00.tsv.gz'

#Data acquisition: malformed rows are skipped with a warning instead of aborting the load
try:
    df = pd.read_csv(url, sep='\t', on_bad_lines='warn')
except TypeError:
    #pandas older than 1.3 does not accept on_bad_lines; fall back to the legacy flag
    df = pd.read_csv(url, sep='\t', error_bad_lines=False)

#Removing nulls (drops every row that has a missing value in any column)
df = df.dropna()

#Making sure review date is a datetime
df['review_date'] = pd.to_datetime(df['review_date'])

#Removes html tags and handles special characters
def strip_html_tags(text):
    """Return the text content of an HTML fragment.

    The markup is parsed by BeautifulSoup with the "html.parser" parser and
    the extracted text pieces are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

#Turns emojis into text
def process_emojis(text):
    """Replace every emoji in ``text`` with its textual description.

    ``demoji.findall`` supplies a mapping of emoji -> description. Only the
    part of a description before the first ':' is kept, so a qualifier such
    as a skin tone after the colon is dropped. Each replacement is padded
    with spaces so adjacent emojis do not fuse into one word, and runs of
    spaces are collapsed afterwards.

    Skin-tone modifiers (U+1F3FB..U+1F3FF) are never spelled out: a modifier
    that stands on its own is removed from the text.

    Returns ``text`` unchanged when no emoji is found.
    """
    found_emojis = demoji.findall(text)
    if not found_emojis:
        return text

    skin_tones = '\U0001F3FB\U0001F3FC\U0001F3FD\U0001F3FE\U0001F3FF'

    #Replace the longest sequences first: otherwise a bare emoji that is the prefix of a
    #longer sequence (e.g. the same emoji followed by a skin tone) would be replaced
    #inside that sequence and leave a dangling modifier behind
    for emoji in sorted(found_emojis, key=len, reverse=True):
        if len(emoji) == 1 and emoji in skin_tones:
            #stand-alone modifier: stripped below instead of being turned into words
            continue
        label = found_emojis[emoji].split(':', 1)[0]
        text = text.replace(emoji, ' ' + label + ' ')

    #Drop any remaining skin-tone modifier, whatever its tone
    text = re.sub('[' + skin_tones + ']', '', text)

    #using regex to find double spaces and replace with single space, then return new text string
    return re.sub(' +', ' ', text)

#Lowercases text
def lowercase_text(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

#Removes extra whitespace
def remove_whitespace(text):
    """Trim ``text`` and collapse every run of whitespace into one space.

    Leading and trailing whitespace is stripped. Inside the string any run of
    whitespace characters (spaces, tabs, newlines, ...) becomes a single
    space, not only runs of the space character.
    """
    return re.sub(r'\s+', ' ', text.strip())

#Fixes contractions
def fix_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

#Tokenizes text data
def tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return them."""
    tokens = nltk.word_tokenize(text)
    return tokens

#Removes common stop words
def remove_stop_words(tokens):
    """Return a new list holding the tokens that are not in ``stop_words``.

    The comparison is an exact match against the module-level ``stop_words``
    collection; token order is preserved.
    """
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

#Lemmatizes text data
def lemmatize(tokens):
    """Return a list with each token passed through ``lemmatizer.lemmatize``.

    Uses the module-level ``lemmatizer``; token order is preserved.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

#Performs each of the text processing functions above and returns a list of tokenized text data
def process_text(text):
    """Run the full cleaning pipeline on one piece of text and return its tokens.

    String stage, in order: strip HTML tags, fix contractions, remove extra
    whitespace, turn emojis into text, lowercase.
    Token stage, in order: tokenize, remove stop words, lemmatize.
    """
    string_steps = (
        strip_html_tags,
        fix_contractions,
        remove_whitespace,
        process_emojis,
        lowercase_text,
    )
    for step in string_steps:
        text = step(text)

    return lemmatize(remove_stop_words(tokenize(text)))

#Run the text pipeline over every review body and store the token lists in a new column
review_bodies = df['review_body'].values.tolist()
df['processed_text'] = list(map(process_text, review_bodies))

Downloading emoji data ...


[nltk_data] Downloading package wordnet to /Users/kmf229/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


... OK (Got response in 0.17 seconds)
Writing emoji data to /Users/kmf229/.demoji/codes.json ...
... OK


b'Skipping line 35246: expected 15 fields, saw 22\n'
b'Skipping line 87073: expected 15 fields, saw 22\n'


In [2]:
#Sample data: head of the frame, limited to a subset of columns plus the new processed_text column
df.loc[:, [
    'product_id',
    'product_title',
    'product_category',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'review_headline',
    'review_body',
    'review_date',
    'processed_text',
]].head()

Unnamed: 0,product_id,product_title,product_category,star_rating,helpful_votes,total_votes,review_headline,review_body,review_date,processed_text
0,B00MC4CED8,BlackVue DR600GW-PMP,Mobile_Electronics,5.0,0.0,0.0,Very Happy!,"As advertised. Everything works perfectly, I'm...",2015-08-31,"[advertised, ., everything, work, perfectly, ,..."
1,B00OQMFG1Q,GENSSI GSM / GPS Two Way Smart Phone Car Alarm...,Mobile_Electronics,5.0,0.0,1.0,five star,it's great,2015-08-31,[great]
2,B00QERR5CY,iXCC Multi pack Lightning cable,Mobile_Electronics,5.0,0.0,0.0,great cables,These work great and fit my life proof case fo...,2015-08-31,"[work, great, fit, life, proof, case, iphone, 6]"
3,B00QUFTPV4,abcGoodefg® FBI Covert Acoustic Tube Earpiece ...,Mobile_Electronics,4.0,0.0,0.0,Work very well but couldn't get used to not he...,Work very well but couldn't get used to not he...,2015-08-31,"[work, well, could, get, used, hearing, anythi..."
4,B0067XVNTG,Generic Car Dashboard Video Camera Vehicle Vid...,Mobile_Electronics,2.0,0.0,0.0,Cameras has battery issues,"Be careful with these products, I have bought ...",2015-08-31,"[careful, product, ,, bought, several, camera,..."
