In [1]:
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages')

In [2]:
import pandas as pd
import numpy as np
import datetime

In [4]:
headers = ["business_id", "stars", "useful", "funny", "cool", "text", "date"]
dtype = {"business_id" : str, 
         "stars" : np.float, 
         "useful" : np.float, 
         "funny" : np.float, 
         "cool" : np.float, 
         "text" : str, 
         "date" : str}
parse_dates = ['date']
df = pd.read_csv('data/restaurant_reviews.csv', header=0, names=headers, dtype=dtype, parse_dates=parse_dates)

In [45]:
df.tail()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
5055987,H8qpFitIesth86zqR4cwYg,5.0,43.0,40.0,45.0,"Confections, cash, and casinos! Welcome to Las...",2017-01-10 04:18:30
5055988,IsoLzudHC50oJLiEWpwV-w,3.0,1.0,3.0,1.0,Solid American food with a southern comfort fl...,2019-12-10 04:15:00
5055989,kDCyqlYcstqnoqnfBRS5Og,5.0,15.0,6.0,13.0,I'm honestly not sure how I have never been to...,2019-06-06 15:01:53
5055990,VKVDDHKtsdrnigeIf9S8RA,3.0,2.0,0.0,0.0,Food was decent but I will say the service too...,2018-07-05 18:45:21
5055991,2SbyRgHWuWNlq18eHAx95Q,5.0,2.0,0.0,2.0,"Oh yeah! Not only that the service was good, t...",2019-12-07 00:29:55


In [48]:
# Text normalization

# 1. Convert letters to lowercase
# 2. Remove numbers
# 3. Remove punctuation
# 4. Remove whitespaces
# 5. Remove stop words
# 6. Stem words

import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

REPLACE_NO_SPACE = re.compile("[!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]")

def preprocess_text(review):
    try: 
        # Convert letters to lowercase
        review = review.lower()
    
        # Remove numbers
        review = re.sub("\d+"," ", review)
    
        # Remove punctuations
        # review = review.translate(str.maketrans("", "", string.punctuation))
        review = review.translate(str.maketrans("", "", string.punctuation))
    
        # Remove whitespaces
        review = review.strip()
        
        # Remove stop words and use Porter Stemming Algorithm to stem words
        stop_words = set(stopwords.words("english"))
        lemmatizer = WordNetLemmatizer()
        tokens = word_tokenize(review)
        review = [lemmatizer.lemmatize(i) for i in tokens if not i in stop_words]
        review = ' '.join(review)
    
    except AttributeError as error:
        print(review)
    
    return review


In [42]:
# Seperate columns

business_id = df["business_id"]
stars = df["stars"]
useful = df["useful"]
funny = df["funny"]
cool = df["cool"]
text = df["text"]
date = df["date"]

In [50]:
# Clean text column
df['text'] = df["text"].apply(preprocess_text)
# print(preprocess_text("This &is [an] example? {of} string. with.? punctuation!!!!"))

nan


In [52]:
df.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
0,HQl28KMwrEKHqhFrrDqVNQ,5.0,1.0,0.0,0.0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11
1,5JxlZaqCnk1MnbgRirs40Q,1.0,0.0,0.0,0.0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52
2,IS4cv902ykd8wj1TR0N3-A,4.0,0.0,0.0,0.0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57
3,nlxHRv1zXGT0c0K51q3jDg,5.0,2.0,0.0,0.0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25
4,Pthe4qk5xh4n-ef-9bvMSg,5.0,1.0,0.0,0.0,really good place simple decor amazing food gr...,2015-11-05 23:11:05


In [53]:
# One hot encode numerical columns 

stars_enc = pd.get_dummies(df["stars"], prefix="stars")
useful_enc = pd.get_dummies(df["useful"], prefix="useful")
funny_enc = pd.get_dummies(df["funny"], prefix="funny")
cool_enc = pd.get_dummies(df["cool"], prefix="cool")

In [54]:
df = pd.concat([business_id, stars_enc, useful_enc, funny_enc, cool_enc, text, date], axis=1)

In [55]:
df.to_csv('data/preprocessed_data.csv', encoding='utf-8', index=False)

In [56]:
df.head()

Unnamed: 0,business_id,stars_1.0,stars_2.0,stars_3.0,stars_4.0,stars_5.0,useful_-1.0,useful_0.0,useful_1.0,useful_2.0,...,cool_197.0,cool_199.0,cool_202.0,cool_205.0,cool_212.0,cool_222.0,cool_321.0,cool_502.0,text,date
0,HQl28KMwrEKHqhFrrDqVNQ,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11
1,5JxlZaqCnk1MnbgRirs40Q,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52
2,IS4cv902ykd8wj1TR0N3-A,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57
3,nlxHRv1zXGT0c0K51q3jDg,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25
4,Pthe4qk5xh4n-ef-9bvMSg,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,really good place simple decor amazing food gr...,2015-11-05 23:11:05
