In [15]:
import sys
# Make the system-wide site-packages importable from this kernel. The membership
# check keeps sys.path free of duplicate entries when the cell is re-run.
if '/usr/local/lib/python3.7/site-packages' not in sys.path:
    sys.path.append('/usr/local/lib/python3.7/site-packages')

In [16]:
import pandas as pd
import numpy as np
import datetime

In [72]:
# Load the raw review export. The file's own header row (header=0) is replaced
# by the names in `headers`.
headers = ["business_id", "stars", "useful", "funny", "cool", "text", "date"]
# np.float was only an alias of the builtin float and was removed in NumPy 1.24;
# np.float64 selects the same 64-bit float dtype on every NumPy version.
dtype = {"business_id" : str,
         "stars" : np.float64,
         "useful" : np.float64,
         "funny" : np.float64,
         "cool" : np.float64,
         "text" : str,
         "date" : str}
parse_dates = ['date']
df = pd.read_csv('data/restaurant_reviews.csv', header=0, names=headers, dtype=dtype, parse_dates=parse_dates)

In [60]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
5055987,H8qpFitIesth86zqR4cwYg,5.0,43.0,40.0,45.0,"Confections, cash, and casinos! Welcome to Las...",2017-01-10 04:18:30
5055988,IsoLzudHC50oJLiEWpwV-w,3.0,1.0,3.0,1.0,Solid American food with a southern comfort fl...,2019-12-10 04:15:00
5055989,kDCyqlYcstqnoqnfBRS5Og,5.0,15.0,6.0,13.0,I'm honestly not sure how I have never been to...,2019-06-06 15:01:53
5055990,VKVDDHKtsdrnigeIf9S8RA,3.0,2.0,0.0,0.0,Food was decent but I will say the service too...,2018-07-05 18:45:21
5055991,2SbyRgHWuWNlq18eHAx95Q,5.0,2.0,0.0,2.0,"Oh yeah! Not only that the service was good, t...",2019-12-07 00:29:55


In [80]:
# Text normalization

# 1. Convert letters to lowercase
# 2. Replace numbers with a space
# 3. Remove punctuation
# 4. Strip leading and trailing whitespace
# 5. Remove stop words
# 6. Lemmatize words

import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Character class of punctuation marks (ASCII plus the curly ” and ’ quotes).
# Written as a raw string so the `\]` escape reaches the regex engine verbatim
# instead of relying on Python passing through an invalid string escape.
REPLACE_NO_SPACE = re.compile(r"[!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]")

# Translation table that deletes every character in string.punctuation;
# built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Lazily-built NLTK helpers shared by all preprocess_text calls. They are
# created on first use so that defining this cell does not touch the corpora.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _TEXT_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_text(review):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, replace each run of digits with a space,
    delete the characters in ``string.punctuation``, strip surrounding
    whitespace, tokenize, drop English stop words, lemmatize what is left.

    Args:
        review: The review text. Any non-string value (for example the NaN
            found in an empty text cell) is returned unchanged.

    Returns:
        The cleaned text, or ``review`` itself when it is not a ``str``.
    """
    # Explicit pass-through for non-strings. This replaces a try/except that
    # caught the AttributeError from .lower() and printed every such value.
    if not isinstance(review, str):
        return review

    cleaned = review.lower()
    cleaned = re.sub(r"\d+", " ", cleaned)
    cleaned = cleaned.translate(_PUNCTUATION_TABLE)
    cleaned = cleaned.strip()

    # The stop-word list and lemmatizer do not depend on the review, so they
    # are fetched from the shared cache rather than rebuilt per call.
    stop_words, lemmatizer = _get_text_resources()
    kept = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]
    return ' '.join(kept)


In [74]:
# Separate columns: bind each column of df to its own module-level name.

business_id, stars, useful, funny, cool, text, date = (
    df[column]
    for column in ("business_id", "stars", "useful", "funny", "cool", "text", "date")
)

In [75]:
# Run every review through preprocess_text and store the result back in df.
df["text"] = df["text"].map(preprocess_text)
# print(preprocess_text("This &is [an] example? {of} string. with.? punctuation!!!!"))

nan


KeyboardInterrupt: 

In [64]:
# Preview the first five rows after text cleaning.
df.head(n=5)

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
0,HQl28KMwrEKHqhFrrDqVNQ,5.0,1.0,0.0,0.0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11
1,5JxlZaqCnk1MnbgRirs40Q,1.0,0.0,0.0,0.0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52
2,IS4cv902ykd8wj1TR0N3-A,4.0,0.0,0.0,0.0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57
3,nlxHRv1zXGT0c0K51q3jDg,5.0,2.0,0.0,0.0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25
4,Pthe4qk5xh4n-ef-9bvMSg,5.0,1.0,0.0,0.0,really good place simple decor amazing food gr...,2015-11-05 23:11:05


In [65]:
# One-hot encode the vote-count columns; each distinct value becomes an
# indicator column named "<column>_<value>".

useful_enc, funny_enc, cool_enc = (
    pd.get_dummies(df[column], prefix=column)
    for column in ("useful", "funny", "cool")
)

In [66]:
# Assemble the export frame: ids and stars, the one-hot vote columns, then
# text and date. The plain columns are read from df itself rather than from
# the standalone Series captured earlier, because a Series taken before
# `df['text'] = ...` is not guaranteed to reflect the cleaned text.
df = pd.concat(
    [df[["business_id", "stars"]], useful_enc, funny_enc, cool_enc, df[["text", "date"]]],
    axis=1,
)

In [67]:
# Write the assembled frame to disk without the row index.
df.to_csv(
    'data/preprocessed_data.csv',
    index=False,
    encoding='utf-8',
)

In [69]:
# Only the final expression of a cell is rendered automatically, so the preview
# is passed to display() (provided by the Jupyter kernel) explicitly;
# df.info() prints its own summary.
display(df.head())
df.info()

KeyboardInterrupt: 

In [83]:
# Reload the first export, keeping only the id, rating, text and date columns.
# `date` stays a plain string here because no parse_dates argument is passed.
headers = ["business_id", "stars", "useful", "funny", "cool", "text", "date"]
# np.float was only an alias of the builtin float and was removed in NumPy 1.24;
# np.float64 selects the same 64-bit float dtype on every NumPy version.
dtype = {"business_id" : str,
         "stars" : np.float64,
         "useful" : np.float64,
         "funny" : np.float64,
         "cool" : np.float64,
         "text" : str,
         "date" : str}
df2 = pd.read_csv('data/preprocessed_data.csv',
                  header=0,
                  dtype=dtype,
                  usecols=["business_id", "stars", "text", "date"])


In [81]:
# df2['text'] = df2["text"].apply(preprocess_text)

nan


KeyboardInterrupt: 

In [84]:
# Preview the first five rows of the reloaded subset.
df2.head(n=5)

Unnamed: 0,business_id,stars,text,date
0,HQl28KMwrEKHqhFrrDqVNQ,5.0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11
1,5JxlZaqCnk1MnbgRirs40Q,1.0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52
2,IS4cv902ykd8wj1TR0N3-A,4.0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57
3,nlxHRv1zXGT0c0K51q3jDg,5.0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25
4,Pthe4qk5xh4n-ef-9bvMSg,5.0,really good place simple decor amazing food gr...,2015-11-05 23:11:05


In [85]:
# Write the four-column subset to its own file without the row index.
df2.to_csv(
    'data/preprocessed_data2.csv',
    index=False,
    encoding='utf-8',
)

In [86]:
# Preview the first five rows of df2 again after the export.
df2.head(n=5)

Unnamed: 0,business_id,stars,text,date
0,HQl28KMwrEKHqhFrrDqVNQ,5.0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11
1,5JxlZaqCnk1MnbgRirs40Q,1.0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52
2,IS4cv902ykd8wj1TR0N3-A,4.0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57
3,nlxHRv1zXGT0c0K51q3jDg,5.0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25
4,Pthe4qk5xh4n-ef-9bvMSg,5.0,really good place simple decor amazing food gr...,2015-11-05 23:11:05


In [17]:
# headers = ["business_id", "stars", "useful", "funny", "cool", "text", "date"]
# np.float was only an alias of the builtin float and was removed in NumPy 1.24;
# np.float64 selects the same 64-bit float dtype on every NumPy version.
dtype = {"business_id" : str,
         "stars" : np.float64,
         "useful" : np.float64,
         "funny" : np.float64,
         "cool" : np.float64,
         "text" : str,
         "date" : str}
# df3: cleaned id / rating / text / date columns from the second export.
df3 = pd.read_csv('data/preprocessed_data2.csv',
                  header=0,
                  dtype=dtype,
                  usecols=["business_id", "stars", "text", "date"])
# df4: the three vote-count columns, read from the raw review file.
df4 = pd.read_csv('data/restaurant_reviews.csv',
                  header=0,
                  dtype=dtype,
                  usecols=["useful", "funny", "cool"])

KeyboardInterrupt: 

In [None]:
# Only the final expression of a cell is rendered automatically, so the two
# previews are passed to display() (provided by the Jupyter kernel) explicitly.
display(df3.head())
display(df4.head())
df3["stars"].value_counts()

In [None]:
# Re-attach the raw vote counts to the cleaned frame. Column assignment aligns
# on the index, and both frames were read by read_csv without an index column,
# so rows are matched purely by position. A length mismatch would otherwise
# silently produce NaN or drop rows, hence the explicit check.
assert len(df3) == len(df4), "df3 and df4 must have the same number of rows"
for column in ("useful", "funny", "cool"):
    df3[column] = df4[column]

In [None]:
# Preview the first five rows with the vote columns attached.
df3.head(n=5)

In [8]:
# Write the cleaned frame plus vote counts to disk without the row index.
df3.to_csv(
    'data/preprocessed_data3.csv',
    index=False,
    encoding='utf-8',
)

In [20]:
# np.float was only an alias of the builtin float and was removed in NumPy 1.24;
# np.float64 selects the same 64-bit float dtype on every NumPy version.
dtype = {"business_id" : str,
         "stars" : np.float64,
         "useful" : np.float64,
         "funny" : np.float64,
         "cool" : np.float64,
         "text" : str,
         "date" : str}
# Reload the third export in full; note this rebinds the name df4.
df4 = pd.read_csv('data/preprocessed_data3.csv',
                  header=0,
                  dtype=dtype,
                  )

In [21]:
# Preview the first five rows of the reloaded frame.
df4.head(n=5)

Unnamed: 0,business_id,stars,text,date,useful,funny,cool
0,HQl28KMwrEKHqhFrrDqVNQ,5.0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11,1.0,0.0,0.0
1,5JxlZaqCnk1MnbgRirs40Q,1.0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52,0.0,0.0,0.0
2,IS4cv902ykd8wj1TR0N3-A,4.0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57,0.0,0.0,0.0
3,nlxHRv1zXGT0c0K51q3jDg,5.0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25,2.0,0.0,0.0
4,Pthe4qk5xh4n-ef-9bvMSg,5.0,really good place simple decor amazing food gr...,2015-11-05 23:11:05,1.0,0.0,0.0


In [23]:
# Binary label: 1 when the review has 3 or more stars, otherwise 0.
# A vectorized comparison replaces the row-by-row apply(axis=1); a missing star
# value compares False under both forms and is therefore labelled 0.
df4["positive"] = (df4["stars"] >= 3).astype("int64")

In [26]:
# Preview the first five rows including the new `positive` column.
df4.head(n=5)

Unnamed: 0,business_id,stars,text,date,useful,funny,cool,positive
0,HQl28KMwrEKHqhFrrDqVNQ,5.0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:11,1.0,0.0,0.0,1
1,5JxlZaqCnk1MnbgRirs40Q,1.0,dismal lukewarm defrostedtasting texmex glop m...,2011-05-27 05:30:52,0.0,0.0,0.0,0
2,IS4cv902ykd8wj1TR0N3-A,4.0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:57,0.0,0.0,0.0,1
3,nlxHRv1zXGT0c0K51q3jDg,5.0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:25,2.0,0.0,0.0,1
4,Pthe4qk5xh4n-ef-9bvMSg,5.0,really good place simple decor amazing food gr...,2015-11-05 23:11:05,1.0,0.0,0.0,1


In [27]:
# Write the labelled frame to disk without the row index.
df4.to_csv(
    'data/preprocessed_data4.csv',
    index=False,
    encoding='utf-8',
)