In [128]:
import sys
# sys.path.append('/usr/local/lib/python3.7/site-packages')

In [129]:
import pandas as pd
import numpy as np
import datetime

In [130]:
headers = ["business_id", "stars", "useful", "funny", "cool", "text", "date"]
dtype = {"business_id" : str, 
         "stars" : np.int8, 
         "useful" : np.int8, 
         "funny" : np.int8, 
         "cool" : np.int8, 
         "text" : str, 
         "date" : str}
parse_dates = ['date']
df = pd.read_csv('data/test.csv', header=0, names=headers, dtype=dtype, parse_dates=parse_dates)

In [131]:
df.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
0,#NAME?,2,5,0,0,"As someone who has worked with many museums, I...",2015-04-15 05:21:00
1,lbrU8StCq3yDfr-QMnGrmQ,1,1,1,0,I am actually horrified this place is still in...,2013-12-07 03:16:00
2,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,I love Deagan's. I do. I really do. The atmosp...,2015-12-05 03:18:00
3,5JxlZaqCnk1MnbgRirs40Q,1,0,0,0,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g...",2011-05-27 05:30:00
4,IS4cv902ykd8wj1TR0N3-A,4,0,0,0,"Oh happy day, finally have a Canes near my cas...",2017-01-14 21:56:00


In [132]:
# Text normalization

# 1. Convert letters to lowercase
# 2. Remove numbers
# 3. Remove punctuation
# 4. Remove whitespaces
# 5. Remove stop words
# 6. Stem words

import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_text(review):
    # Convert letters to lowercase
    review = review.lower()
    
    # Remove numbers
    review = re.sub("\d+"," ", review)
    
    # Remove punctuations
    review = REPLACE_NO_SPACE.sub("", review.lower())
    review = REPLACE_WITH_SPACE.sub(" ", review)
    
    # Remove whitespaces
    review = review.strip()
    
    # Remove stop words and use Porter Stemming Algorithm to stem words
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(review)
    review = [lemmatizer.lemmatize(i) for i in tokens if not i in stop_words]
    review = ' '.join(review)
    
    return review


In [133]:
# Seperate columns

business_id = df["business_id"]
stars = df["stars"]
useful = df["useful"]
funny = df["funny"]
cool = df["cool"]
text = df["text"]
date = df["date"]

In [134]:
# Clean text column
df['text'] = df["text"].apply(preprocess_text)

In [135]:
df.head()

Unnamed: 0,business_id,stars,useful,funny,cool,text,date
0,#NAME?,2,5,0,0,someone worked many museum eager visit gallery...,2015-04-15 05:21:00
1,lbrU8StCq3yDfr-QMnGrmQ,1,1,1,0,actually horrified place still business year o...,2013-12-07 03:16:00
2,HQl28KMwrEKHqhFrrDqVNQ,5,1,0,0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:00
3,5JxlZaqCnk1MnbgRirs40Q,1,0,0,0,dismal lukewarm defrosted tasting texmex glop ...,2011-05-27 05:30:00
4,IS4cv902ykd8wj1TR0N3-A,4,0,0,0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:00


In [136]:
# One hot encode numerical columns 

stars_enc = pd.get_dummies(df["stars"], prefix="stars")
useful_enc = pd.get_dummies(df["useful"], prefix="useful")
funny_enc = pd.get_dummies(df["funny"], prefix="funny")
cool_enc = pd.get_dummies(df["cool"], prefix="cool")

In [137]:
pd.concat([business_id, stars_enc, useful_enc, funny_enc, cool_enc, text, date], axis=1)

Unnamed: 0,business_id,stars_1,stars_2,stars_4,stars_5,useful_0,useful_1,useful_2,useful_3,useful_5,funny_0,funny_1,cool_0,cool_1,text,date
0,#NAME?,0,1,0,0,0,0,0,0,1,1,0,1,0,someone worked many museum eager visit gallery...,2015-04-15 05:21:00
1,lbrU8StCq3yDfr-QMnGrmQ,1,0,0,0,0,1,0,0,0,0,1,1,0,actually horrified place still business year o...,2013-12-07 03:16:00
2,HQl28KMwrEKHqhFrrDqVNQ,0,0,0,1,0,1,0,0,0,1,0,1,0,love deagans really atmosphere cozy festive sh...,2015-12-05 03:18:00
3,5JxlZaqCnk1MnbgRirs40Q,1,0,0,0,1,0,0,0,0,1,0,1,0,dismal lukewarm defrosted tasting texmex glop ...,2011-05-27 05:30:00
4,IS4cv902ykd8wj1TR0N3-A,0,0,1,0,1,0,0,0,0,1,0,1,0,oh happy day finally cane near casa yes others...,2017-01-14 21:56:00
5,nlxHRv1zXGT0c0K51q3jDg,0,0,0,1,0,0,1,0,0,1,0,1,0,definitely favorite fast food sub shop ingredi...,2013-05-07 07:25:00
6,Pthe4qk5xh4n-ef-9bvMSg,0,0,0,1,0,1,0,0,0,1,0,1,0,really good place simple decor amazing food gr...,2015-11-05 23:11:00
7,FNCJpSn0tL9iqoY3JC73qw,0,0,0,1,1,0,0,0,0,1,0,1,0,awesome office staff professional friendly saw...,2017-07-18 18:31:00
8,e_BiI4ej1CW1F0EyVLr-FQ,0,0,0,1,1,0,0,0,0,1,0,1,0,delicious authentic italian ive u wouldnt leav...,2015-02-16 06:48:00
9,Ws8V970-mQt2X9CwCuT5zw,0,0,1,0,0,0,0,1,0,1,0,0,1,twice nice laid back tried weekend southern me...,2009-10-13 04:16:00


In [138]:
df.to_csv('data/preprocessed_data.csv', encoding='utf-8', index=False)