# Setup

Install required libraries

In [None]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 7.9 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 70.9 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.21


Import required libraries

In [None]:
import numpy as np
import pandas as pd

Mount storage from Google Drive

In [None]:
from google.colab import drive

# Mount at an absolute path so that the /content/p2/... paths used by the
# load and save cells resolve regardless of the current working directory.
drive.mount('/content/p2')

Mounted at p2


# Dataset

Load sampled dataset

In [None]:
# Read the sampled review dataset from the mounted Drive folder and preview
# its first rows.
sampled_reviews_csv = '/content/p2/MyDrive/p2/data/reviews_500k_imba.csv'
df = pd.read_csv(sampled_reviews_csv)
df.head()

Unnamed: 0,business_id,stars,text,categories
0,3uC7Lbc3RgUDTWQlBu4PqQ,5.0,Three words: Damn good pastries.\n\nA few mor...,"['Desserts', 'Food', 'French', 'Sandwiches', '..."
1,c-NXKTJ0jrrusTPxJAUwvA,1.0,Easily one of the worst Red Robin locations. T...,"['American (Traditional)', 'Restaurants', 'Bur..."
2,j3csEfGzkwnXATdRoZDT-A,2.0,Maybe I am just spoiled with good Mexican food...,"['Mexican', 'Restaurants']"
3,Q0EZmATxDphzRMszNV2LVg,5.0,This Wildflower is always kept clean and the e...,"['Food', 'American (New)', 'Restaurants', 'Bre..."
4,25c15dEPrBrWr4tR1r6sTg,5.0,Favorite bibimbap in the valley! They also hav...,"['Korean', 'Japanese', 'Restaurants']"


Inspect distribution of star labels

In [None]:
# Number of reviews per star rating.
df.stars.value_counts()

5.0    197007
4.0    130723
3.0     66808
1.0     59025
2.0     46437
Name: stars, dtype: int64

# Preprocessing

Remove unused columns

In [None]:
# Keep only the review text and its star label. The explicit .copy() makes df
# an independent frame, so the column assignments in the following cells do not
# write into a slice of the originally loaded frame (SettingWithCopyWarning).
df = df[['text', 'stars']].copy()
df

Unnamed: 0,text,stars
0,Three words: Damn good pastries.\n\nA few mor...,5.0
1,Easily one of the worst Red Robin locations. T...,1.0
2,Maybe I am just spoiled with good Mexican food...,2.0
3,This Wildflower is always kept clean and the e...,5.0
4,Favorite bibimbap in the valley! They also hav...,5.0
...,...,...
499995,"New Nak Won is amazing!\n\nFirst off, super aw...",5.0
499996,I came here for lunch last Sunday. We ordered...,3.0
499997,We just tried Rkidds for the first time tonigh...,4.0
499998,"Yesterday I was served Kobe hot dogs, chipotle...",5.0


Subtract 1 so that the star labels range from 0 to 4, in preparation for one-hot encoding

In [None]:
# Shift every label down by one.
# NOTE: this cell is not idempotent - each run subtracts 1 again.
df.loc[:, 'stars'] = df['stars'] - 1
df

Unnamed: 0,text,stars
0,Three words: Damn good pastries.\n\nA few mor...,4.0
1,Easily one of the worst Red Robin locations. T...,0.0
2,Maybe I am just spoiled with good Mexican food...,1.0
3,This Wildflower is always kept clean and the e...,4.0
4,Favorite bibimbap in the valley! They also hav...,4.0
...,...,...
499995,"New Nak Won is amazing!\n\nFirst off, super aw...",4.0
499996,I came here for lunch last Sunday. We ordered...,2.0
499997,We just tried Rkidds for the first time tonigh...,3.0
499998,"Yesterday I was served Kobe hot dogs, chipotle...",4.0


The example text before preprocessing

In [None]:
# Display the raw review stored under row label 1 as a reference example.
df.loc[1, 'text']

"Easily one of the worst Red Robin locations. The food was delicious but the service was agonizingly atrocious. Went with Mom during lunch service and after we were directed to a table, the service plummeted. Our server wasn't at all hospitable or attentive. Smothered tables with much larger tickets, paid almost no attention to us with our two Tavern Doubles. I felt like we were a bother to her. Wish I could remember her name. Very poor service for such a gorgeous location."

Expand contractions in the review text, e.g. "I'm" becomes "I am"

In [None]:
import contractions

# The special symbol "İ" causes an error when expanding contractions, so it is
# replaced with a plain "I" wherever it occurs in the column rather than in one
# hard-coded row; this keeps the cell working if the sampled rows change.
df['text'] = df['text'].str.replace("İ", "I", regex=False)

# Expand contractions review by review into a new column; the raw text is kept.
df['processed_text'] = [contractions.fix(text) for text in df['text']]
df.loc[1, 'processed_text']

'Easily one of the worst Red Robin locations. The food was delicious but the service was agonizingly atrocious. Went with Mom during lunch service and after we were directed to a table, the service plummeted. Our server was not at all hospitable or attentive. Smothered tables with much larger tickets, paid almost no attention to us with our two Tavern Doubles. I felt like we were a bother to her. Wish I could remember her name. Very poor service for such a gorgeous location.'

Convert to lowercase and tokenize into words (tokenization also discards extra whitespace)

In [None]:
import nltk
nltk.download('punkt')

# Turn pretty printing on explicitly. The %pprint magic only toggles the
# current state, so its effect depended on how often the cell had been run.
get_ipython().display_formatter.formatters['text/plain'].pprint = True

# Lowercase each review and split it into a list of tokens.
df['processed_text'] = [nltk.word_tokenize(text.lower()) for text in df['processed_text']]

# Show the token list of the example review.
df.loc[1, 'processed_text']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Pretty printing has been turned ON


['easily',
 'one',
 'of',
 'the',
 'worst',
 'red',
 'robin',
 'locations',
 '.',
 'the',
 'food',
 'was',
 'delicious',
 'but',
 'the',
 'service',
 'was',
 'agonizingly',
 'atrocious',
 '.',
 'went',
 'with',
 'mom',
 'during',
 'lunch',
 'service',
 'and',
 'after',
 'we',
 'were',
 'directed',
 'to',
 'a',
 'table',
 ',',
 'the',
 'service',
 'plummeted',
 '.',
 'our',
 'server',
 'was',
 'not',
 'at',
 'all',
 'hospitable',
 'or',
 'attentive',
 '.',
 'smothered',
 'tables',
 'with',
 'much',
 'larger',
 'tickets',
 ',',
 'paid',
 'almost',
 'no',
 'attention',
 'to',
 'us',
 'with',
 'our',
 'two',
 'tavern',
 'doubles',
 '.',
 'i',
 'felt',
 'like',
 'we',
 'were',
 'a',
 'bother',
 'to',
 'her',
 '.',
 'wish',
 'i',
 'could',
 'remember',
 'her',
 'name',
 '.',
 'very',
 'poor',
 'service',
 'for',
 'such',
 'a',
 'gorgeous',
 'location',
 '.']

Remove punctuation (keep only tokens that are purely alphanumeric)

In [None]:
# Keep only the tokens made up entirely of letters and digits; punctuation
# tokens are dropped.
alnum_only = []
for review_tokens in df['processed_text']:
  alnum_only.append([tok for tok in review_tokens if tok.isalnum()])
df['processed_text'] = alnum_only

' '.join(df.loc[1, 'processed_text'])

'easily one of the worst red robin locations the food was delicious but the service was agonizingly atrocious went with mom during lunch service and after we were directed to a table the service plummeted our server was not at all hospitable or attentive smothered tables with much larger tickets paid almost no attention to us with our two tavern doubles i felt like we were a bother to her wish i could remember her name very poor service for such a gorgeous location'

Mark the word immediately following a negation word with a "!" prefix

In [None]:
negation_words = ['not', 'no', 'never', 'none', 'nobody', 'nothing', 'neither', 'nowhere']


def _mark_negations(tokens):
  """Return a copy of ``tokens`` with "!" prefixed to each word that follows a negation word.

  A word that already starts with "!" is left unchanged, so running this cell
  more than once does not stack additional markers on the same word. The input
  list itself is not modified.
  """
  marked = list(tokens)
  # Stop one short of the end: the last token has no successor to mark.
  for index in range(len(marked) - 1):
    following = marked[index + 1]
    if marked[index] in negation_words and not following.startswith("!"):
      marked[index + 1] = "!" + following
  return marked


df['processed_text'] = [_mark_negations(tokens) for tokens in df.processed_text]

' '.join(df['processed_text'][1])

'easily one of the worst red robin locations the food was delicious but the service was agonizingly atrocious went with mom during lunch service and after we were directed to a table the service plummeted our server was not !at all hospitable or attentive smothered tables with much larger tickets paid almost no !attention to us with our two tavern doubles i felt like we were a bother to her wish i could remember her name very poor service for such a gorgeous location'

Remove English stop words

In [None]:
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words("english")
# The membership test runs once per token over the whole corpus; looking tokens
# up in a set avoids scanning the stop-word list for every one of them.
stopword_set = set(stopwords)
df['processed_text'] = [[token for token in tokens if token not in stopword_set] for tokens in df.processed_text]
' '.join(df['processed_text'][1])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


'easily one worst red robin locations food delicious service agonizingly atrocious went mom lunch service directed table service plummeted server !at hospitable attentive smothered tables much larger tickets paid almost !attention us two tavern doubles felt like bother wish could remember name poor service gorgeous location'

Lemmatize tokens

In [None]:
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_token(token):
  """Lemmatize ``token`` while keeping any leading "!" negation marker in place.

  The marker is stripped before the lemmatizer is called and re-attached
  afterwards, so the lemmatizer always receives the bare word and a negated
  word is normalized the same way as its unmarked form.
  """
  word = token.lstrip("!")
  if not word:
    # Nothing but markers: there is no word to lemmatize.
    return token
  marker = token[:len(token) - len(word)]
  return marker + lemmatizer.lemmatize(word)


df['processed_text'] = [[_lemmatize_token(token) for token in tokens] for tokens in df.processed_text]

' '.join(df['processed_text'][1])

'easily one worst red robin location food delicious service agonizingly atrocious went mom lunch service directed table service plummeted server !at hospitable attentive smothered table much larger ticket paid almost !attention u two tavern double felt like bother wish could remember name poor service gorgeous location'

Join the processed tokens back into text

In [None]:
# Collapse each token list into one space-separated string.
df['processed_text'] = [' '.join(tokens) for tokens in df['processed_text']]
df.loc[1, 'processed_text']

'easily one worst red robin location food delicious service agonizingly atrocious went mom lunch service directed table service plummeted server !at hospitable attentive smothered table much larger ticket paid almost !attention u two tavern double felt like bother wish could remember name poor service gorgeous location'

Save the preprocessed data frame for future use

In [None]:
# Write the frame to Drive as CSV, leaving out the index column.
preprocessed_csv = '/content/p2/MyDrive/p2/data/preprocessed_500k_imba.csv'
df.to_csv(preprocessed_csv, index=False)