#### Import relevant libraries

In [1]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import contractions

#### Load dataset

In [2]:
reviews_data = pd.read_csv("data/a1_RestaurantReviews_HistoricDump.tsv", sep='\t')

#### Inspect first 5 rows and data types of the dataset

In [3]:
reviews_data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
reviews_data.shape

(900, 2)

#### Expand Contractions

In [5]:
reviews_data['no_contractions'] = reviews_data['Review'].apply(lambda x: [contractions.fix(word) for word in x.split()])
reviews_data.head(7)

Unnamed: 0,Review,Liked,no_contractions
0,Wow... Loved this place.,1,"[Wow..., Loved, this, place.]"
1,Crust is not good.,0,"[Crust, is, not, good.]"
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, and, the, texture, was, just, nas..."
3,Stopped by during the late May bank holiday of...,1,"[Stopped, by, during, the, late, May, bank, ho..."
4,The selection on the menu was great and so wer...,1,"[The, selection, on, the, menu, was, great, an..."
5,Now I am getting angry and I want my damn pho.,0,"[Now, I, am, getting, angry, and, I, want, my,..."
6,Honeslty it didn't taste THAT fresh.),0,"[Honeslty, it, did not, taste, THAT, fresh.)]"


In [6]:
reviews_data['reviews_no_contractions'] = [' '.join(l) for l in reviews_data['no_contractions']]
reviews_data.head(7)

Unnamed: 0,Review,Liked,no_contractions,reviews_no_contractions
0,Wow... Loved this place.,1,"[Wow..., Loved, this, place.]",Wow... Loved this place.
1,Crust is not good.,0,"[Crust, is, not, good.]",Crust is not good.
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, and, the, texture, was, just, nas...",Not tasty and the texture was just nasty.
3,Stopped by during the late May bank holiday of...,1,"[Stopped, by, during, the, late, May, bank, ho...",Stopped by during the late May bank holiday of...
4,The selection on the menu was great and so wer...,1,"[The, selection, on, the, menu, was, great, an...",The selection on the menu was great and so wer...
5,Now I am getting angry and I want my damn pho.,0,"[Now, I, am, getting, angry, and, I, want, my,...",Now I am getting angry and I want my damn pho.
6,Honeslty it didn't taste THAT fresh.),0,"[Honeslty, it, did not, taste, THAT, fresh.)]",Honeslty it did not taste THAT fresh.)


#### Tokenize data

In [7]:
reviews_data['reviews_tokenized'] = reviews_data['reviews_no_contractions'].apply(word_tokenize)
reviews_data.head()

Unnamed: 0,Review,Liked,no_contractions,reviews_no_contractions,reviews_tokenized
0,Wow... Loved this place.,1,"[Wow..., Loved, this, place.]",Wow... Loved this place.,"[Wow, ..., Loved, this, place, .]"
1,Crust is not good.,0,"[Crust, is, not, good.]",Crust is not good.,"[Crust, is, not, good, .]"
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, and, the, texture, was, just, nas...",Not tasty and the texture was just nasty.,"[Not, tasty, and, the, texture, was, just, nas..."
3,Stopped by during the late May bank holiday of...,1,"[Stopped, by, during, the, late, May, bank, ho...",Stopped by during the late May bank holiday of...,"[Stopped, by, during, the, late, May, bank, ho..."
4,The selection on the menu was great and so wer...,1,"[The, selection, on, the, menu, was, great, an...",The selection on the menu was great and so wer...,"[The, selection, on, the, menu, was, great, an..."


#### Convert data to lower case

In [8]:
reviews_data['reviews_lower'] = reviews_data['reviews_tokenized'].apply(lambda x: [word.lower() for word in x])
reviews_data.head()

Unnamed: 0,Review,Liked,no_contractions,reviews_no_contractions,reviews_tokenized,reviews_lower
0,Wow... Loved this place.,1,"[Wow..., Loved, this, place.]",Wow... Loved this place.,"[Wow, ..., Loved, this, place, .]","[wow, ..., loved, this, place, .]"
1,Crust is not good.,0,"[Crust, is, not, good.]",Crust is not good.,"[Crust, is, not, good, .]","[crust, is, not, good, .]"
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, and, the, texture, was, just, nas...",Not tasty and the texture was just nasty.,"[Not, tasty, and, the, texture, was, just, nas...","[not, tasty, and, the, texture, was, just, nas..."
3,Stopped by during the late May bank holiday of...,1,"[Stopped, by, during, the, late, May, bank, ho...",Stopped by during the late May bank holiday of...,"[Stopped, by, during, the, late, May, bank, ho...","[stopped, by, during, the, late, may, bank, ho..."
4,The selection on the menu was great and so wer...,1,"[The, selection, on, the, menu, was, great, an...",The selection on the menu was great and so wer...,"[The, selection, on, the, menu, was, great, an...","[the, selection, on, the, menu, was, great, an..."


#### Remove punctuations

In [9]:
punctuations = string.punctuation
reviews_data['reviews_no_punctuation'] = reviews_data['reviews_lower'].apply(lambda x: [word for word in x if word not in punctuations])
reviews_data.head()

Unnamed: 0,Review,Liked,no_contractions,reviews_no_contractions,reviews_tokenized,reviews_lower,reviews_no_punctuation
0,Wow... Loved this place.,1,"[Wow..., Loved, this, place.]",Wow... Loved this place.,"[Wow, ..., Loved, this, place, .]","[wow, ..., loved, this, place, .]","[wow, ..., loved, this, place]"
1,Crust is not good.,0,"[Crust, is, not, good.]",Crust is not good.,"[Crust, is, not, good, .]","[crust, is, not, good, .]","[crust, is, not, good]"
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, and, the, texture, was, just, nas...",Not tasty and the texture was just nasty.,"[Not, tasty, and, the, texture, was, just, nas...","[not, tasty, and, the, texture, was, just, nas...","[not, tasty, and, the, texture, was, just, nasty]"
3,Stopped by during the late May bank holiday of...,1,"[Stopped, by, during, the, late, May, bank, ho...",Stopped by during the late May bank holiday of...,"[Stopped, by, during, the, late, May, bank, ho...","[stopped, by, during, the, late, may, bank, ho...","[stopped, by, during, the, late, may, bank, ho..."
4,The selection on the menu was great and so wer...,1,"[The, selection, on, the, menu, was, great, an...",The selection on the menu was great and so wer...,"[The, selection, on, the, menu, was, great, an...","[the, selection, on, the, menu, was, great, an...","[the, selection, on, the, menu, was, great, an..."


#### Convert output back to string

In [10]:
reviews_data['reviews_cleaned'] = [' '.join(l) for l in reviews_data['reviews_no_punctuation']]
reviews_data.head()

Unnamed: 0,Review,Liked,no_contractions,reviews_no_contractions,reviews_tokenized,reviews_lower,reviews_no_punctuation,reviews_cleaned
0,Wow... Loved this place.,1,"[Wow..., Loved, this, place.]",Wow... Loved this place.,"[Wow, ..., Loved, this, place, .]","[wow, ..., loved, this, place, .]","[wow, ..., loved, this, place]",wow ... loved this place
1,Crust is not good.,0,"[Crust, is, not, good.]",Crust is not good.,"[Crust, is, not, good, .]","[crust, is, not, good, .]","[crust, is, not, good]",crust is not good
2,Not tasty and the texture was just nasty.,0,"[Not, tasty, and, the, texture, was, just, nas...",Not tasty and the texture was just nasty.,"[Not, tasty, and, the, texture, was, just, nas...","[not, tasty, and, the, texture, was, just, nas...","[not, tasty, and, the, texture, was, just, nasty]",not tasty and the texture was just nasty
3,Stopped by during the late May bank holiday of...,1,"[Stopped, by, during, the, late, May, bank, ho...",Stopped by during the late May bank holiday of...,"[Stopped, by, during, the, late, May, bank, ho...","[stopped, by, during, the, late, may, bank, ho...","[stopped, by, during, the, late, may, bank, ho...",stopped by during the late may bank holiday of...
4,The selection on the menu was great and so wer...,1,"[The, selection, on, the, menu, was, great, an...",The selection on the menu was great and so wer...,"[The, selection, on, the, menu, was, great, an...","[the, selection, on, the, menu, was, great, an...","[the, selection, on, the, menu, was, great, an...",the selection on the menu was great and so wer...


#### Export Cleaned Text data

In [11]:
reviews_data[['Review','Liked','reviews_cleaned']].to_csv('data/cleaned_reviews_data.csv',index = False)