# Yelp Data Cleaning Demo - Zion

## This is a demo of the data cleaning pipeline and the visual checks I go through

## Read in text data

In [70]:
import pandas as pd
import numpy as np

# Load the tab-separated review file. It is read without a header row, so the
# two column names are supplied directly to the reader.
yelpDataset = pd.read_csv(
    'Yelp.txt',
    sep='\t',
    header=None,
    names=['review', 'sentiment'],
    encoding='latin-1',
)

yelpDataset.head()

Unnamed: 0,review,sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Exploring the dataset

### What is the shape of the dataset?

In [71]:
# DataFrame.shape is (row count, column count); unpack it into the two placeholders.
print('Input data has {} rows and {} columns.'.format(*yelpDataset.shape))

Input data has 1000 rows and 2 columns.


### How many reviews are positive/negative?

In [72]:
# Summing a boolean mask counts the rows where the comparison is True.
print('Out of {} rows, {} are positive (1) and {} negative (0).'.format(
    len(yelpDataset),
    (yelpDataset['sentiment'] == 1).sum(),
    (yelpDataset['sentiment'] == 0).sum(),
))



Out of 1000 rows, 500 are positive (1) and 500 negative (0).


### Any missing data?

In [73]:
# Count missing values per column. Each message names the column it checks:
# 'sentiment' holds the label, 'review' holds the text.
print("Number of null in label: {}".format(yelpDataset['sentiment'].isnull().sum()))
print("Number of null in review: {}".format(yelpDataset['review'].isnull().sum()))

Number of null in label: 0
Number of null in label: 0


In [74]:
# Plain-text print of a full-range row slice of the frame.
print(yelpDataset.iloc[:])

                                                review  sentiment
0                             Wow... Loved this place.          1
1                                   Crust is not good.          0
2            Not tasty and the texture was just nasty.          0
3    Stopped by during the late May bank holiday of...          1
4    The selection on the menu was great and so wer...          1
5       Now I am getting angry and I want my damn pho.          0
6                Honeslty it didn't taste THAT fresh.)          0
7    The potatoes were like rubber and you could te...          0
8                            The fries were great too.          1
9                                       A great touch.          1
10                            Service was very prompt.          1
11                                  Would not go back.          0
12   The cashier had no care what so ever on what I...          0
13   I tried the Cape Cod ravoli, chicken,with cran...          1
14   I was

### Remove punctuation and convert to lowercase

In [75]:
import string
# Display the characters in string.punctuation; these are the ones stripped from the reviews below.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [76]:
import string

def removePunct(text: str) -> str:
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Punctuation is removed outright rather than replaced by a space, so words
    separated only by punctuation are joined (``"a,b"`` becomes ``"ab"``).
    All other characters, including whitespace and letter case, are kept as-is.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the non-punctuation characters of ``text`` in
        their original order.
    """
    # A deletion-only translation table lets str.translate drop the unwanted
    # characters in one pass instead of testing each character by hand.
    return text.translate(str.maketrans('', '', string.punctuation))

# Build the cleaned column one review at a time: lowercase the text first,
# then strip its punctuation with removePunct.
yelpDataset['review_clean'] = [
    removePunct(rawReview.lower()) for rawReview in yelpDataset['review']
]

yelpDataset.head()

Unnamed: 0,review,sentiment,review_clean
0,Wow... Loved this place.,1,wow loved this place
1,Crust is not good.,0,crust is not good
2,Not tasty and the texture was just nasty.,0,not tasty and the texture was just nasty
3,Stopped by during the late May bank holiday of...,1,stopped by during the late may bank holiday of...
4,The selection on the menu was great and so wer...,1,the selection on the menu was great and so wer...


In [77]:
# Plain Python list of the cleaned review strings (.tolist() returns a list, not a numpy array).
df1 = yelpDataset['review_clean'].tolist()
# Plain Python list of the sentiment values, taken from the same frame so it lines up with df1.
df2 = yelpDataset['sentiment'].tolist()

# Preview the first five entries of each list.
print(df1[:5])
print(df2[:5])

['wow loved this place', 'crust is not good', 'not tasty and the texture was just nasty', 'stopped by during the late may bank holiday off rick steve recommendation and loved it', 'the selection on the menu was great and so were the prices']
[1, 0, 0, 1, 1]
