In [9]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords


In [2]:
# read in data
url ="https://usc-bootcamp-yelpreview-text-analysis.s3.us-east-2.amazonaws.com/reviews.csv"
reviews = pd.read_csv(url)
reviews = reviews.iloc[:,0:2]
reviews.head()


Unnamed: 0,reviews,rating
0,Panda Express was on point tonight! I ordered ...,5 star rating
1,The dude and I came to this Panda Express arou...,5 star rating
2,I ordered 5 total plates fried rice chow mai...,1 star rating
3,I always order Panda Express from here and the...,3 star rating
4,Decided to try Panda Expess one more time.Corp...,5 star rating


In [7]:
def rating_category(rating:str)->str:
  """create new column for label
  """
  if rating in ["1 star rating"]:
      return "bad"
  elif rating in ["2 star rating", "3 star rating"]:
      return "descent"
  else: 
      return "good"

def average_word_length(word_list)->int:
    """calculate the average word length in each review
    """
    word_length = []
    for word in word_list: 
        word_length.append(len(word))
    return np.mean(word_length)

assert average_word_length(["test", "test12"])==5
assert rating_category("1 star rating")=="bad"



In [5]:
# use defined function above to re-create rating column
reviews["rating"] = reviews["rating"].apply(rating_category)
reviews.head()

Unnamed: 0,reviews,rating
0,Panda Express was on point tonight! I ordered ...,good
1,The dude and I came to this Panda Express arou...,good
2,I ordered 5 total plates fried rice chow mai...,bad
3,I always order Panda Express from here and the...,descent
4,Decided to try Panda Expess one more time.Corp...,good


In [6]:
# create word count, and character count
reviews["word_list"] = reviews["reviews"].apply(lambda x: x.split())
reviews["word_count"] = reviews["word_list"].apply(lambda x: len(x))
reviews["char_count"] = reviews["reviews"].apply(lambda x: len(x))
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261


In [8]:
reviews["average_word_length"] = reviews["word_list"].apply(average_word_length)
reviews.head()

Unnamed: 0,reviews,rating,word_list,word_count,char_count,average_word_length
0,Panda Express was on point tonight! I ordered ...,good,"[Panda, Express, was, on, point, tonight!, I, ...",63,334,4.253968
1,The dude and I came to this Panda Express arou...,good,"[The, dude, and, I, came, to, this, Panda, Exp...",149,770,4.167785
2,I ordered 5 total plates fried rice chow mai...,bad,"[I, ordered, 5, total, plates, fried, rice, ch...",28,151,4.357143
3,I always order Panda Express from here and the...,descent,"[I, always, order, Panda, Express, from, here,...",122,628,4.155738
4,Decided to try Panda Expess one more time.Corp...,good,"[Decided, to, try, Panda, Expess, one, more, t...",41,261,5.390244


In [13]:
# remove stop words, and get stopwords percentage
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each