In [54]:
import torch
import numpy as np

def convert_review_to_word_list(review):
    """
    Should convert a string containing a review into a list of words in that review
    Things to consider: all lowercase? strip punctuation? keep apostrophes?
    """
    
    for char in review:
        if(not((ord(char) >= 65 and ord(char) <= 90) or (ord(char) >= 97 and ord(char) <= 122))):
            review = review.replace(char, " ");
    
    """ tokenize time """ 
    review = review.split(" ");
    return review;

def generate_dictionary(reviews):
    """
    Given the list of reviews, create a list of all unique words that appear in the review.
    (Sorting a list is a good way to check that all of the words are unique)
    Maybe compare the length of your list with a friend to make sure you did it right. :)
    """
    
    listOfReviewLists = []
    
    for review in reviews:
        listOfReviewLists.append(convert_review_to_word_list(review));
        
    wordList = {}
    
    index = 0;
    
    for reviewList in listOfReviewLists:
        for word in reviewList:
            if word not in wordList:
                wordList[word] = index
                index += 1
            
    return wordList;

def review_to_vector(review, dictionary):
    """
    Given a review as a string, generate a vector where the ith element is the number of times that
    the ith word in the dictionary appears in the review.
    """
    
    wordVector = []
    
    """ set up wordVector """
    for element in dictionary:
        wordVector.append(0);
        
    reviewList = convert_review_to_word_list(review)
    
    for word in reviewList:
        if word in dictionary:
            wordVector[dictionary[word]] += 1
            
    return wordVector
        
def bag_of_words(reviews):
    """
    Given a list of reviews, return a list bagOfWords of length reviews.length,
    with bagOfWords[i] being a list containing the word frequency in reviews[i]
    """
    
    bagOfWords = []
    
    for review in reviews:
        bagOfWords.append(review_to_vector(review, dict))
        
    return bagOfWords
    

reviews = [
    "My absolute favorite Thai food EVER! I've been coming here for year! I now live in Chicago but every time I'm down in Champaign I insist on Bangkok Thai. Their chicken red curry is the best I've ever had! I've had Thai in every major city in the country and this has yet to be beat.",
    "I stopped in for a quick dinner with some friends.  I ordered the chicken pad thai and had crab rangoon for an appetizer.  I like how you can request the different level of spice. I chose level 4, but wish I went spicier!  I thought the food was very good, cheap, quick, and convenient.  I would definitely be back!",
    "Very good price, large portions size, and they are very quick to prepare your order.  Not a fancy restaurant but good for a quick lunch.  The food is very good and you can choose how spicy you want your food.",
    "Tried yellow curry and basil chicken. Both are too overcooked. Chicken are too chewy. The meals are not as amazing as others said. disappointment",
];

In [55]:
print(convert_review_to_word_list(reviews[0]))

print("_____")

dict = generate_dictionary(reviews)
print(dict)

print(review_to_vector(reviews[0], dict))

print(bag_of_words(reviews))

['My', 'absolute', 'favorite', 'Thai', 'food', 'EVER', '', 'I', 've', 'been', 'coming', 'here', 'for', 'year', '', 'I', 'now', 'live', 'in', 'Chicago', 'but', 'every', 'time', 'I', 'm', 'down', 'in', 'Champaign', 'I', 'insist', 'on', 'Bangkok', 'Thai', '', 'Their', 'chicken', 'red', 'curry', 'is', 'the', 'best', 'I', 've', 'ever', 'had', '', 'I', 've', 'had', 'Thai', 'in', 'every', 'major', 'city', 'in', 'the', 'country', 'and', 'this', 'has', 'yet', 'to', 'be', 'beat', '']
_____
{'My': 0, 'absolute': 1, 'favorite': 2, 'Thai': 3, 'food': 4, 'EVER': 5, '': 6, 'I': 7, 've': 8, 'been': 9, 'coming': 10, 'here': 11, 'for': 12, 'year': 13, 'now': 14, 'live': 15, 'in': 16, 'Chicago': 17, 'but': 18, 'every': 19, 'time': 20, 'm': 21, 'down': 22, 'Champaign': 23, 'insist': 24, 'on': 25, 'Bangkok': 26, 'Their': 27, 'chicken': 28, 'red': 29, 'curry': 30, 'is': 31, 'the': 32, 'best': 33, 'ever': 34, 'had': 35, 'major': 36, 'city': 37, 'country': 38, 'and': 39, 'this': 40, 'has': 41, 'yet': 42, 'to'