## Yelp Business Rating Prediction using Pandas and Sklearn

In [1]:
#importing datasets
import pandas as pd
import os
import json

business=[]
with open('yelp_academic_dataset_business.json') as f:
    for line in f:
        business.append(json.loads(line))
business_df=pd.DataFrame.from_dict(business)

reviews=[]
with open('yelp_academic_dataset_review.json') as f:
    for line in f:
        reviews.append(json.loads(line))
reviews_df=pd.DataFrame.from_dict(reviews)

## Data Preprocessing and Exploration

In [2]:
#know the shape of dataframes
print(business_df.shape)
print(reviews_df.shape)

(188593, 15)
(5996996, 9)


In [3]:
#to display the whole dataframe width
pd.set_option('display.max_colwidth',-1)

In [4]:
#dataframe columns
print(business_df.columns)
print(reviews_df.columns)

Index(['address', 'attributes', 'business_id', 'categories', 'city', 'hours',
       'is_open', 'latitude', 'longitude', 'name', 'neighborhood',
       'postal_code', 'review_count', 'stars', 'state'],
      dtype='object')
Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'stars', 'text',
       'useful', 'user_id'],
      dtype='object')


In [5]:
#overview of business dataframe
business_df.head(1)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,1314 44 Avenue NE,"{'BikeParking': 'False', 'BusinessAcceptsCreditCards': 'True', 'BusinessParking': '{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}', 'GoodForKids': 'True', 'HasTV': 'True', 'NoiseLevel': 'average', 'OutdoorSeating': 'False', 'RestaurantsAttire': 'casual', 'RestaurantsDelivery': 'False', 'RestaurantsGoodForGroups': 'True', 'RestaurantsPriceRange2': '2', 'RestaurantsReservations': 'True', 'RestaurantsTakeOut': 'True'}",Apn5Q_b6Nz61Tq4XzPdf9A,"Tours, Breweries, Pizza, Restaurants, Food, Hotels & Travel",Calgary,"{'Monday': '8:30-17:0', 'Tuesday': '11:0-21:0', 'Wednesday': '11:0-21:0', 'Thursday': '11:0-21:0', 'Friday': '11:0-21:0', 'Saturday': '11:0-21:0'}",1,51.091813,-114.031675,Minhas Micro Brewery,,T2E 6L6,24,4.0,AB


In [6]:
#overview of reviews dataframe
reviews_df.head(1)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,iCQpiavjjPzJ5_3gPD5Ebg,0,2011-02-25,0,x7mDIiDB3jEiPGPHOmDzyw,2,"The pizza was okay. Not the best I've had. I prefer Biaggio's on Flamingo / Fort Apache. The chef there can make a MUCH better NY style pizza. The pizzeria @ Cosmo was over priced for the quality and lack of personality in the food. Biaggio's is a much better pick if youre going for italian - family owned, home made recipes, people that actually CARE if you like their food. You dont get that at a pizzeria in a casino. I dont care what you say...",0,msQe1u7Z_XuqjGoqhB0J5g


In [7]:
#to know datatypes of each column in reviews
reviews_df.dtypes

business_id    object
cool           int64 
date           object
funny          int64 
review_id      object
stars          int64 
text           object
useful         int64 
user_id        object
dtype: object

In [8]:
#checking each column to know how many null values business dataframe has
business_df.isnull().sum()

address         0    
attributes      25786
business_id     0    
categories      541  
city            0    
hours           44802
is_open         0    
latitude        6    
longitude       6    
name            0    
neighborhood    0    
postal_code     0    
review_count    0    
stars           0    
state           0    
dtype: int64

In [9]:
#checking each column to know how many null values reviews dataframe has
reviews_df.isnull().sum()

business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
useful         0
user_id        0
dtype: int64

In [10]:
#code to group all the reviews by each business
review_df_agg=reviews_df.groupby('business_id')['text'].sum()

In [11]:
#renaming the columns business_id for index and all_reviews for all aggregated reviews
review_df_for_sklearn=pd.DataFrame({'business_id':review_df_agg.index,'all_reviews':review_df_agg.values})

In [12]:
#Merging both reviews and business dataframes on column business_id as it the unique and common column for both dataframes
review_business=pd.merge(review_df_for_sklearn,business_df,on='business_id',how='left')

In [13]:
#checking the shape of merged dataframe
review_business.shape

(188593, 16)

In [14]:
#taking the sample of  10% from the whole dataframe 
review_business_sample=review_business.sample(frac=.10)
print(review_business_sample.shape)

(18859, 16)


In [15]:
#overview of merged dataframe
review_business_sample.head(1)

Unnamed: 0,business_id,all_reviews,address,attributes,categories,city,hours,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
158380,pgvv3Fi0HiD0ateqLcqyZQ,"Wonderful breakfast café! My family and I come every Sunday! Great food, staff, and service!Been wanting to check this place out for quite some time. First time yesterday, Aug 24, wife and I went there for breakfast.\nI ordered the country fried steak and eggs...most expensive item on menu, $12.49. New York steak & eggs cheaper.\nNot a good deal. Flavor was very poor. Gravy had no flavor at all!! Very disappointing. Most flavorful was the Smuckers jelly.\nOne waiter who was obviously overwhelmed with 5 tables!\nWon't be going back!I really love the selection here! The sweet potato pancakes are my favorite indulgence here! I also got an egg white omelette that had goat cheese, spinach, and sun dried tomatoes in it! It was absolutely amazing! You really can't go wrong at this breakfast/lunch spot because it has a homey feel and something on the menu that everyone can enjoy. Love this placeI have eaten her 4 time, its a family or neighborhood kind of place. \nI take my business clients here an the just love the food and the friendly\natmosphere. The food is cooked to order.I love this place! Never felt more at home at a restaurant... The food is incredible! I loved the chicken and waffles! Service was great and servers were very personable. I will definitely be back!My family and I went to eat breakfast there this morning and we were all very impressed! The food was amazing. Things like chicken enchilada omelette, the best burger. They make their own ground beef! The chef Andre is fantastic. Reasonable prices, especially for the quality of the food. We will definitely be back!!Great staff! Great food! Comfortable environment! You should try the homemade pancakes! They are delicious! My husband had the protein omelet and he liked that tooMan what services and , homely first name bases makes it so AWSOME , what a diamond in the ruff this place is a MUSTI've eaten breakfast here a few times. It is always fresh and the menu offers a lot of variety. The service is great - super friendly and helpful. \nA MUST if you want fresh, yummy and casual atmosphere!My 1st visit here. Grabbing a quick brunch with my adult daughter, son, & daughter in law. Cheerful service, flexible menu, comfortable seating. Bright well-lit dining area. Our server said their lunch specialty was burgers, & that they ground their own meat& the burgers were tender & juicy.\nI, however am not a burger person, so I ordered the BLTA with sweet potato fries. It was delish!\nAwaiting their meals now....\nmmmmm, looks, smells & apparently tastes great! The teriyaki grilled chicken w/ side of fruit, a winner, as were the scrambled eggs & pancakes. All cooked to perfection!I eat at restaurants every morning. With the over rated Cracked Eggs of the world and a myriad of chains saturating the city, it is hard to find a homey place that serves good food. Well I found it. Sun City Cafe is an unexpected pleasure. The fruit is cut fresh every day. The omelets are all delicious. The orange pancakes are surprisingly tasty. The gem is the sweet potato waffle, naturally sweetened by the sweet potato, it is both healthier and tastier than any waffle I have eaten.. The service is fantastic. When an error was made, they were quick to fix it. They also made sure I did not pay. Terrific food at a fair price, where they make you feel at home. Highly recommended.It our first time being at this cafe and its great, food selections are amazing and I really liked my ""chicken n waffles"" and my wife loved her breakfast wrap, I will definitely come here again.This was our second visit to Sun City cafe. It only got better the second time around. We all had breakfast and the service was attentive and friendly and the food so far a cut above any other better known places for breakfast in the area.\n\nSince Sun City is a retirement community so all of the items have mild salsa since most seniors don't like spicy food. But there spicy salsa was divine. I had the jalapeno omelet and we also had pancakes and another omelet. Everything was made fresh, served hot and with warm friendly service. Ask for the spicy salsa on the side or tell them to make it spicy.\n\nForget Babystacks, The Cracked Egg and IHOP or denny's. This is the place to go. The menu is large, breakfast is served all day and you can actually sense the pride they have. It seems family run but I could be wrong. They even stock flavored creamers and will ask you which ones you want. the coffee was nothing to brag about though. \n\nThey have inside and outdoor seating, lots of parking and yes even golf cart accessible.\n\nThis will become our standard place when we want to go out for breakfast. We will have to try their lunches next.Another 3.5-4* kinda place IMO, bumped up because the service is stellar and my folks and their posse are regulars. Nestled in a quiet car park in Sun City Summerlin, the Cafe serves up tasty breakfast and lunch offerings with specials daily. Service is what bumped the rating up, the staff is friendly and super kind. \n\nThe dining room is welcoming and there's even a few tables outside for mild weather dining. Breakfast/brunch is the most popular time to hit them up, but there's never a long wait as in many nearby hotspots. Prices are super reasonable and they split checks without breaking balls =) \n\nAside from the ""Mom and Pop"" vibe, the things I most appreciate are the variety of offerings (eggs, pancakes, French toast, burgers, soups, salads and sandwiches...) and their patience with Senior diners. Also, their fruit is very fresh, and that is a big plus for me as many places offer abysmal fruit.\n\nSo, while they aren't going to win any culinary or design innovation awards, it is quite a solid choice to chow down without the chaos of other similar venues.Decent at best. Got pulled into this place because it had 5 stars. Turns out it was only reviewed by one user (probably the owner). I'm fixing that. Not a bad place but not a great one either. Probably on the lower end of the spectrum.Most of our hiking group got together on Friday for breakfast and Sun City Café was recommended by one of our group who had been here and they knew they could handle a group our size (11 or 13 this time--I didn't take roll).\n\nThe server was good enough to ask if we wanted separate checks and he did that for each and every one of us--above and beyond! \n\nI ordered the special of the day which was a Tex/Mex kind of omelette made with pulled pork and topped with salsa verde and some tortilla strips. You had a choice of hash browns or breakfast potatoes with onions and peppers. I went with the breakfast potatoes. Some in the group subbed fresh fruit for potato and toast. I had rye toast.\n\nThe omelette was OK. Even though the kitchen was busy with our orders and the other customers, the wait was not long. It wouldn't have mattered as we were engaged in good conversation and much kidding. \n\nThe restaurant is a very pleasant place and I'm sure I'll be back in the future. There were some pancakes on the menu that sounded great!Having trouble finding a great brunch place with no lines?\n\nThis place is never crowded, even at peak weekend brunch times. And everything we've had was delicious - they even make an awesome chicken fried steak. Would highly recommend for anyone like us with toddlers in tow who can't stand long waits.Underrated cafe...\n\nSun City Cafe is located in the retirement community of Sun City along Del Webb Blvd in Summerlin.\n\nLas Vegas has an interesting array of cafés, ranging from the outlandish to the plain old dumpster dive that you would see on Tv getting closed down. Sun City Cafe is neither outlandish nor a a dump. This is your local cafe that mom and pop would come to.\n\nThe times that I have come here I've never had a single complaint. The service is always great, the staff have always been friendly and kind. Comparing to cafés cracked egg etc lines are usually non existent, except Maybe on Sunday afternoons, there are quite a few churches in Sun City.\n\nThe recommendations from here would be the French Toast, I found it to be quite tasty. General menu items have a variety of options on the menu from Burgers to vegetarian meals. I never did see a stand out or signature dish that screamed off the page but the menu has something for everyone. Decent service, with decent food.\n\nOverall \nPrice: $$\nFood: 3-5\nDrink: 3-5\nCome back: yesSun City Cafe is one of the best spots in Las Vegas. The service is very fast the food is delicious and very creative. You can get all types of genre in food with Sun City Cafe. From healthy vegetarian to meat eating lovers. It has a mix of Spanish food, Asian cuisin,e and of course good ole American burgers. You can never go wrong with Sun City Cafe!LOVE this place! This is hands down my favorite spot for a weekend brunch. With great food, extremely friendly staff, and a pet-friendly patio, this is a weekend staple!Good food, clean, friendly , medium wait time. Good semi quiet atmosphere. Can't go wrong here. !!Awesome mom and pops joint! Great service! Not to mention the food was amazing. I am a Vegan and they completely customized my meal to cater to my needs. I will definitely be back! Check it out :-)Costco business center food. Just your very basic breakfast at a decent price.","9320 Sun City Blvd, Ste 101","{'Alcohol': 'none', 'Ambience': '{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}', 'BikeParking': 'False', 'BusinessAcceptsCreditCards': 'True', 'BusinessParking': '{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}', 'GoodForKids': 'True', 'GoodForMeal': '{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'breakfast': True, 'brunch': False}', 'HasTV': 'False', 'NoiseLevel': 'average', 'OutdoorSeating': 'True', 'RestaurantsAttire': 'casual', 'RestaurantsDelivery': 'False', 'RestaurantsGoodForGroups': 'True', 'RestaurantsPriceRange2': '1', 'RestaurantsReservations': 'False', 'RestaurantsTableService': 'True', 'RestaurantsTakeOut': 'True', 'WiFi': 'free'}","Breakfast & Brunch, Restaurants",Las Vegas,,0,36.211716,-115.298329,Sun City Café,Summerlin,89134,23,4.0,NV


In [16]:
#as the review_count has different values it needs to be normalized
from scipy.stats import zscore

review_count_normalized=zscore(review_business_sample['review_count'])
review_count_normalized.shape #normalized review_count

(18859,)

## TF-IDF(Term Frequency- Inverse Document Frequency): It is a numerical statistic that is intended to reflect how important a word is in document in a collection. The tf-idf value increases proportionality to the number of times a word appears in the document and is offset by the number of documnets in the corpus that contain the word.

In [17]:
#using tf-idf to do feature extraction from review contents for models
import sklearn.feature_extraction.text as sk_text
vectorizer=sk_text.TfidfVectorizer(stop_words='english',max_features=1000,min_df=2)
matrix=vectorizer.fit_transform(review_business_sample['all_reviews'])

print(type(matrix))
print(matrix.toarray())

#print(vectorizer.get_feature_names())

<class 'scipy.sparse.csr.csr_matrix'>
[[0.         0.         0.         ... 0.         0.         0.02239041]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.00399828 0.         ... 0.00597962 0.         0.00640026]
 [0.         0.         0.         ... 0.         0.         0.01978279]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [18]:
#converting the matrix to array format
tfif_data=matrix.toarray()


In [19]:
#know the shape of the previous array
tfif_data.shape

(18859, 1000)

In [20]:
#merging review_count and tfid_data arrays
import numpy as np
concat_matrix=np.column_stack((tfif_data,review_count_normalized))
concat_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.02239041, -0.08720993],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.26487099],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.29622294],
       ...,
       [ 0.        ,  0.00399828,  0.        , ...,  0.        ,
         0.00640026,  0.22630959],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.01978279, -0.00360472],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , -0.29622294]])

## Models

## Linear Regression - Regression Model

In [21]:
#applying linear regression model by giving X as concat matrix and y as stars column

# import the class
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

X=concat_matrix
y=review_business_sample['stars']

#dividing the data in to training and test dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2,random_state =1)
print(X_train.shape)
print(X_test.shape)

(15087, 1001)
(3772, 1001)


In [57]:
# instantiate the model (using the default parameters)
lr=LinearRegression()

# fit the model with data
lr.fit(X_train,y_train)

pred = lr.predict(X_test)

score = metrics.mean_squared_error(pred,y_test)
print("Final score (MSE): {}".format(score))

# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))

Final score (MSE): 0.2735314297851912
Final score (RMSE): 0.5230023229252344


## Classification Models

In [27]:
import pandas as pd
import numpy as np
import os , io
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#one hot encoding on stars column 
le = preprocessing.LabelEncoder()
le.fit(review_business_sample['stars'])

list(le.classes_)

y = le.transform(review_business_sample['stars'])

In [28]:
#splitting the data in to training and test datasets
X_train, X_test,y_train,y_test = train_test_split(concat_matrix,y,test_size=0.25,random_state=1)

## Support Vector Machine Model

In [29]:
from sklearn.svm import SVC

clf = SVC()
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)

In [30]:
from sklearn import metrics
import matplotlib.pyplot as plt
print('Accuracy of SVM on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of SVM on test set: 0.18


## Nearest Neighbors Model

In [67]:
#knn classfrom sklearn.neighbors import KNeighborsClassifierifier
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [68]:
from sklearn import metrics
print('Accuracy of nearest neighbor classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))

Accuracy of nearest neighbor classifier on test set: 0.28


## Logistic Regression Model

In [63]:
#logistic Regression

# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [65]:
from sklearn import metrics
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test))) 


Accuracy of logistic regression classifier on test set: 0.40


## Min-Max Normalization for Multionominal Naive Bayes as it does not take negative values which happens in z-score normalization.

In [70]:
minmax=review_business_sample['review_count'].max()-review_business_sample['review_count'].min()
nmn_reviewcount=(review_business_sample['review_count']-review_business_sample['review_count'].min())/minmax
nmn_rc_nparray=nmn_reviewcount.values
nmn_rc_nparray.shape
import numpy as np
concat_matrix=np.column_stack((tfif_data,nmn_rc_nparray))
X=concat_matrix

## Multi-Nominal Naive Bayes Model

In [71]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB()
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.3,random_state=0)

clf.fit(X_train,y_train)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [72]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

y_pred=clf.predict(X_test)
print('Accuracy of Multinomial on test set: {:.2f}'.format(clf.score(X_test, y_test)))

Accuracy of Multinomial on test set: 0.32


## Yelp Business Rating Prediction using Tensor Flow

In [73]:
import tensorflow as tf
print("Tensor Flow Version: {}".format(tf.__version__))

  from ._conv import register_converters as _register_converters


Tensor Flow Version: 1.11.0


In [74]:
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

x=concat_matrix

le = preprocessing.LabelEncoder()
le.fit(review_business_sample['stars'])

list(le.classes_)

y = le.transform(review_business_sample['stars'])

# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

Using TensorFlow backend.


# Tensorflow Regression

In [75]:
from keras.models import Sequential
from keras.layers.core import Dense, Activation
import pandas as pd
import io
import requests
import numpy as np
from sklearn import metrics

model = Sequential()

model.add(Dense(25, input_dim=x.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output

model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(x,y,verbose=2,epochs=100)    # Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch.

pred = model.predict(x)
print("Shape: {}".format(pred.shape))
print(pred)

# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y))
print("Final score (RMSE): {}".format(score))

Epoch 1/100
 - 22s - loss: 5.1411
Epoch 2/100
 - 1s - loss: 1.2170
Epoch 3/100
 - 1s - loss: 1.0568
Epoch 4/100
 - 1s - loss: 1.0013
Epoch 5/100
 - 1s - loss: 0.9167
Epoch 6/100
 - 1s - loss: 0.8254
Epoch 7/100
 - 1s - loss: 0.7559
Epoch 8/100
 - 1s - loss: 0.6939
Epoch 9/100
 - 1s - loss: 0.6362
Epoch 10/100
 - 1s - loss: 0.5842
Epoch 11/100
 - 1s - loss: 0.5342
Epoch 12/100
 - 1s - loss: 0.4953
Epoch 13/100
 - 1s - loss: 0.4596
Epoch 14/100
 - 1s - loss: 0.4307
Epoch 15/100
 - 1s - loss: 0.4088
Epoch 16/100
 - 1s - loss: 0.3881
Epoch 17/100
 - 1s - loss: 0.3711
Epoch 18/100
 - 1s - loss: 0.3531
Epoch 19/100
 - 1s - loss: 0.3384
Epoch 20/100
 - 1s - loss: 0.3272
Epoch 21/100
 - 1s - loss: 0.3128
Epoch 22/100
 - 1s - loss: 0.3035
Epoch 23/100
 - 1s - loss: 0.2915
Epoch 24/100
 - 1s - loss: 0.2815
Epoch 25/100
 - 1s - loss: 0.2729
Epoch 26/100
 - 1s - loss: 0.2667
Epoch 27/100
 - 1s - loss: 0.2576
Epoch 28/100
 - 1s - loss: 0.2484
Epoch 29/100
 - 1s - loss: 0.2433
Epoch 30/100
 - 1s - l

In [79]:
# Define ModelCheckpoint outside the loop
checkpointer = ModelCheckpoint(filepath="/Users/ravalik/Desktop/215/best_weight1.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    model.add(Dense(100, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(1)) # Output

    model.compile(loss='mean_squared_error', optimizer='rmsprop')

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')  
print()
model.load_weights("/Users/ravalik/Desktop/215/best_weight1.hdf5") # load weights from best model

# Predict
pred = model.predict(x_test)

# Measure MSE error.  
score = metrics.mean_squared_error(pred,y_test)
print("Final score (MSE): {}".format(score))

# Measure RMSE error.  RMSE is common for regression.
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Final score (RMSE): {}".format(score))



0
Train on 14144 samples, validate on 4715 samples
Epoch 1/100
 - 18s - loss: 2.2930 - val_loss: 1.2893
Epoch 2/100
 - 1s - loss: 1.0313 - val_loss: 1.1105
Epoch 3/100
 - 1s - loss: 0.8669 - val_loss: 1.0184
Epoch 4/100
 - 1s - loss: 0.7490 - val_loss: 1.0960
Epoch 5/100
 - 1s - loss: 0.6400 - val_loss: 1.0628
Epoch 6/100
 - 1s - loss: 0.5486 - val_loss: 1.0629
Epoch 7/100
 - 1s - loss: 0.4600 - val_loss: 1.1529
Epoch 8/100
 - 1s - loss: 0.3829 - val_loss: 1.1153
Epoch 00008: early stopping
1
Train on 14144 samples, validate on 4715 samples
Epoch 1/100
 - 11s - loss: 2.2992 - val_loss: 1.7150
Epoch 2/100
 - 1s - loss: 1.0152 - val_loss: 1.0788
Epoch 3/100
 - 1s - loss: 0.8585 - val_loss: 1.1452
Epoch 4/100
 - 1s - loss: 0.7414 - val_loss: 1.2899
Epoch 5/100
 - 1s - loss: 0.6383 - val_loss: 1.0270
Epoch 6/100
 - 1s - loss: 0.5323 - val_loss: 1.0560
Epoch 7/100
 - 1s - loss: 0.4374 - val_loss: 1.0762
Epoch 8/100
 - 1s - loss: 0.3632 - val_loss: 1.1083
Epoch 9/100
 - 1s - loss: 0.3089 - v

## Tensor Flow Classification

In [80]:
import pandas as pd
import io , os
import requests
import numpy as np
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing

le1 = preprocessing.LabelEncoder()
le1.fit(review_business_sample['stars'])

list(le1.classes_)
y1=le1.transform(review_business_sample['stars'])


In [81]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)


y1= encode_text_dummy(review_business_sample,"stars")   #one-hot coding
y1= review_business_sample
print(y1.columns)

Index(['business_id', 'all_reviews', 'address', 'attributes', 'categories',
       'city', 'hours', 'is_open', 'latitude', 'longitude', 'name',
       'neighborhood', 'postal_code', 'review_count', 'state', 'stars-1.0',
       'stars-1.5', 'stars-2.0', 'stars-2.5', 'stars-3.0', 'stars-3.5',
       'stars-4.0', 'stars-4.5', 'stars-5.0'],
      dtype='object')


In [82]:
y1=y1[['stars-1.0', 'stars-1.5','stars-2.0', 'stars-2.5', 'stars-3.0', 'stars-3.5', 'stars-4.0','stars-4.5', 'stars-5.0']]
y1=y1.values
type(y1)
y1

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=uint8)

In [83]:
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y1, test_size=0.25, random_state=42)


# Define ModelCheckpoint outside the loop
checkpointer = ModelCheckpoint(filepath="/Users/ravalik/Desktop/215/best_weights1.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    model.add(Dense(20, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(y1.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')  
print()
model.load_weights("/Users/ravalik/Desktop/215/best_weights1.hdf5",) # load weights from best model

# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)

y_true = np.argmax(y_test,axis=1)

score = metrics.accuracy_score(y_true, pred)
print("Final accuracy: {}".format(score))

0
Train on 14144 samples, validate on 4715 samples
Epoch 1/100
 - 15s - loss: 1.8527 - val_loss: 1.5115
Epoch 2/100
 - 1s - loss: 1.3658 - val_loss: 1.3322
Epoch 3/100
 - 1s - loss: 1.2422 - val_loss: 1.2760
Epoch 4/100
 - 1s - loss: 1.1797 - val_loss: 1.2623
Epoch 5/100
 - 1s - loss: 1.1416 - val_loss: 1.2508
Epoch 6/100
 - 1s - loss: 1.1150 - val_loss: 1.2512
Epoch 7/100
 - 1s - loss: 1.0937 - val_loss: 1.2529
Epoch 8/100
 - 1s - loss: 1.0773 - val_loss: 1.2564
Epoch 9/100
 - 1s - loss: 1.0595 - val_loss: 1.2573
Epoch 10/100
 - 1s - loss: 1.0434 - val_loss: 1.2647
Epoch 00010: early stopping
1
Train on 14144 samples, validate on 4715 samples
Epoch 1/100
 - 6s - loss: 1.7525 - val_loss: 1.4384
Epoch 2/100
 - 1s - loss: 1.3261 - val_loss: 1.3038
Epoch 3/100
 - 1s - loss: 1.2185 - val_loss: 1.2652
Epoch 4/100
 - 1s - loss: 1.1664 - val_loss: 1.2472
Epoch 5/100
 - 1s - loss: 1.1273 - val_loss: 1.2433
Epoch 6/100
 - 1s - loss: 1.1016 - val_loss: 1.2539
Epoch 7/100
 - 1s - loss: 1.0736 - v

# Final Results

# Linear Regression Model = 52.3%
# Support Vector Machine Model = 18%
# Nearest neighbors Model = 28%
# Logistic Regression = 40%
# Multinominal Naive Bayes = 32 %
# Tensor Flow Regression = 99.78%
# Tensor Flow Classification = 49.73%
