In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import re
from collections import Counter
%matplotlib inline

In [16]:
data = pd.read_csv('https://github.com/Thinkful-Ed/data-201-resources/raw/master/hotel-reviews.csv')

In [17]:
data.head(5)

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


---

Cleaning Data and Pre-Processing

In [29]:
# Make everything lower case.
data['reviews.text'] = data['reviews.text'].str.lower()

# Remove non-text characters.
data['reviews.text'] = data['reviews.text'].str.replace(r'\.|\!|\?|\'|,|-|\(|\)', "",)

# Fill in blank reviews with '' rather than Null (which would give us errors).
data['reviews.text'] = data['reviews.text'].fillna('')

---

Calling in specific columns .loc can actually subset more cells look at the pandas cheatsheet

In [33]:
data[data['hotel_country'] == 'US']

Unnamed: 0,hotel_address,categories,hotel_city,hotel_country,latitude,longitude,hotel_name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,really lovely hotel stayed on the very top flo...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,ett mycket bra hotell det som drog ner betyget...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,we stayed here for four nights in october the ...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,we stayed here for four nights in october the ...,������ ���������������,,sungchul,
5,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,we loved staying on the island of lido you nee...,Very nice hotel,,A Traveler,
6,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-06-10T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,lovely view out onto the lagoon excellent view...,Lovely view out onto the lagoon. Excellent view.,,A Traveler,
7,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-05-14T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,ottimo soggiorno e ottima sistemazione nei gio...,Lovely view out onto the lagoon. Excellent view.,,A Traveler,
8,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-09-14T00:00:00Z,2016-10-24T00:00:25Z,,,3.0,gnstiger ausgangspunkt fr venedig besuche ruhi...,G��nstige Lage,,Doppeldecker,
9,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-05-16T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,lidoen er perfekt til et par dages ro og afsla...,Ro og hygge,,A Traveler,


In [32]:
data['hotel_country']

0        US
1        US
2        US
3        US
4        US
5        US
6        US
7        US
8        US
9        US
10       US
11       US
12       US
13       US
14       US
15       US
16       US
17       US
18       US
19       US
20       US
21       US
22       US
23       US
24       US
25       US
26       US
27       US
28       US
29       US
         ..
35882    US
35883    US
35884    US
35885    US
35886    US
35887    US
35888    US
35889    US
35890    US
35891    US
35892    US
35893    US
35894    US
35895    US
35896    US
35897    US
35898    US
35899    US
35900    US
35901    US
35902    US
35903    US
35904    US
35905    US
35906    US
35907    US
35908    US
35909    US
35910    US
35911    US
Name: hotel_country, Length: 35912, dtype: object

---

Vectorized Operations

In [20]:
# Import and initiate a vectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# The max features is how many words we want to allow us to create columns for.
vectorizer = CountVectorizer(max_features=5000)

In [21]:
# Vectorize our reviews to transform sentences into columns.
X = vectorizer.fit_transform(data['reviews.text'])

# And then put all of that in a table.
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [22]:
# Rename some columns for clarity.
data.rename(columns={'address': 'hotel_address', 'city': 'hotel_city',
                     'country':'hotel_country', 'name':'hotel_name'},
            inplace=True)

# Join our bag of words back to our initial hotel data.
full_df = data.join(bag_of_words)

In [23]:
# X is our words.
X = bag_of_words

# Y is our hotel name (the outcome we care about).
Y_hotel = data['hotel_name']

In [24]:
# Import a random forest model.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

# Fit that random forest model to our data. (.fit transforms the model to a training model)
rfc.fit(X,Y_hotel)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

---

Running Tests

In [25]:
# Write your own dream vacation review here...
test_review = ['''
    I loved the beach and the sunshine and the clean and modern room.
    ''']

In [26]:
# Convert your test review into a vector.
X_test = vectorizer.transform(test_review).toarray()

In [27]:
# Match your review.
prediction = rfc.predict(X_test)[0]

In [28]:
# Return the essential information about your match.
data[data['hotel_name'] == prediction][['hotel_name', 'hotel_address', 
                                        'hotel_city', 'hotel_country']].head(1)

Unnamed: 0,hotel_name,hotel_address,hotel_city,hotel_country
4744,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US


In [35]:
data[data['hotel_name'] == 'Super 8 Kansas City Airport']

Unnamed: 0,hotel_address,categories,hotel_city,hotel_country,latitude,longitude,hotel_name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
25988,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2015-08-15T00:00:00Z,2016-11-16T04:02:09Z,,,2.0,i was very unhappy when we got there when i bo...,,,Edie,
25989,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2016-05-11T00:00:00Z,2016-11-16T04:02:09Z,,,3.0,it was ok for a one night stay prior to flying...,Convenient to MCI,,Edie,
25990,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2016-06-23T00:00:00Z,2016-11-16T04:02:09Z,,,4.0,great spot near airport food was a 5 minute dr...,Great place to rest your head on your travels.,,James,
25991,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2015-09-02T00:00:00Z,2016-11-16T04:02:09Z,,,2.0,the door in our room was very ajare very loose...,just for the nite,,A Traveler,
25992,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2016-01-12T00:00:00Z,2016-11-16T04:02:09Z,,,1.0,when we arrived the hotel clerk rushed us thro...,Not a clean option,,Dawn,
25993,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2016-06-10T00:00:00Z,2016-11-16T04:02:09Z,,,1.0,horrible experience none of the online picture...,"Hotel a war zone, unclean, spiders & gunfire o...",,A Traveler,
25994,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2015-12-23T00:00:00Z,2016-11-16T04:02:09Z,,,1.0,accused of vandalizing the room despite not kn...,Accused of vandalizim,,Pader,
25995,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2016-09-08T00:00:00Z,2016-11-16T04:02:09Z,,,4.0,clean convenient to airport helpful front desk...,Convenient and clean hotel.,,A Traveler,
25996,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2016-09-01T00:00:00Z,2016-11-16T04:02:09Z,,,5.0,nice location,Super Great #8,,A Traveler,
25997,11900 N W Plaza Cir,Hotels,Kansas City,US,39.309014,-94.6826,Super 8 Kansas City Airport,64153,MO,2015-10-20T00:00:00Z,2016-11-16T04:02:09Z,,,2.0,bath tub was dirty and towels were old and sta...,Super Great #8,,A Traveler,


---

The decision tree model process

In [38]:
rfc.decision_path(X)

(<35912x390488 sparse matrix of type '<class 'numpy.int64'>'
 	with 88991200 stored elements in Compressed Sparse Row format>,
 array([     0,  39461,  78722, 117877, 156456, 195663, 234634, 273237,
        312184, 351307, 390488], dtype=int32))

In [39]:
rfc.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=1849704594, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False,
             random_state=1853809871, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_we

In [40]:
estimator = rfc.estimators_[8]

from sklearn.tree import export_graphviz
# Export as dot file
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = vectorizer.get_feature_names(),
                class_names = Y_hotel,
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

# Display in jupyter notebook
from IPython.display import Image
Image(filename = 'tree.png')

KeyboardInterrupt: 