<a href="https://colab.research.google.com/github/SundareshSankaran/snippets-ssmd/blob/master/DS_Your_Vacation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# pandas - data manipulation
import pandas as pd
# matrix calculations and stuff
import numpy as np
# plotting
import matplotlib.pyplot as plt
# imported for no reason
import scipy
import sklearn
# seaborn - data visualization library
import seaborn as sns
# Regex
import re
# counter 
from collections import Counter

# plots inline
%matplotlib inline

#**Hotel Review Workshop Notebook**


This notebook will guide you through the creation of a simple bag of words model for text matching.

In [0]:
# Import the Data Set.
data = pd.read_csv('https://github.com/Thinkful-Ed/data-201-resources/raw/master/hotel-reviews.csv')

In [0]:
# Perform some basic cleaning and character removal.

# Make everything lower case.
data['reviews.text'] = data['reviews.text'].str.lower()

# Remove non-text characters.
data['reviews.text'] = data['reviews.text'].str.replace(r'\.|\!|\?|\'|,|-|\(|\)', "",)

# Fill in blank reviews with '' rather than Null (which would give us errors).
# Remember - fills in null with '' - fillna
data['reviews.text'] = data['reviews.text'].fillna('')

In [0]:
# Import and initiate a vectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# The max features is how many words we want to allow us to create columns for.
vectorizer = CountVectorizer(max_features=5000)

In [12]:
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [0]:
# Vectorize our reviews to transform sentences into volumns.
X = vectorizer.fit_transform(data['reviews.text'])

# And then put all of that in a table.
bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [9]:
bag_of_words.head(10)

Unnamed: 0,00,00am,00pm,10,100,1000,10000,101,1015,10am,...,yummy,zero,zimmer,zion,zona,zone,zoo,zu,zum,zur
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
# Rename some columns for clarity.
data.rename(columns={'address': 'hotel_address', 'city': 'hotel_city',
                     'country':'hotel_country', 'name':'hotel_name'},
            inplace=True)

# Join our bag of words back to our initial hotel data.
full_df = data.join(bag_of_words)

In [11]:
full_df.head(10)

Unnamed: 0,hotel_address,categories,hotel_city,hotel_country,latitude,longitude,hotel_name,postalCode,province,reviews.date,...,yummy,zero,zimmer,zion,zona,zone,zoo,zu,zum,zur
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
5,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-05T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
6,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-06-10T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
7,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-05-14T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0
8,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-09-14T00:00:00Z,...,0,0,1,0,0,0,0,0,0,0
9,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-05-16T00:00:00Z,...,0,0,0,0,0,0,0,0,0,0


In [0]:
# X is our words.
X = bag_of_words

# Y is our hotel name (the outcome we care about).
Y_hotel = data['hotel_name']

In [14]:
# Import a random forest model.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

# Fit that random forest model to our data.
rfc.fit(X,Y_hotel)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

# If you want to run a different test review, start from here.

In [0]:
# Write your own dream vacation review here...
# test_review = ['''
#     I loved the beach and the sunshine and the clean and modern room.
#     ''']
test_review = ['''
    I hated this place because of poor customer service.
    ''']

In [0]:
# Convert your test review into a vector.
X_test = vectorizer.transform(test_review).toarray()

In [30]:
print(vectorizer.transform(test_review))

  (0, 511)	1
  (0, 1194)	1
  (0, 2078)	1
  (0, 3035)	1
  (0, 3281)	1
  (0, 3334)	1
  (0, 3877)	1
  (0, 4418)	1


In [0]:
# Match your review.
prediction = rfc.predict(X_test)[0]

In [32]:
# Return the essential information about your match.
data[data['hotel_name'] == prediction][['hotel_name', 'hotel_address', 
                                        'hotel_city', 'hotel_country']].head(1)

Unnamed: 0,hotel_name,hotel_address,hotel_city,hotel_country
1038,Quality Inn,12439 Northwest Fwy,Houston,US


In [27]:
print(prediction)

Hyatt Place Pittsburgh Cranberry
