In [2]:
import pandas as pd
import ijson
import numpy as np
import json
import io
import os

In [3]:
## We choose data from Las Vegas as an illustrative example.

data = pd.read_csv('LasVegas_tips_after2017.csv')

In [4]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,business_id,text,date,attributes,categories,city,name,stars,review_count
0,8,KPgyqG3MyFUDK7GRbUg51A,Kelly Paun was so full of knowledge and was so...,2017-03-23,,"Landmarks & Historical Buildings, Travel Servi...",Las Vegas,Pink Jeep Tours,4.5,136
1,20,JtNVcqioJhS8GZ-If30oHg,Make sure you sign up for Emerald club before ...,2018-03-29,,"Automotive, Hotels & Travel, Car Rental",Las Vegas,National Car Rental,4.0,224
2,34,gh6__q2WXFuyN8gt6VAnWw,Very delicious food with very fast delivery!,2017-09-24,"{'Alcohol': 'full_bar', 'Ambience': ""{'romanti...","Seafood, Restaurants, Steakhouses, Greek, Fren...",Las Vegas,Vila Algarve,4.5,189
3,55,gh6__q2WXFuyN8gt6VAnWw,Amazing Portuguese food! Linguica (chirizo) is...,2017-12-11,"{'Alcohol': 'full_bar', 'Ambience': ""{'romanti...","Seafood, Restaurants, Steakhouses, Greek, Fren...",Las Vegas,Vila Algarve,4.5,189
4,56,gh6__q2WXFuyN8gt6VAnWw,All the way from Porto portugal lost off love!...,2018-04-10,"{'Alcohol': 'full_bar', 'Ambience': ""{'romanti...","Seafood, Restaurants, Steakhouses, Greek, Fren...",Las Vegas,Vila Algarve,4.5,189


In [5]:
data.dtypes

Unnamed: 0        int64
business_id      object
text             object
date             object
attributes       object
categories       object
city             object
name             object
stars           float64
review_count      int64
dtype: object

## Define a categorical variable

In [6]:
data.index

RangeIndex(start=0, stop=65280, step=1)

In [7]:
stars = np.array(data['stars'])

In [8]:
##1.0 means good; 0.0 means not good, bad ...
stars[stars<4.0] = 0.0
stars[stars>=4.0] = 1.0
data['class'] = stars

In [9]:
data.head(15);

### A glimpse onto the statistics of the class variable

In [10]:
data['stars'].describe()

count    65280.000000
mean         3.908058
std          0.703868
min          1.000000
25%          3.500000
50%          4.000000
75%          4.500000
max          5.000000
Name: stars, dtype: float64

In [11]:
data['class'].describe()

count    65280.000000
mean         0.672733
std          0.469219
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: class, dtype: float64

### Combine the tips column with the stars column

In [13]:
# Create a new column which combines "text" and "stars"

# Turn stars into strings, add a space in front of each item

data['stars'] = data['stars'].map(str).apply(lambda x: ' ' + x);

data['tips_stars'] = data["text"] + data['stars']

In [14]:
data.iloc[98]['tips_stars']

"Park valet. It'll be the best $20 you spend because it gives you free valet parking nearly everywhere else (like Bellagio). Any MGM resort on the Strip is included, and there's many.  3.0"

### Split the dataset into training set and test set 

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train_set, test_set = train_test_split(
...     data, test_size=0.33, random_state=42)

### tf-idf

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer =  TfidfVectorizer(max_features=1200,stop_words='english',ngram_range=(1,3))

In [18]:
vectorizer.fit(data['tips_stars'].values.astype('U'))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1200, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
train_tfidf = vectorizer.transform(train_set['tips_stars'].values.astype('U'))

In [20]:
test_tfidf = vectorizer.transform(test_set['tips_stars'].values.astype('U'))

In [21]:
train_tfidf.shape

(43737, 1200)

In [22]:
test_tfidf.shape

(21543, 1200)

In [176]:
#print(vectorizer.get_feature_names())

In [130]:
#print(test_tfidf)

In [61]:
vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### K Means

In [23]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from sklearn_pandas import DataFrameMapper

## The output of K_Means is an array of labels. 

In [24]:
K = 16
KMeans_model = KMeans(n_clusters=K, init='k-means++', max_iter=100, n_init=1)
KMeans_model.fit(train_tfidf)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=16, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [26]:
KMeans_model.predict(train_tfidf)

array([ 2,  5,  5, ...,  5, 12,  5], dtype=int32)

In [27]:
order_centroids = KMeans_model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(K):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Cluster 0:
 mango
 rice
 tea
 try
 green tea
 green
 good
 boba
 milk
 ice
Cluster 1:
 food amazing
 amazing
 food
 food amazing service
 amazing service
 service
 great
 amazing food
 place
 great food
Cluster 2:
 good
 food
 good food
 service
 food good
 good service
 really good
 really
 service good
 great
Cluster 3:
 best place
 best
 place
 vegas
 place vegas
 place town
 town
 eat
 place eat
 great
Cluster 4:
 place
 great place
 great
 awesome
 nice
 favorite
 good
 nice place
 awesome place
 place great
Cluster 5:
 service
 awesome
 don
 food
 great
 time
 just
 nice
 friendly
 excellent
Cluster 6:
 guys
 great
 thanks
 awesome
 love
 job
 best
 service
 work
 fast
Cluster 7:
 great
 service
 food
 great service
 great food
 food great
 service great
 food great service
 customer
 customer service
Cluster 8:
 fun
 fun place
 place
 great
 good
 atmosphere
 super
 time
 kids
 friendly
Cluster 9:
 love
 love place
 place
 love love
 food
 great
 love food
 service
 love love lo

## Dimension Reducing: PCA

In [28]:
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

n_components = 50

#truncatedSVD 

In [29]:
svd = TruncatedSVD(n_components)

In [31]:
svd.fit(train_tfidf)

TruncatedSVD(algorithm='randomized', n_components=50, n_iter=5,
       random_state=None, tol=0.0)

In [33]:
svd.transform(train_tfidf)

array([[ 1.54680638e-01,  2.21572572e-01,  3.85007115e-01, ...,
        -4.67318771e-02, -3.91596332e-02,  8.24949614e-04],
       [ 3.89008861e-02,  2.52419395e-02,  5.06642724e-03, ...,
        -3.51334873e-02,  2.90343731e-02,  3.30482631e-02],
       [ 9.43313662e-03,  4.42728750e-03,  3.27046629e-04, ...,
         5.87005142e-03,  5.11046812e-03,  7.46401724e-03],
       ...,
       [ 4.55151613e-02,  9.14722905e-03, -5.32490831e-03, ...,
        -2.80178211e-02, -8.93588100e-02,  2.64744936e-02],
       [ 1.22982806e-01,  1.73237072e-01, -7.08034122e-02, ...,
         3.33315879e-02, -9.56859733e-02,  4.92171534e-02],
       [ 5.13251087e-02,  2.03369681e-02, -1.82036793e-03, ...,
        -6.20539241e-02,  4.09402599e-02,  7.48513358e-02]])

In [36]:
svd.fit_transform(train_tfidf)

array([[ 1.54680443e-01,  2.21569187e-01,  3.85000474e-01, ...,
         7.10866682e-02, -3.49273991e-02,  2.48064727e-01],
       [ 3.89010309e-02,  2.52359260e-02,  5.07008116e-03, ...,
        -3.64508112e-02,  2.03548697e-02,  9.09534719e-03],
       [ 9.43341579e-03,  4.42574995e-03,  3.27547440e-04, ...,
        -4.72404461e-03,  3.94038855e-04,  3.22621335e-03],
       ...,
       [ 4.55148521e-02,  9.15253072e-03, -5.32833820e-03, ...,
         1.52547807e-02,  3.63006687e-02, -1.21090900e-02],
       [ 1.22984157e-01,  1.73233391e-01, -7.08044634e-02, ...,
        -1.34381476e-02,  3.99151882e-02, -6.01916098e-02],
       [ 5.13241221e-02,  2.03364541e-02, -1.81739993e-03, ...,
         6.51001196e-03, -4.56209261e-02,  1.19366507e-01]])

In [37]:
train_tfidf

<43737x1200 sparse matrix of type '<class 'numpy.float64'>'
	with 231554 stored elements in Compressed Sparse Row format>

### Logistic Regression

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
train_target = train_set["class"]

In [45]:
clf = LogisticRegression(random_state=0,solver='lbfgs',multi_class='multinomial').fit(train_tfidf,train_target)

In [46]:
clf.predict(train_tfidf)

array([1., 1., 1., ..., 1., 1., 0.])

In [47]:
clf.predict(test_tfidf)

array([1., 1., 1., ..., 1., 1., 0.])

In [48]:
clf.score(train_tfidf, train_target)

0.7227747673594439

In [49]:
clf.score(test_tfidf,test_set["class"])

0.700505964814557

In [50]:
train_set.count()

Unnamed: 0      43737
business_id     43737
text            43736
date            43737
attributes      42609
categories      43722
city            43737
name            43737
stars           43737
review_count    43737
class           43737
tips_stars      43736
dtype: int64

In [204]:
train_set[train_set["class"]==1.0].count()

Unnamed: 0      29505
business_id     29505
text            29504
date            29505
attributes      28946
categories      29501
city            29505
name            29505
stars           29505
review_count    29505
class           29505
tips_stars      29504
dtype: int64

In [205]:
29505/43737

0.6746004527059469

### Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier

  from numpy.core.umath_tests import inner1d


In [57]:
RF_clf = RandomForestClassifier(n_estimators=10, max_depth=4,random_state=42)

In [58]:
RF_clf.fit(train_tfidf,train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [59]:
RF_clf.score(train_tfidf,train_target)

0.6746233166426595

In [60]:
RF_clf.score(test_tfidf,test_set["class"])

0.6689411873926565

In [83]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

documents = ["This little kitty came to play when I was eating at a restaurant.",
             "Merley has the best squooshy kitten belly.",
             "Google Translate app is incredible.",
             "If you open 100 tab in google you get a smiley face.",
             "Best cat photo I've ever taken.",
             "Climbing ninja cat.",
             "Impressed with google map feedback.",
             "Key promoter extension for Google Chrome."]

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)


 

In [84]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)



KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=2, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [85]:
print("Top terms per cluster:")


Top terms per cluster:


In [86]:
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print



Cluster 0:
 google
 ninja
 climbing
 app
 feedback
 impressed
 incredible
 translate
 map
 cat
Cluster 1:
 best
 ve
 photo
 taken
 belly
 merley
 kitten
 squooshy
 restaurant
 came


In [None]:
print("\n")
print("Prediction")

Y = vectorizer.transform(["chrome browser to open."])
prediction = model.predict(Y)
print(prediction)

Y = vectorizer.transform(["My cat is hungry."])
prediction = model.predict(Y)
print(prediction)