In [81]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so any deprecation or convergence warnings that later cells might
# raise are hidden as well.
warnings.filterwarnings('ignore')

## Bag of words

In [87]:
# Two tiny documents used to illustrate the bag-of-words representation.
text = [
    'Hi, how are you you?',
    'I am Fine. You?',
]
print("Suppose. This is our text data : ", text)

Suppose. This is our text data :  ['Hi, how are you you?', 'I am Fine. You?']


In [88]:
# Learn the word vocabulary of the toy corpus.
bow_converter = CountVectorizer()
bow_converter.fit(text)

# `get_feature_names()` is not available in recent scikit-learn releases.
# `vocabulary_` maps each term to its column index, so ordering the terms by
# that index gives the feature names as a plain list on any version.
words = sorted(bow_converter.vocabulary_, key=bow_converter.vocabulary_.get)
print("Words of Bag-of-words : ",words)

Words of Bag-of-words :  ['am', 'are', 'fine', 'hi', 'how', 'you']


In [89]:
# Count how often each vocabulary word occurs in each document, then convert
# the result to a dense array so the small matrix prints in full.
doc_term_counts = bow_converter.transform(text)
features = doc_term_counts.toarray()
print(features)

[[0 1 0 1 1 2]
 [1 0 1 0 0 1]]


In [90]:
# Label the count matrix: one row per document, one column per vocabulary word.
# Column labels are taken from `vocabulary_` (term -> column index) ordered by
# index, which does not depend on `get_feature_names()` being available in the
# installed scikit-learn.
bow_columns = sorted(bow_converter.vocabulary_, key=bow_converter.vocabulary_.get)
frequency_matrix = pd.DataFrame(features, index=text, columns=bow_columns)
frequency_matrix

Unnamed: 0,am,are,fine,hi,how,you
"Hi, how are you you?",0,1,0,1,1,2
I am Fine. You?,1,0,1,0,0,1


## Bag of N-grams

In [91]:
# Two tiny documents used to illustrate the bag-of-n-grams representation.
text = [
    'Hi, how are you?',
    'Fine. You?',
]
print("Suppose. This is our text data : ", text)

Suppose. This is our text data :  ['Hi, how are you?', 'Fine. You?']


In [95]:
# ngram_range=(2,2) keeps only two-word sequences (bigrams).
bigram_converter = CountVectorizer(ngram_range=(2,2))
bigram_converter.fit(text)

# `get_feature_names()` is not available in recent scikit-learn releases.
# `vocabulary_` maps each n-gram to its column index, so ordering by that
# index gives the feature names as a plain list on any version.
bigrams = sorted(bigram_converter.vocabulary_, key=bigram_converter.vocabulary_.get)

# The label now says what is printed: these are bigrams, not single words.
print("Bigrams of Bag-of-n-grams : ",bigrams)

Words of Bag-of-words :  ['are you', 'fine you', 'hi how', 'how are']


In [96]:
# Bigram counts per document, shown as a labelled table
# (rows are documents, columns are bigrams).
bigram_counts = bigram_converter.transform(text)
features = bigram_counts.toarray()
frequency_matrix = pd.DataFrame(data=features, index=text, columns=bigrams)
frequency_matrix

Unnamed: 0,are you,fine you,hi how,how are
"Hi, how are you?",1,0,1,1
Fine. You?,0,1,0,0


In [97]:
# Print the unigram and bigram vocabularies together with their sizes.
summary_lines = [
    f'{words}',
    f'unigram count: {len(words)} ',
    f'{bigrams} ',
    f'bigram count: {len(bigrams)}',
]
print('\n'.join(summary_lines))

['am', 'are', 'fine', 'hi', 'how', 'you']
unigram count: 6 
['are you', 'fine you', 'hi how', 'how are'] 
bigram count: 4


## Bag of words Vs. Bag of N-grams

In [98]:
import json
from itertools import islice

# Read only the first 10,000 reviews; the file holds one JSON object per line.
# - `with` closes the handle even when a line fails to parse.
# - `islice` stops at end-of-file, so a file with fewer than 10,000 lines no
#   longer passes an empty string to json.loads.
# - blank lines are skipped because they are not valid JSON documents.
with open('data/yelp_academic_dataset_review.json', encoding='utf-8') as file:
    json_file = [json.loads(line) for line in islice(file, 10000) if line.strip()]

review_df = pd.DataFrame(json_file)
review_df.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


In [99]:
# Fit a unigram and a bigram vectorizer on the same review texts so that the
# sizes of the two vocabularies can be compared.
bow_converter = CountVectorizer()
bigram_converter = CountVectorizer(ngram_range=(2,2))

bow_converter.fit(review_df['text'])
bigram_converter.fit(review_df['text'])

# `get_feature_names()` is not available in recent scikit-learn releases.
# `vocabulary_` maps each term to its column index, so ordering by that index
# gives the feature names as a plain list on any version.
words = sorted(bow_converter.vocabulary_, key=bow_converter.vocabulary_.get)
bigrams = sorted(bigram_converter.vocabulary_, key=bigram_converter.vocabulary_.get)

In [100]:
# Vocabulary size.
# The report is assembled from separate lines: a backslash continuation inside
# the string literal copies the next source line's indentation into the
# printed text. Floor division keeps the ratio an exact integer operation.
size_report = '\n'.join([
    f'Unigram size : {len(words)}',
    f'Bigram size : {len(bigrams)}',
    f'{len(bigrams) // len(words)} times bigger vocabulary',
])
print(size_report)

Unigram size : 29185 
Bigram size : 385638        
13 times bigger vocabulary


## Tf-Idf (Term frequency–Inverse document frequency)

<img src="image/tfids.png"  width="400" />


https://stackoverflow.com/questions/42440621/how-term-frequency-is-calculated-in-tfidfvectorizer

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# Toy corpus: three short sentences that share "it is a", plus one longer one.
text = ['it is a puppy', 
        'it is a cat', 
        'it is a kitten', 
        'that is a dog and this is a pen']
tfidf = TfidfVectorizer()
tfidf.fit(text)

# `get_feature_names()` is not available in recent scikit-learn releases.
# `vocabulary_` maps each term to its column index, so ordering by that index
# gives the feature names as a plain list on any version.
tf = sorted(tfidf.vocabulary_, key=tfidf.vocabulary_.get)
tf

['and', 'cat', 'dog', 'is', 'it', 'kitten', 'pen', 'puppy', 'that', 'this']

In [102]:
# Tf-idf weight of every term in every document, converted to a dense array
# so it can be printed in full.
tfidf_weights = tfidf.transform(text)
features = tfidf_weights.toarray()
print(features)

# The same matrix as a labelled table: rows are documents, columns are terms.
frequency_matrix = pd.DataFrame(data=features, index=text, columns=tf)
frequency_matrix

[[0.         0.         0.         0.40264194 0.49248889 0.
  0.         0.77157901 0.         0.        ]
 [0.         0.77157901 0.         0.40264194 0.49248889 0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.40264194 0.49248889 0.77157901
  0.         0.         0.         0.        ]
 [0.4052446  0.         0.4052446  0.42294689 0.         0.
  0.4052446  0.         0.4052446  0.4052446 ]]


Unnamed: 0,and,cat,dog,is,it,kitten,pen,puppy,that,this
it is a puppy,0.0,0.0,0.0,0.402642,0.492489,0.0,0.0,0.771579,0.0,0.0
it is a cat,0.0,0.771579,0.0,0.402642,0.492489,0.0,0.0,0.0,0.0,0.0
it is a kitten,0.0,0.0,0.0,0.402642,0.492489,0.771579,0.0,0.0,0.0,0.0
that is a dog and this is a pen,0.405245,0.0,0.405245,0.422947,0.0,0.0,0.405245,0.0,0.405245,0.405245


In [103]:
import math

# Textbook tf-idf for the term "puppy" in the four-document toy corpus: the
# term occurs once in its document and appears in 1 of the 4 documents.
# Dedicated names are used so that the feature-name list `tf`, which the
# tf-idf table cell needs, is not overwritten by a number.
n_docs = 4
puppy_tf = 1
puppy_idf = math.log(n_docs / 1)

print(f' Tf-Ids value for word/term puppy: {puppy_tf*puppy_idf} But in real-life implementaion it is different.')


def smoothed_idf(doc_freq):
    """Return ln((1 + n_docs) / (1 + doc_freq)) + 1, the smoothed idf that
    scikit-learn documents for TfidfTransformer(smooth_idf=True)."""
    return math.log((1 + n_docs) / (1 + doc_freq)) + 1


# How the value in the matrix above comes about: every term of the row
# 'it is a puppy' gets tf * smoothed idf, then the row is scaled to unit L2
# length. 'a' is not in the fitted vocabulary, so it contributes nothing.
# Document frequencies in the toy corpus: 'is' -> 4, 'it' -> 3, 'puppy' -> 1.
row_weights = {
    'is': 1 * smoothed_idf(4),
    'it': 1 * smoothed_idf(3),
    'puppy': 1 * smoothed_idf(1),
}
row_norm = math.sqrt(sum(weight * weight for weight in row_weights.values()))
puppy_tfidf_sklearn = row_weights['puppy'] / row_norm

print(f' Smoothed, L2-normalised value for puppy: {puppy_tfidf_sklearn:.8f}')

 Tf-Ids value for word/term puppy: 1.3862943611198906 But in real-life implementaion it is different.


## Simple comparison

In [25]:
import json
import pandas as pd

### Prepare Data

In [104]:
# data.rar
# Load Yelp business data (one JSON object per line).
# The `with` block closes the file even if a line fails to parse, and
# iterating the handle directly avoids building the intermediate list of raw
# lines that readlines() creates. Blank lines are skipped because they are not
# valid JSON documents.
with open('data/yelp_academic_dataset_business.json', encoding='utf-8') as biz_f:
    biz_df = pd.DataFrame([json.loads(x) for x in biz_f if x.strip()])
biz_df.head()

Unnamed: 0,business_id,full_address,open,categories,city,review_count,name,neighborhoods,longitude,state,stars,latitude,type
0,rncjoVoEFUJGCUoC1JgnUA,"8466 W Peoria Ave\nSte 6\nPeoria, AZ 85345",True,"[Accountants, Professional Services, Tax Servi...",Peoria,3,Peoria Income Tax Service,[],-112.241596,AZ,5.0,33.581867,business
1,0FNFSzCFP_rGUoJx8W7tJg,"2149 W Wood Dr\nPhoenix, AZ 85029",True,"[Sporting Goods, Bikes, Shopping]",Phoenix,5,Bike Doctor,[],-112.105933,AZ,5.0,33.604054,business
2,3f_lyB6vFK48ukH6ScvLHg,"1134 N Central Ave\nPhoenix, AZ 85004",True,[],Phoenix,4,Valley Permaculture Alliance,[],-112.073933,AZ,5.0,33.460526,business
3,usAsSV36QmUej8--yvN-dg,"845 W Southern Ave\nPhoenix, AZ 85041",True,"[Food, Grocery]",Phoenix,5,Food City,[],-112.085377,AZ,3.5,33.39221,business
4,PzOqRohWw7F7YEPBz6AubA,"6520 W Happy Valley Rd\nSte 101\nGlendale Az, ...",True,"[Food, Bagels, Delis, Restaurants]",Glendale Az,14,Hot Bagels & Deli,[],-112.200264,AZ,3.5,33.712797,business


In [105]:
# Load Yelp reviews data (one JSON object per line).
# The `with` block closes the file even if a line fails to parse, and
# iterating the handle directly avoids building the intermediate list of raw
# lines that readlines() creates. Blank lines are skipped because they are not
# valid JSON documents.
with open('data/yelp_academic_dataset_review.json', encoding='utf-8') as review_file:
    review_df = pd.DataFrame([json.loads(x) for x in review_file if x.strip()])
review_df.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


In [123]:
# Peek at the raw review text column (the displayed Series is abbreviated).
review_df['text']

0         My wife took me here on my birthday for breakf...
1         I have no idea why some people give bad review...
2         love the gyro plate. Rice is so good and I als...
3         Rosie, Dakota, and I LOVE Chaparral Dog Park!!...
4         General Manager Scott Petello is a good egg!!!...
                                ...                        
229902    I really wanted to like this place because it'...
229903    My husband I stayed here for two nights.  Of c...
229904    Cool atmosphere. A lot of beers on tap and goo...
229905    I have to take a star off for the spotty servi...
229906                                         So cool, yo.
Name: text, Length: 229907, dtype: object

In [106]:
# Show the column names of both frames; `business_id` appears in each and is
# the key used for the join further down.
biz_df.columns, review_df.columns

(Index(['business_id', 'full_address', 'open', 'categories', 'city',
        'review_count', 'name', 'neighborhoods', 'longitude', 'state', 'stars',
        'latitude', 'type'],
       dtype='object'),
 Index(['votes', 'user_id', 'review_id', 'stars', 'date', 'text', 'type',
        'business_id'],
       dtype='object'))

In [50]:
# Inspect the per-business category lists that the filtering below relies on.
biz_df['categories']

0        [Accountants, Professional Services, Tax Servi...
1                        [Sporting Goods, Bikes, Shopping]
2                                                       []
3                                          [Food, Grocery]
4                       [Food, Bagels, Delis, Restaurants]
                               ...                        
11532                               [Mexican, Restaurants]
11533                               [Mexican, Restaurants]
11534                                      [Food, Grocery]
11535                  [Greek, Mediterranean, Restaurants]
11536                            [Print Media, Mass Media]
Name: categories, Length: 11537, dtype: object

In [107]:
# Pull out only Nightlife and Restaurants businesses.
# The membership test needs just the `categories` column, so Series.map is
# used instead of a row-wise DataFrame.apply(axis=1); astype(bool) keeps the
# mask a boolean indexer even when the frame is empty.
is_two_biz = biz_df['categories'].map(
    lambda cats: 'Nightlife' in cats or 'Restaurants' in cats
).astype(bool)
two_biz = biz_df[is_two_biz]

# Join with the reviews to get all reviews on the two types of business,
# then trim away the features we won't use.
two_biz_reviews = two_biz.merge(review_df, on='business_id', how='inner')[
    ['business_id', 'text', 'categories']
]

# Create the target column--True for Nightlife businesses, and False otherwise.
# assign() returns a new frame rather than writing a column into a slice.
two_biz_reviews = two_biz_reviews.assign(
    target=two_biz_reviews['categories'].map(lambda cats: 'Nightlife' in cats).astype(bool)
)

In [108]:
# Class balance of the target column (True = Nightlife review).
target_counts = two_biz_reviews['target'].value_counts()
print(target_counts)

False    135902
True      30136
Name: target, dtype: int64


In [109]:
# Split the reviews by category membership.
# NOTE(review): the two filters are independent, so a review of a business
# tagged with both categories lands in both frames.
has_nightlife = two_biz_reviews['categories'].map(lambda cats: 'Nightlife' in cats)
has_restaurants = two_biz_reviews['categories'].map(lambda cats: 'Restaurants' in cats)

nightlife = two_biz_reviews[has_nightlife]
restaurants = two_biz_reviews[has_restaurants]
print(f'nightlife : {nightlife.shape} \nrestaurants : {restaurants.shape}')

nightlife : (30136, 4) 
restaurants : (158430, 4)


In [110]:
# Fraction of the restaurant reviews that matches 90% of the nightlife
# reviews in size. The row counts are read from the frames instead of being
# copied by hand from the output above, so the value follows the data.
(len(nightlife) * .9) / len(restaurants)

0.17119484946032948

In [112]:
# Draw similarly sized random samples from both groups (90% of the nightlife
# reviews, about 17% of the restaurant reviews) with a fixed seed.
sample_seed = 123
nightlife_subset = nightlife.sample(random_state=sample_seed, frac=.9)
restaurant_subset = restaurants.sample(random_state=sample_seed, frac=0.1711)
print(nightlife_subset.shape, restaurant_subset.shape)

# Stack the two samples into the dataset used for modelling.
sampled_frames = [nightlife_subset, restaurant_subset]
combined = pd.concat(sampled_frames)
combined.shape

(27122, 4) (27107, 4)


(54229, 4)

### Data Split

In [113]:
from sklearn.model_selection import train_test_split

# 70% of the sampled reviews go to training and the rest to testing; the
# fixed seed makes the split repeatable.
training_data, test_data = train_test_split(
    combined,
    random_state=123,
    train_size=0.7,
)
(training_data.shape, test_data.shape)

((37960, 4), (16269, 4))

### Data Representation

In [114]:
# Represent the review text as a bag-of-words
bow_transform = CountVectorizer()
X_tr_bow = bow_transform.fit_transform(training_data['text'])
X_te_bow = bow_transform.transform(test_data['text'])

print(len(bow_transform.vocabulary_))

# target data
y_tr = training_data['target']
y_te = test_data['target']


# Create the tf-idf representation using the bag-of-words matrix
tfidf_trfm = TfidfTransformer(norm=None)
X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_trfm.transform(X_te_bow)

48504


### Applying Machine Learning

In [120]:
from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description, C=1.0, max_iter=100):
    """Train a logistic-regression classifier and print its score on test data.

    Parameters
    ----------
    X_tr, y_tr : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets passed to ``score``.
    description : label for the feature set, used only in the printed message.
    C : inverse regularization strength forwarded to ``LogisticRegression``.
        The default of 1.0 equals the estimator's own default, so existing
        calls behave as before.
    max_iter : solver iteration limit forwarded to ``LogisticRegression``.
        The default of 100 equals the estimator's own default; raise it if
        the solver does not converge on a wide feature matrix.

    Returns
    -------
    The fitted ``LogisticRegression`` model.
    """
    m = LogisticRegression(C=C, max_iter=max_iter).fit(X_tr, y_tr)
    s = m.score(X_test, y_test)
    print ('Test score with', description, 'features:', s)
    return m

# Baseline comparison: the same classifier trained on each representation.
m1 = simple_logistic_classify(
    X_tr_bow, y_tr,
    X_test=X_te_bow, y_test=y_te,
    description='bow',
)
m3 = simple_logistic_classify(
    X_tr_tfidf, y_tr,
    X_test=X_te_tfidf, y_test=y_te,
    description='tfidf',
)


Test score with bow features: 0.8241440776937734
Test score with tfidf features: 0.8127112914131169


### Fine-Tuning

In [121]:
import sklearn.model_selection as model

# Candidate values of C (inverse regularization strength), from 1e-5 to 1e2.
# Each feature set gets its own 5-fold grid search over these values.
prams =  [1e-5, 1e-4, 1e-3, 1e-1, 1e2]
param_grid_ = {'C': prams}


def _new_c_search():
    """Return an unfitted 5-fold grid search over C for logistic regression."""
    return model.GridSearchCV(LogisticRegression(), cv=5, param_grid=param_grid_, verbose=2, n_jobs=3)


# Tune the classifier on the bag-of-words representation.
print('using BoW:')
bow_search = _new_c_search()
bow_search.fit(X_tr_bow, y_tr)

# Tune the classifier on the tf-idf representation.
print('using tf-idf:')
tfidf_search = _new_c_search()
tfidf_search.fit(X_tr_tfidf, y_tr)

using BoW:
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:   31.0s finished


using tf-idf:
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  25 out of  25 | elapsed:   29.6s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=3,
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.1, 100.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [122]:
# Mean cross-validated score per candidate C, one column per representation.
bow_cv_scores = bow_search.cv_results_['mean_test_score']
tfidf_cv_scores = tfidf_search.cv_results_['mean_test_score']

search_results = pd.DataFrame({
    'bow': bow_cv_scores,
    'tfidf': tfidf_cv_scores,
    'Inverse of regularization strength (C)': prams,
})
search_results

Unnamed: 0,bow,tfidf,Inverse of regularization strength (C)
0,0.574737,0.706507,1e-05
1,0.678741,0.810774,0.0001
2,0.782929,0.83098,0.001
3,0.825922,0.819547,0.1
4,0.81628,0.800922,100.0
