# Classification Experiment: Mentions
---
This Notebook, includes a series of experiments, on using a node's Friends for classification.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tweepy
import time

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import joblib
import time 

In [12]:
twitter_credentials = []
with open('../../../../twitter_credentials.json', 'r') as f:
    twitter_credentials = json.load(f)

auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token_key'],twitter_credentials['access_token_secret'])
API = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, timeout=60*5)

In [3]:
# Function For Text Normalization
def clean_text(data):
    urls = r'http\S+'
    non_unicode_char = r'\W'
    numbers = r'[0-9_]'
    fix_whitespace = r'\s+'
    single_whitespace = ' '
    
    data = (data.replace([urls], single_whitespace, regex=True)
                    .replace([non_unicode_char, numbers], single_whitespace, regex=True)
                    .replace(fix_whitespace, single_whitespace, regex=True))
    data = data.apply(lambda s: s.lower() if type(s) == str else s)
    return data

# NLP Functions
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    row = [str(token) for token in nlp_el(row)]
    return [w for w in row if w not in STOPWORDS]

def tokenize_lemmatize(row):
    return [str(token.lemma_) for token in nlp_el(row)]

def tokenize_lemmatize_en(row):
    return [str(token.lemma_) for token in nlp_en(row)]

In [14]:
def get_text_data_hotel_nd(df):
    df['textdata'] = clean_text(df['name'] + ' ' + df['description'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    
    return df.textdata

def mentions_hotel_count(training_set):
    model_nd = joblib.load('../classifiers/classifier_hotel_nd.sav')
    counts = []
    mentions = training_set['recent_100_statuses'].str.findall(r'@\w+')
    
    for accs in tqdm(mentions):
        print(f'\nNode: {len(accs)}')
        users = []
        count = 0
        
        try:
            for acc in accs:
                users.append(API.get_user(screen_name=acc))
        except tweepy.TweepError as err:
            print(f'Error get_user: {err}')
            
    
        textdata = pd.DataFrame()         
        for user in users:
            textdata = textdata.append({'name':user.name, 'description':user.description}, ignore_index=True)
    
        if not textdata.empty:
            #Predict
            labels = model_nd.predict(textdata)
            textdata['labels'] = labels
                  
            #COUNT
            count = len(textdata[textdata['labels'] == 1])
        
        counts.append(count)
        print(f'Percentage: {count}')
                  
    return counts

## Calculate Hotel Followers Count

### Validation Set

In [5]:
validation_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-validation-set-enhanced.csv')
validation_set = validation_set.replace(np.nan, '')
validation_set.head()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,recent_100_statuses,hotel,friends_hotel_count_1000,followers_hotel_count_1000
0,SophiaSuites,Sophia Collection Santorini,"Luxury Suites, hotels and villas Santorini com...",513,41,127,"""The tans will fade but the memories will las...",1,4,15
1,AnthiMariaApart,AnthiMariaApartments,Anthi Maria Beach Apartments is a self-caterin...,102,25,110,Our fantastic New and Improved abc online web...,1,5,12
2,wEndowproject,wEndow project,WEndow Escape Resort & Villas | Tailor-made Ad...,350,344,103,https://t.co/DHuXrG8G6o For those who still d...,1,33,14
3,paphotels,paphotels,"The best of Greek hospitality! Follow us, visi...",975,1182,475,@AlbertBourla 💯💯💯💯💯 Happy Easter !!!🐣 @paphot...,1,67,58
4,medpalace,Mediterranean Palace,A cozy 5 star hotel in the city center with an...,269,543,381,https://t.co/WPCR6KSnw2 New era!\nNew Brand! ...,1,49,23


In [15]:
hotel_mentions = mentions_hotel_count(validation_set)

  0%|          | 0/50 [00:00<?, ?it/s]


Node: 25
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 1
Percentage: 0

Node: 22
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 14
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 102
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 1

Node: 2
Percentage: 0

Node: 5
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 115
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 1

Node: 0
Percentage: 0

Node: 16
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 101
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 2

Node: 55
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 20
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 8
Percentage: 0

Node: 0
P

In [17]:
validation_set['mention_hotel_count'] = hotel_mentions
validation_set.to_csv('../../../../datasets/Hotels/classification/hotels-validation-set-enhanced.csv', index=False)
del validation_set

### Training Set

In [18]:
training_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-training-set-enhanced.csv')
training_set = training_set.replace(np.nan, '')
training_set.head()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,recent_100_statuses,hotel,friends_hotel_count_1000,followers_hotel_count_1000
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,1832,1569,2229,Summer vacation is meant to make you feel ⛱ r...,1,83,116
1,AquaVistaHotels,Aqua Vista Hotels,A compilation of extraordinary hotels catering...,5924,1650,2116,Thank you Greek Travel Pages for highlighting...,1,118,99
2,Eurobank_Group,Eurobank,Καλωσήρθατε στην επίσημη σελίδα της Eurobank σ...,3284,0,2691,"Η Eurobank ενημερώνει ότι τα συστήματά της, κ...",0,0,10
3,white_suites,White Suites Resort,White Suites Resort is a luxury beach hotel in...,2,93,18,"Sea side holidays in Afytos, Halikidiki White...",1,6,7
4,KarenMillen,Karen Millen,"Timeless, elevated ready-to-wear style for women.",10908,1409,35679,The future's bright.\nhttps://t.co/XLpskBYi4u...,0,30,10


In [19]:
hotel_mentions = mentions_hotel_count(training_set)

  0%|          | 0/200 [00:00<?, ?it/s]


Node: 11
Percentage: 4

Node: 3
Percentage: 0

Node: 3
Percentage: 0

Node: 2
Percentage: 2

Node: 107
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 2

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 1
Percentage: 0

Node: 5
Error get_user: [{'code': 63, 'message': 'User has been suspended.'}]
Percentage: 1

Node: 29
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 1
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 5
Percentage: 0

Node: 0
Percentage: 0

Node: 1
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 94
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 4
Percentage: 0

Node: 15
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 1

Node: 64


Rate limit reached. Sleeping for: 367


Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 1
Percentage: 1

Node: 124
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 5

Node: 21
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 32
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 1

Node: 30
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 57
Percentage: 6

Node: 105
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 26
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 7

Node: 17
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 0
Percentage: 0

Node: 2
Percentage: 0

Node: 2
Percentage: 0

Node: 8
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 0
Percentage: 0

Node: 48
Percentage: 0

Node: 11
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage

Rate limit reached. Sleeping for: 440


Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 2
Percentage: 0

Node: 1
Percentage: 0

Node: 0
Percentage: 0

Node: 4
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 2
Percentage: 0

Node: 0
Percentage: 0

Node: 4
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 114
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 0
Percentage: 0

Node: 69
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 5

Node: 20
Percentage: 0

Node: 9
Percentage: 0

Node: 13
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 4
Percentage: 4

Node: 5
Percentage: 0

Node: 3
Percentage: 1

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 63
Error get_user: [{'code': 63, 'message': 'User has been suspended.'}]
Percentage: 0

Node: 137
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 

Rate limit reached. Sleeping for: 414


Percentage: 2

Node: 2
Percentage: 2

Node: 48
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 1

Node: 7
Error get_user: [{'code': 63, 'message': 'User has been suspended.'}]
Percentage: 1

Node: 0
Percentage: 0

Node: 4
Percentage: 0

Node: 1
Percentage: 0

Node: 49
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 1

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 0
Percentage: 0

Node: 102
Error get_user: [{'code': 63, 'message': 'User has been suspended.'}]
Percentage: 0

Node: 15
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 0
Percentage: 0

Node: 3
Percentage: 0

Node: 26
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 0

Node: 1
Percentage: 0

Node: 34
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 9

Node: 35
Error get_user: [{'code': 50, 'message': 'User not found.'}]
Percentage: 7

Node: 32
Error get_user: [{'code': 50, 'message': 'User n

In [21]:
training_set['mention_hotel_count'] = hotel_mentions
training_set.to_csv('../../../../datasets/Hotels/classification/hotels-training-set-enhanced.csv', index=False)
del training_set

# Dataset
---

In [22]:
# Read Training Set
training_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-training-set-enhanced.csv')
training_set = training_set.replace(np.nan, '')
training_set.tail()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,recent_100_statuses,hotel,friends_hotel_count_1000,followers_hotel_count_1000,mention_hotel_count
195,AlexanderHaus,Alexander Haus,"#Studio #Rooms to Let, in #Halkidiki, #Sithoni...",492,232,277,"Though is winter, summer is coming! https://t...",1,14,24,2
196,artsoundgr,ArtSound & Lights,Art Sound & Lights Professional Audio/Video Se...,443,497,191,Προσφορά STROBE 1500W DMX ARTLIGHT ST1500W μό...,0,13,5,0
197,JOHNMARYRHODES,JOHNMARY FALIRAKI,John Mary is a famly hotel and is located at F...,2,15,7,http://t.co/DNz6I3s0Sh http://t.co/9eqsMrL4MK,1,2,3,0
198,THEMETHOTEL,THE MET HOTEL,https://t.co/fi814NlnxK\r\nhttp://t.co/AlYUMI5...,2816,181,1136,Let the LOVE sparkle at The MET Hotel!!\n\n#T...,1,9,68,1
199,EvitaResort,SunConnect Evita,,25,24,37,Zumba time @evitaresort @evitaresort #sunconn...,1,1,3,12


# Only Mention Count

In [23]:
X = training_set
y = training_set.hotel

In [24]:
# Function to select the data
def get_data_(df):
    data = df['mention_hotel_count'].to_numpy()
    return data.reshape(-1,1)


get_data = FunctionTransformer(get_data_)


print('------------------ Support Vector Machine -------------------\n')

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('svm', svm.SVC())
])

# Paramters for optimization
parameters = {'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']}

grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('knn', KNeighborsClassifier())
])

# Paramters for optimization
parameters = {'knn__n_neighbors': [i for i in range(20)],
              'knn__weights': ['uniform', 'distance']}

grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

print('\n\n------------------ Logistic Regression -------------------\n')

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('lr', LogisticRegression(max_iter=1000))
])

# Paramters for optimization
parameters = {'lr__penalty': ['l1', 'l2', 'elasticnet'],
              'lr__C': [0.1, 0.5, 1, 5, 10]}

grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

------------------ Support Vector Machine -------------------

 Best Params: {'svm__C': 1, 'svm__kernel': 'rbf'}.
 Score: 0.605


------------------ kNN -------------------



 0.48  0.495 0.485 0.495 0.565 0.58  0.58  0.58  0.58  0.58  0.58  0.58
 0.575 0.58  0.5   0.495 0.495 0.495 0.5   0.495 0.49  0.495 0.5   0.495
 0.495 0.495 0.5   0.495]


 Best Params: {'knn__n_neighbors': 8, 'knn__weights': 'distance'}.
 Score: 0.5800000000000001


------------------ Logistic Regression -------------------

 Best Params: {'lr__C': 0.1, 'lr__penalty': 'l2'}.
 Score: 0.605


   nan 0.605   nan]


### Export Model

In [25]:
X = training_set
y = training_set.hotel

# Function to select the data
def get_data_(df):
    data = df['mention_hotel_count'].to_numpy()
    return data.reshape(-1,1)


get_data = FunctionTransformer(get_data_)

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('svm', svm.SVC(C=1, kernel='rbf'))
])

pipeline.fit(X,y)

Pipeline(steps=[('selector',
                 FunctionTransformer(func=<function get_data_ at 0x7fbb3e1923a0>)),
                ('svm', SVC(C=1))])

In [26]:
filename = '../classifiers/classifier_hotel_me.sav'
joblib.dump(pipeline, filename)

['../classifiers/classifier_hotel_me.sav']

# Statuses and Mentions

## Without NLP

In [27]:
train = training_set.copy()
train['textdata'] = clean_text(train['recent_100_statuses'])
X = train
y = train.hotel

In [28]:
def get_text_data_(df):
    
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)

def get_numeric_data_(df):
    data = df['mention_hotel_count'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.785 0.545 0.815 0.73  0.77  0.545 0.815 0.745 0.795 0.535 0.565 0.55
 0.83  0.535 0.73  0.575 0.835 0.535 0.79  0.61  0.79  0.545 0.81  0.725
 0.8   0.545 0.835 0.72  0.795 0.535 0.56  0.555 0.82  0.535 0.785 0.57
 0.83  0.535 0.79  0.635 0.8   0.545 0.79  0.725 0.795 0.55  0.83  0.725
 0.775 0.54  0.55  0.55  0.81  0.535 0.64  0.565 0.805 0.535 0.755 0.565
 0.78  0.545 0.785 0.74  0.78  0.545 0.8   0.735 0.79  0.535 0.56  0.55
 0.825 0.535 0.74  0.585 0.84  0.535 0.775 0.605 0.805 0.545 0.805 0.73
 0.8   0.545 0.825 0.72  0.795 0.535 0.56  0.555 0.82  0.535 0.785 0.57
 0.83  0.535 0.79  0.635 0.8   0.545 0.79  0.725 0.795 0.55  0.83  0.725
 0.705 0.54  0.54  0.54  0.81  0.535 0.595 0.56  0.83  0.535 0.71  0.56
 0.805 0.545 0.785 0.73  0.795 0.545 0.805 0.745 0.79  0.535 0.56  0.55
 0.825 0.535 0.74  0.59  0.84  0.535 0.775 0.605 0.805 0.545 0.805 0.73
 0.8   0.545 0.825 0.72  0.795 0.535 0.56  0.555 0.82  0.535 0.785 0.57
 0.83  0.535 0.79  0.635 0.8   0.545 0.79  0.725 0.795 0.55 

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 2000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 1, 'svm__kernel': 'linear'}.
 Score: 0.8399999999999999


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.675 0.675 0.665 0.675 0.67  0.665 0.67  0.67  0.685 0.685 0.655 0.685
 0.705 0.705 0.675 0.695 0.695 0.695 0.695 0.7   0.685 0.685 0.685 0.69
 0.675 0.67  0.675 0.68  0.685 0.685 0.66  0.685 0.72  0.72  0.695 0.71
 0.71  0.72  0.715 0.72  0.705 0.71  0.71  0.715 0.71  0.705 0.705 0.71
 0.64  0.64  0.62  0.64  0.65  0.65  0.64  0.645 0.61  0.61  0.61  0.615
 0.585 0.585 0.575 0.58  0.58  0.575 0.575 0.57  0.665 0.665 0.64  0.665
 0.69  0.69  0.65  0.67  0.675 0.675 0.68  0.685 0.685 0.685 0.69  0.695
 0.675 0.67  0.67  0.675 0.685 0.685 0.66  0.685 0.72  0.72  0.695 0.71
 0.71  0.72  0.715 0.72  0.705 0.71  0.71  0.715 0.71  0.705 0.705 0.71
 0.535 0.535 0.53  0.535 0.535 0.535 0.555 0.55  0.505 0.505 0.495 0.495
 0.5   0.5   0.5   0.5   0.505 0.5   0.5   0.495 0.665 0.665 0.64  0.665
 0.69  0.69  0.66  0.68  0.68  0.68  0.685 0.69  0.685 0.685 0.69  0.695
 0.675 0.67  0.67  0.675 0.685 0.685 0.66  0.685 0.72  0.72  0.695 0.71
 0.71  0.72  0.715 0.72  0.705 0.71  0.71  0.715 0.71  0.

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 10, 'knn__n_neighbors': 3, 'knn__weights': 'uniform'}.
 Score: 0.7449999999999999


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.835   nan 0.84    nan 0.825   nan 0.84    nan 0.785   nan 0.825
   nan 0.795   nan 0.825   nan 0.815   nan 0.825   nan 0.83    nan 0.825
   nan 0.83    nan 0.825   nan 0.79    nan 0.85    nan 0.79    nan 0.85
   nan 0.81    nan 0.85    nan 0.835   nan 0.85    nan 0.83    nan 0.85
   nan 0.76    nan 0.825   nan 0.795   nan 0.825   nan 0.81    nan 0.825
   nan 0.825   nan 0.825   nan 0.82    nan 0.825   nan 0.79    nan 0.825
   nan 0.79    nan 0.825   nan 0.82    nan 0.825   nan 0.825   nan 0.825
   nan 0.825   nan 0.825   nan 0.79    nan 0.85    nan 0.79    nan 0.85
   nan 0.81    nan 0.85    nan 0.835   nan 0.85    nan 0.83    nan 0.85
   nan 0.74    nan 0.82    nan 0.8     nan 0.82    nan 0.805   nan 0.82
   nan 0.825   nan 0.82    nan 0.83    nan 0.82    nan 0.79    nan 0.825
   nan 0.79    nan 0.825   nan 0.82    nan 0.825   nan 0.825   nan 0.825
   nan 0.825   nan 0.825   nan 0.79    nan 0.85    nan 0.79    nan 0.85
   nan 0.81    nan 0.85    nan 0.835   nan 0.85    nan 0.

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 10, 'lr__C': 0.1, 'lr__penalty': 'none'}.
 Score: 0.85




## Stop Word Removal

In [29]:
train = training_set.copy()
train['textdata'] = clean_text(train['recent_100_statuses'])
train['textdata'] = train['textdata'].apply(lambda row: remove_stopwords(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))

X = train
y = train.hotel

In [30]:
def get_text_data_(df):
    
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df['mention_hotel_count'].to_numpy()
    return data.reshape(-1,1)


get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.835 0.545 0.81  0.705 0.79  0.545 0.82  0.755 0.735 0.535 0.54  0.54
 0.855 0.535 0.61  0.565 0.85  0.535 0.74  0.56  0.825 0.545 0.825 0.715
 0.775 0.55  0.825 0.745 0.795 0.535 0.54  0.545 0.85  0.535 0.655 0.565
 0.85  0.535 0.775 0.575 0.805 0.55  0.83  0.725 0.795 0.55  0.82  0.745
 0.58  0.54  0.54  0.54  0.84  0.535 0.585 0.56  0.845 0.535 0.63  0.565
 0.84  0.545 0.815 0.69  0.79  0.545 0.825 0.73  0.705 0.535 0.54  0.54
 0.85  0.535 0.605 0.565 0.85  0.535 0.725 0.56  0.815 0.545 0.82  0.71
 0.775 0.545 0.825 0.74  0.795 0.535 0.54  0.545 0.85  0.535 0.655 0.565
 0.85  0.535 0.775 0.575 0.805 0.55  0.83  0.725 0.795 0.55  0.82  0.745
 0.565 0.54  0.54  0.54  0.855 0.535 0.575 0.56  0.84  0.535 0.625 0.56
 0.835 0.545 0.8   0.685 0.71  0.545 0.815 0.73  0.705 0.535 0.54  0.54
 0.85  0.535 0.605 0.565 0.85  0.535 0.725 0.56  0.815 0.545 0.82  0.71
 0.775 0.545 0.825 0.74  0.795 0.535 0.54  0.545 0.85  0.535 0.655 0.565
 0.85  0.535 0.775 0.575 0.805 0.55  0.83  0.725 0.795 0.

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 0.5, 'svm__kernel': 'linear'}.
 Score: 0.86


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.505 0.505 0.5   0.505 0.5   0.495 0.5   0.5   0.585 0.585 0.545 0.585
 0.505 0.51  0.465 0.485 0.53  0.53  0.53  0.535 0.5   0.5   0.5   0.505
 0.5   0.495 0.495 0.5   0.605 0.605 0.57  0.605 0.605 0.615 0.585 0.595
 0.565 0.58  0.58  0.58  0.525 0.535 0.535 0.54  0.525 0.53  0.52  0.525
 0.49  0.49  0.475 0.49  0.49  0.49  0.48  0.475 0.48  0.48  0.48  0.485
 0.495 0.495 0.485 0.49  0.5   0.495 0.5   0.495 0.525 0.525 0.485 0.525
 0.505 0.505 0.485 0.51  0.505 0.505 0.51  0.515 0.495 0.495 0.495 0.505
 0.5   0.495 0.495 0.5   0.605 0.605 0.57  0.605 0.605 0.615 0.585 0.595
 0.565 0.58  0.58  0.58  0.525 0.535 0.535 0.54  0.525 0.53  0.52  0.525
 0.5   0.5   0.51  0.5   0.515 0.515 0.49  0.505 0.505 0.505 0.5   0.5
 0.495 0.495 0.495 0.495 0.5   0.495 0.5   0.49  0.525 0.525 0.485 0.525
 0.505 0.505 0.485 0.51  0.505 0.505 0.51  0.515 0.495 0.495 0.495 0.505
 0.5   0.495 0.495 0.5   0.605 0.605 0.57  0.605 0.605 0.615 0.585 0.595
 0.565 0.58  0.58  0.58  0.525 0.535 0.535 0.54  0.52

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 10, 'knn__n_neighbors': 3, 'knn__weights': 'distance'}.
 Score: 0.615


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.845   nan 0.84    nan 0.84    nan 0.84    nan 0.765   nan 0.86
   nan 0.82    nan 0.86    nan 0.83    nan 0.86    nan 0.845   nan 0.86
   nan 0.845   nan 0.86    nan 0.785   nan 0.845   nan 0.82    nan 0.845
   nan 0.835   nan 0.845   nan 0.84    nan 0.845   nan 0.85    nan 0.845
   nan 0.725   nan 0.83    nan 0.815   nan 0.83    nan 0.83    nan 0.83
   nan 0.84    nan 0.83    nan 0.845   nan 0.83    nan 0.745   nan 0.87
   nan 0.82    nan 0.87    nan 0.83    nan 0.87    nan 0.845   nan 0.87
   nan 0.87    nan 0.87    nan 0.785   nan 0.845   nan 0.82    nan 0.845
   nan 0.835   nan 0.845   nan 0.84    nan 0.845   nan 0.85    nan 0.845
   nan 0.68    nan 0.835   nan 0.815   nan 0.835   nan 0.83    nan 0.835
   nan 0.825   nan 0.835   nan 0.835   nan 0.835   nan 0.745   nan 0.87
   nan 0.82    nan 0.87    nan 0.83    nan 0.87    nan 0.845   nan 0.87
   nan 0.87    nan 0.87    nan 0.785   nan 0.845   nan 0.82    nan 0.845
   nan 0.835   nan 0.845   nan 0.84    nan 0.845   nan 0.8

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 2000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 10, 'lr__penalty': 'l2'}.
 Score: 0.8800000000000001


## Lemmatization and Stop Word Removal

In [31]:
train = training_set.copy()
train['textdata'] = clean_text(train['recent_100_statuses'])
train['textdata'] = train['textdata'].apply(lambda row: tokenize_lemmatize(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
train['textdata'] = train['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
train['textdata'] = train['textdata'].apply(lambda row: remove_stopwords(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
X = train
y = train.hotel

In [32]:
def get_text_data_(df):
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df['mention_hotel_count'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.825 0.545 0.83  0.71  0.77  0.545 0.825 0.75  0.755 0.535 0.54  0.54
 0.865 0.535 0.62  0.565 0.865 0.535 0.745 0.56  0.82  0.545 0.83  0.72
 0.76  0.55  0.84  0.745 0.8   0.535 0.55  0.55  0.865 0.535 0.655 0.565
 0.855 0.535 0.79  0.595 0.805 0.545 0.825 0.73  0.8   0.55  0.825 0.745
 0.59  0.54  0.54  0.54  0.845 0.535 0.585 0.56  0.85  0.535 0.645 0.565
 0.83  0.545 0.825 0.7   0.78  0.545 0.83  0.74  0.735 0.535 0.54  0.54
 0.855 0.535 0.61  0.565 0.865 0.535 0.74  0.56  0.825 0.545 0.825 0.715
 0.77  0.545 0.835 0.74  0.8   0.535 0.55  0.55  0.865 0.535 0.655 0.565
 0.855 0.535 0.79  0.595 0.805 0.545 0.825 0.73  0.8   0.55  0.825 0.745
 0.575 0.54  0.54  0.54  0.855 0.535 0.58  0.56  0.845 0.535 0.625 0.565
 0.825 0.545 0.82  0.68  0.7   0.545 0.82  0.75  0.735 0.535 0.54  0.54
 0.855 0.535 0.61  0.565 0.865 0.535 0.74  0.56  0.825 0.545 0.825 0.715
 0.77  0.545 0.835 0.74  0.8   0.535 0.55  0.55  0.865 0.535 0.655 0.565
 0.855 0.535 0.79  0.595 0.805 0.545 0.825 0.73  0.8   

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 0.5, 'svm__kernel': 'linear'}.
 Score: 0.865


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.485 0.485 0.49  0.495 0.5   0.495 0.49  0.49  0.575 0.575 0.555 0.575
 0.515 0.52  0.51  0.53  0.505 0.505 0.52  0.53  0.5   0.5   0.505 0.51
 0.5   0.495 0.5   0.5   0.57  0.57  0.555 0.57  0.575 0.585 0.57  0.585
 0.54  0.555 0.535 0.535 0.505 0.505 0.505 0.51  0.51  0.505 0.51  0.515
 0.49  0.49  0.48  0.49  0.485 0.485 0.475 0.47  0.5   0.5   0.485 0.49
 0.5   0.5   0.495 0.495 0.5   0.495 0.5   0.495 0.52  0.52  0.495 0.52
 0.49  0.49  0.495 0.5   0.485 0.485 0.49  0.495 0.49  0.49  0.49  0.5
 0.5   0.495 0.5   0.5   0.57  0.57  0.555 0.57  0.575 0.585 0.57  0.585
 0.54  0.555 0.535 0.535 0.505 0.505 0.505 0.51  0.51  0.505 0.51  0.515
 0.5   0.5   0.505 0.5   0.52  0.52  0.5   0.505 0.505 0.505 0.49  0.495
 0.5   0.5   0.495 0.495 0.5   0.495 0.5   0.49  0.52  0.52  0.495 0.52
 0.49  0.49  0.495 0.5   0.485 0.485 0.49  0.495 0.49  0.49  0.49  0.5
 0.5   0.495 0.5   0.5   0.57  0.57  0.555 0.57  0.575 0.585 0.57  0.585
 0.54  0.555 0.535 0.535 0.505 0.505 0.505 0.51  0.51  0.50

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 1, 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}.
 Score: 0.62


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.845   nan 0.845   nan 0.84    nan 0.845   nan 0.775   nan 0.885
   nan 0.83    nan 0.885   nan 0.845   nan 0.885   nan 0.845   nan 0.885
   nan 0.855   nan 0.885   nan 0.79    nan 0.875   nan 0.83    nan 0.875
   nan 0.845   nan 0.875   nan 0.84    nan 0.875   nan 0.86    nan 0.875
   nan 0.735   nan 0.84    nan 0.825   nan 0.84    nan 0.835   nan 0.84
   nan 0.84    nan 0.84    nan 0.84    nan 0.84    nan 0.77    nan 0.875
   nan 0.825   nan 0.875   nan 0.84    nan 0.875   nan 0.845   nan 0.875
   nan 0.855   nan 0.875   nan 0.79    nan 0.875   nan 0.83    nan 0.875
   nan 0.845   nan 0.875   nan 0.84    nan 0.875   nan 0.86    nan 0.875
   nan 0.71    nan 0.825   nan 0.83    nan 0.825   nan 0.83    nan 0.825
   nan 0.835   nan 0.825   nan 0.845   nan 0.825   nan 0.77    nan 0.875
   nan 0.825   nan 0.875   nan 0.84    nan 0.875   nan 0.845   nan 0.875
   nan 0.855   nan 0.875   nan 0.79    nan 0.875   nan 0.83    nan 0.875
   nan 0.845   nan 0.875   nan 0.84    nan 0.875   n

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 0.1, 'lr__penalty': 'none'}.
 Score: 0.885




### Export Model

In [34]:
X = training_set
y = training_set.hotel


def get_text_data_hotel_tme(df):
    df = df.copy()
    df['textdata'] = clean_text(df['recent_100_statuses'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_hotel_tme)


def get_numeric_data_hotel_tme(df):
    data = df['mention_hotel_count'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_numeric_data_hotel_tme)


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer(max_df=0.5, max_features=1000, min_df=5)),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000, penalty='none'))
])


pipeline.fit(X, y)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector_num',
                                                                  FunctionTransformer(func=<function get_numeric_data_hotel_tme at 0x7fbb38dbf670>))])),
                                                ('text_features',
                                                 Pipeline(steps=[('selector_text',
                                                                  FunctionTransformer(func=<function get_text_data_hotel_tme at 0x7fbb36975790>)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(max_df=0.5,
                                                                                  max_features=1000,
                                                                                  min_df=5))]))]

In [35]:
filename = 'classifier_hotel_tme.sav'
joblib.dump(pipeline, filename)

['classifier_hotel_tme.sav']

# Name Description Tweets and Counts

## Without NLP

In [36]:
train = training_set.copy()
train['textdata'] = clean_text(train['name'] + ' ' + train['description'] + ' ' + train['recent_100_statuses'])
X = train
y = train.hotel

In [37]:
def get_text_data_(df):
    
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)

def get_numeric_data_(df):
    data = df[['friends_hotel_count_1000', 'followers_hotel_count_1000', 'mention_hotel_count']].to_numpy()
    return data

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.82  0.68  0.69  0.575 0.805 0.675 0.69  0.575 0.745 0.65  0.69  0.635
 0.88  0.67  0.69  0.5   0.88  0.67  0.69  0.5   0.845 0.68  0.69  0.575
 0.81  0.67  0.69  0.575 0.765 0.65  0.69  0.635 0.87  0.67  0.69  0.5
 0.87  0.67  0.69  0.5   0.83  0.68  0.69  0.575 0.81  0.67  0.69  0.575
 0.725 0.65  0.69  0.635 0.865 0.67  0.69  0.5   0.865 0.67  0.69  0.5
 0.825 0.68  0.69  0.575 0.805 0.675 0.69  0.575 0.74  0.65  0.69  0.635
 0.87  0.67  0.69  0.5   0.88  0.67  0.69  0.5   0.845 0.68  0.69  0.575
 0.835 0.67  0.69  0.575 0.765 0.65  0.69  0.635 0.87  0.67  0.69  0.5
 0.87  0.67  0.69  0.5   0.83  0.68  0.69  0.575 0.81  0.67  0.69  0.575
 0.705 0.65  0.69  0.635 0.86  0.67  0.69  0.5   0.875 0.67  0.69  0.5
 0.87  0.68  0.69  0.575 0.87  0.675 0.69  0.575 0.74  0.65  0.69  0.635
 0.87  0.67  0.69  0.5   0.875 0.67  0.69  0.5   0.85  0.68  0.69  0.575
 0.835 0.67  0.69  0.575 0.765 0.65  0.69  0.635 0.87  0.67  0.69  0.5
 0.87  0.67  0.69  0.5   0.83  0.68  0.69  0.575 0.81  0.67  

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 5, 'svm__kernel': 'linear'}.
 Score: 0.8800000000000001


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.735 0.735 0.765 0.74  0.76  0.755 0.765 0.74  0.645 0.645 0.675 0.68
 0.745 0.73  0.71  0.74  0.735 0.74  0.745 0.745 0.75  0.75  0.765 0.745
 0.76  0.755 0.77  0.74  0.64  0.64  0.675 0.68  0.745 0.73  0.715 0.745
 0.745 0.75  0.75  0.75  0.75  0.75  0.765 0.76  0.76  0.755 0.77  0.745
 0.635 0.635 0.675 0.67  0.74  0.725 0.71  0.73  0.73  0.735 0.755 0.73
 0.755 0.755 0.765 0.745 0.76  0.755 0.77  0.74  0.645 0.645 0.68  0.68
 0.745 0.73  0.71  0.745 0.74  0.745 0.755 0.74  0.76  0.76  0.77  0.755
 0.76  0.755 0.77  0.74  0.64  0.64  0.675 0.68  0.745 0.73  0.715 0.745
 0.745 0.75  0.75  0.75  0.75  0.75  0.765 0.76  0.76  0.755 0.77  0.745
 0.67  0.67  0.665 0.675 0.74  0.73  0.715 0.73  0.75  0.76  0.75  0.74
 0.76  0.76  0.775 0.76  0.77  0.77  0.77  0.755 0.645 0.645 0.68  0.68
 0.74  0.725 0.715 0.74  0.735 0.74  0.755 0.74  0.76  0.76  0.77  0.755
 0.76  0.755 0.77  0.74  0.64  0.64  0.675 0.68  0.745 0.73  0.715 0.745
 0.745 0.75  0.75  0.75  0.75  0.75  0.765 0.76  0.76  0

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'knn__n_neighbors': 10, 'knn__weights': 'uniform'}.
 Score: 0.78


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.855   nan 0.85    nan 0.86    nan 0.85    nan 0.735   nan 0.86
   nan 0.825   nan 0.86    nan 0.875   nan 0.86    nan 0.87    nan 0.86
   nan 0.865   nan 0.86    nan 0.735   nan 0.855   nan 0.825   nan 0.855
   nan 0.865   nan 0.855   nan 0.86    nan 0.855   nan 0.855   nan 0.855
   nan 0.725   nan 0.83    nan 0.8     nan 0.83    nan 0.84    nan 0.83
   nan 0.87    nan 0.83    nan 0.855   nan 0.83    nan 0.735   nan 0.855
   nan 0.815   nan 0.855   nan 0.86    nan 0.855   nan 0.865   nan 0.855
   nan 0.86    nan 0.855   nan 0.735   nan 0.855   nan 0.825   nan 0.855
   nan 0.865   nan 0.855   nan 0.86    nan 0.855   nan 0.855   nan 0.855
   nan 0.705   nan 0.835   nan 0.77    nan 0.835   nan 0.805   nan 0.835
   nan 0.875   nan 0.835   nan 0.865   nan 0.835   nan 0.735   nan 0.865
   nan 0.815   nan 0.865   nan 0.86    nan 0.865   nan 0.865   nan 0.865
   nan 0.86    nan 0.865   nan 0.735   nan 0.855   nan 0.825   nan 0.855
   nan 0.865   nan 0.855   nan 0.86    nan 0.855   nan

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 1, 'lr__penalty': 'l2'}.
 Score: 0.875


## Stop Word Removal

In [38]:
train = training_set.copy()
train['textdata'] = clean_text(train['name'] + ' ' + train['description'] + ' ' + train['recent_100_statuses'])
train['textdata'] = train['textdata'].apply(lambda row: remove_stopwords(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))

X = train
y = train.hotel

In [39]:
def get_text_data_(df):
    
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df[['friends_hotel_count_1000', 'followers_hotel_count_1000', 'mention_hotel_count']].to_numpy()
    return data


get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.865 0.68  0.69  0.575 0.845 0.675 0.69  0.575 0.735 0.65  0.69  0.635
 0.885 0.67  0.69  0.5   0.89  0.67  0.69  0.5   0.86  0.68  0.69  0.575
 0.83  0.675 0.69  0.575 0.755 0.65  0.69  0.635 0.88  0.67  0.69  0.5
 0.885 0.67  0.69  0.5   0.855 0.68  0.69  0.575 0.835 0.675 0.69  0.575
 0.73  0.65  0.69  0.635 0.85  0.67  0.69  0.5   0.865 0.67  0.69  0.5
 0.865 0.68  0.69  0.575 0.85  0.675 0.69  0.575 0.735 0.65  0.69  0.635
 0.87  0.67  0.69  0.5   0.89  0.67  0.69  0.5   0.87  0.68  0.69  0.495
 0.845 0.675 0.69  0.495 0.755 0.65  0.69  0.635 0.88  0.67  0.69  0.5
 0.885 0.67  0.69  0.5   0.855 0.68  0.69  0.575 0.835 0.675 0.69  0.575
 0.7   0.65  0.69  0.635 0.85  0.67  0.69  0.5   0.87  0.67  0.69  0.5
 0.86  0.68  0.69  0.495 0.86  0.675 0.69  0.495 0.735 0.65  0.69  0.635
 0.87  0.67  0.69  0.5   0.89  0.67  0.69  0.5   0.87  0.68  0.69  0.495
 0.845 0.675 0.69  0.495 0.755 0.65  0.69  0.635 0.88  0.67  0.69  0.5
 0.885 0.67  0.69  0.5   0.855 0.68  0.69  0.575 0.835 0.675 

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 1, 'svm__kernel': 'linear'}.
 Score: 0.89


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.745 0.745 0.765 0.75  0.755 0.745 0.77  0.745 0.635 0.635 0.67  0.67
 0.73  0.715 0.715 0.745 0.735 0.74  0.75  0.73  0.75  0.75  0.77  0.75
 0.765 0.755 0.765 0.74  0.645 0.645 0.67  0.68  0.735 0.72  0.715 0.74
 0.73  0.735 0.75  0.73  0.745 0.745 0.765 0.745 0.75  0.74  0.77  0.745
 0.635 0.635 0.67  0.675 0.72  0.705 0.7   0.72  0.735 0.74  0.755 0.73
 0.755 0.755 0.76  0.755 0.76  0.75  0.765 0.745 0.63  0.63  0.675 0.66
 0.74  0.725 0.715 0.745 0.74  0.745 0.755 0.735 0.755 0.755 0.765 0.755
 0.76  0.75  0.765 0.74  0.645 0.645 0.67  0.68  0.735 0.72  0.715 0.74
 0.73  0.735 0.75  0.73  0.745 0.745 0.765 0.745 0.75  0.74  0.77  0.745
 0.67  0.67  0.68  0.68  0.735 0.725 0.71  0.725 0.745 0.755 0.75  0.74
 0.75  0.755 0.765 0.75  0.76  0.755 0.77  0.745 0.63  0.63  0.675 0.66
 0.74  0.725 0.715 0.745 0.74  0.745 0.755 0.735 0.755 0.755 0.765 0.755
 0.76  0.75  0.765 0.74  0.645 0.645 0.67  0.68  0.735 0.72  0.715 0.74
 0.73  0.735 0.75  0.73  0.745 0.745 0.765 0.745 0.75  0.74 

 Best Params: {'features__text_features__vectorizer__max_df': 1, 'features__text_features__vectorizer__max_features': None, 'features__text_features__vectorizer__min_df': 1, 'knn__n_neighbors': 10, 'knn__weights': 'uniform'}.
 Score: 0.775


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.875   nan 0.815   nan 0.87    nan 0.815   nan 0.725   nan 0.875
   nan 0.82    nan 0.875   nan 0.87    nan 0.875   nan 0.88    nan 0.875
   nan 0.885   nan 0.875   nan 0.735   nan 0.87    nan 0.85    nan 0.87
   nan 0.865   nan 0.87    nan 0.885   nan 0.87    nan 0.885   nan 0.87
   nan 0.715   nan 0.845   nan 0.78    nan 0.845   nan 0.835   nan 0.845
   nan 0.865   nan 0.845   nan 0.86    nan 0.845   nan 0.725   nan 0.885
   nan 0.815   nan 0.885   nan 0.87    nan 0.885   nan 0.875   nan 0.885
   nan 0.885   nan 0.885   nan 0.735   nan 0.87    nan 0.85    nan 0.87
   nan 0.865   nan 0.87    nan 0.885   nan 0.87    nan 0.885   nan 0.87
   nan 0.705   nan 0.855   nan 0.75    nan 0.855   nan 0.81    nan 0.855
   nan 0.865   nan 0.855   nan 0.865   nan 0.855   nan 0.725   nan 0.885
   nan 0.815   nan 0.885   nan 0.87    nan 0.885   nan 0.875   nan 0.885
   nan 0.885   nan 0.885   nan 0.735   nan 0.87    nan 0.85    nan 0.87
   nan 0.865   nan 0.87    nan 0.885   nan 0.87    nan 0

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 10, 'lr__penalty': 'l2'}.
 Score: 0.885


## Lemmatization and Stop Word Removal

In [40]:
train = training_set.copy()
train['textdata'] = clean_text(train['name'] + ' ' + train['description'] + ' ' + train['recent_100_statuses'])
train['textdata'] = train['textdata'].apply(lambda row: tokenize_lemmatize(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
train['textdata'] = train['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
train['textdata'] = train['textdata'].apply(lambda row: remove_stopwords(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
X = train
y = train.hotel

In [41]:
def get_text_data_(df):
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df[['friends_hotel_count_1000', 'followers_hotel_count_1000', 'mention_hotel_count']].to_numpy()
    return data

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.855 0.68  0.69  0.575 0.835 0.675 0.69  0.575 0.75  0.65  0.69  0.635
 0.89  0.67  0.69  0.5   0.89  0.67  0.69  0.5   0.855 0.68  0.69  0.575
 0.845 0.675 0.69  0.575 0.765 0.65  0.69  0.635 0.885 0.67  0.69  0.5
 0.885 0.67  0.69  0.5   0.86  0.68  0.69  0.575 0.84  0.675 0.69  0.575
 0.735 0.65  0.69  0.635 0.865 0.67  0.69  0.5   0.865 0.67  0.69  0.5
 0.865 0.68  0.69  0.575 0.865 0.675 0.69  0.575 0.745 0.65  0.69  0.635
 0.875 0.67  0.69  0.5   0.89  0.67  0.69  0.5   0.855 0.68  0.69  0.575
 0.84  0.675 0.69  0.575 0.765 0.65  0.69  0.635 0.885 0.67  0.69  0.5
 0.885 0.67  0.69  0.5   0.86  0.68  0.69  0.575 0.84  0.675 0.69  0.575
 0.71  0.65  0.69  0.635 0.86  0.67  0.69  0.5   0.865 0.67  0.69  0.5
 0.865 0.68  0.69  0.495 0.865 0.675 0.69  0.495 0.745 0.65  0.69  0.635
 0.875 0.67  0.69  0.5   0.89  0.67  0.69  0.5   0.855 0.68  0.69  0.575
 0.84  0.675 0.69  0.575 0.765 0.65  0.69  0.635 0.885 0.67  0.69  0.5
 0.885 0.67  0.69  0.5   0.86  0.68  0.69  0.575 0.84  0.675 

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 0.5, 'svm__kernel': 'linear'}.
 Score: 0.89


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.745 0.745 0.76  0.745 0.76  0.75  0.77  0.74  0.635 0.635 0.675 0.675
 0.745 0.73  0.71  0.74  0.74  0.745 0.755 0.74  0.75  0.75  0.76  0.745
 0.76  0.75  0.77  0.74  0.65  0.65  0.665 0.68  0.735 0.72  0.71  0.73
 0.73  0.735 0.75  0.73  0.75  0.75  0.76  0.745 0.76  0.75  0.775 0.755
 0.635 0.635 0.67  0.67  0.73  0.715 0.71  0.73  0.73  0.735 0.75  0.725
 0.75  0.75  0.765 0.745 0.755 0.745 0.77  0.735 0.64  0.64  0.68  0.68
 0.745 0.73  0.71  0.735 0.73  0.735 0.75  0.73  0.75  0.75  0.765 0.75
 0.755 0.745 0.77  0.74  0.65  0.65  0.665 0.68  0.735 0.72  0.71  0.73
 0.73  0.735 0.75  0.73  0.75  0.75  0.76  0.745 0.76  0.75  0.775 0.755
 0.655 0.655 0.675 0.665 0.745 0.735 0.72  0.73  0.745 0.755 0.75  0.74
 0.75  0.755 0.765 0.75  0.755 0.75  0.77  0.745 0.64  0.64  0.68  0.68
 0.745 0.73  0.71  0.735 0.73  0.735 0.75  0.73  0.75  0.75  0.765 0.75
 0.755 0.745 0.77  0.74  0.65  0.65  0.665 0.68  0.735 0.72  0.71  0.73
 0.73  0.735 0.75  0.73  0.75  0.75  0.76  0.745 0.76  0.75

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 10, 'knn__n_neighbors': 10, 'knn__weights': 'uniform'}.
 Score: 0.775


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.885   nan 0.815   nan 0.88    nan 0.815   nan 0.725   nan 0.85
   nan 0.825   nan 0.85    nan 0.865   nan 0.85    nan 0.88    nan 0.85
   nan 0.89    nan 0.85    nan 0.735   nan 0.85    nan 0.845   nan 0.85
   nan 0.87    nan 0.85    nan 0.875   nan 0.85    nan 0.88    nan 0.85
   nan 0.72    nan 0.805   nan 0.8     nan 0.805   nan 0.83    nan 0.805
   nan 0.87    nan 0.805   nan 0.88    nan 0.805   nan 0.725   nan 0.86
   nan 0.825   nan 0.86    nan 0.865   nan 0.86    nan 0.88    nan 0.86
   nan 0.88    nan 0.86    nan 0.735   nan 0.85    nan 0.845   nan 0.85
   nan 0.87    nan 0.85    nan 0.875   nan 0.85    nan 0.88    nan 0.85
   nan 0.705   nan 0.865   nan 0.765   nan 0.865   nan 0.82    nan 0.865
   nan 0.865   nan 0.865   nan 0.87    nan 0.865   nan 0.725   nan 0.86
   nan 0.825   nan 0.86    nan 0.865   nan 0.86    nan 0.88    nan 0.86
   nan 0.88    nan 0.86    nan 0.735   nan 0.85    nan 0.845   nan 0.85
   nan 0.87    nan 0.85    nan 0.875   nan 0.85    nan 0.88   

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 10, 'lr__penalty': 'l2'}.
 Score: 0.89


### Export Model

In [43]:
X = training_set
y = training_set.hotel


def get_text_data_(df):
    df = df.copy()
    df['textdata'] = clean_text(df['name'] + ' ' + df['description'] + ' ' + df['recent_100_statuses'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df[['friends_hotel_count_1000', 'followers_hotel_count_1000', 'mention_hotel_count']].to_numpy()
    return data

get_numeric_data = FunctionTransformer(get_numeric_data_)


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer(max_df=0.5, max_features=1000, min_df=5)),
            ]))
         ])),
     ('svm', svm.SVC(C=0.5, kernel='linear'))
])


pipeline.fit(X, y)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector_num',
                                                                  FunctionTransformer(func=<function get_numeric_data_ at 0x7fbb3c8e9550>))])),
                                                ('text_features',
                                                 Pipeline(steps=[('selector_text',
                                                                  FunctionTransformer(func=<function get_text_data_ at 0x7fbb3c8e94c0>)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(max_df=0.5,
                                                                                  max_features=1000,
                                                                                  min_df=5))]))])),
              

In [44]:
filename = 'classifier_hotel_ndtfrfome.sav'
joblib.dump(pipeline, filename)

['classifier_hotel_ndtfrfome.sav']