# Classification Experiment: Friends
---
This Notebook, includes a series of experiments, on using a node's Friends for classification.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tweepy
import time

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import joblib
import time 

In [2]:
twitter_credentials = []
with open('../../../../twitter_credentials.json', 'r') as f:
    twitter_credentials = json.load(f)

auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token_key'],twitter_credentials['access_token_secret'])
API = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, timeout=60*5)

In [3]:
# Function For Text Normalization
def clean_text(data):
    urls = r'http\S+'
    non_unicode_char = r'\W'
    numbers = r'[0-9_]'
    fix_whitespace = r'\s+'
    single_whitespace = ' '
    
    data = (data.replace([urls], single_whitespace, regex=True)
                    .replace([non_unicode_char, numbers], single_whitespace, regex=True)
                    .replace(fix_whitespace, single_whitespace, regex=True))
    data = data.apply(lambda s: s.lower() if type(s) == str else s)
    return data

# NLP Functions
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    row = [str(token) for token in nlp_el(row)]
    return [w for w in row if w not in STOPWORDS]

def tokenize_lemmatize(row):
    return [str(token.lemma_) for token in nlp_el(row)]

def tokenize_lemmatize_en(row):
    return [str(token.lemma_) for token in nlp_en(row)]

In [4]:
def get_text_data_hotel_nd(df):
    df['textdata'] = clean_text(df['name'] + ' ' + df['description'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    
    return df.textdata

def fetch_friends(node, count=1000):
    # Fetch friend IDs
    friend_ids = []
    try:
        for ids in tweepy.Cursor(API.friends_ids, node).items(count):
            friend_ids.append(ids)

    except tweepy.error.TweepError as err:
        return pd.DataFrame(columns=['name', 'description'])

    except Exception as err:
        raise Exception(f'An unknown Error has occurred.\n{err}')

    # If node has zero friends
    if not friend_ids:
        return pd.DataFrame(columns=['name', 'description'])

    # Calculate Iteration Required, to iterate per 100 ids
    if (int(len(friend_ids)) % 100) == 0:
        it_num = int(len(friend_ids) / 100)
    else:
        it_num = (int(len(friend_ids) / 100) + 1)

    # Transform IDs to User Objects
    users = list()
    try:
        for i in range(it_num):
            users.append(API.lookup_users(friend_ids[100 * i: 100 * (1 + i)]))
    except Exception as err:
        raise Exception(f'An unknown Error has occurred.\n{err}')

    # Extract Profile Name and Description for each friends and save it to a DataFrame
    results = pd.DataFrame()
    for items in users:
        for user in items:
            results = results.append(pd.DataFrame([user.name, user.description]).T)
    results.columns = ['name', 'description']
    results = results.reset_index().drop('index', axis=1)

    return results

def calculate_hotel_friends(nodes):

    counts = []
    for node in tqdm(nodes, leave=False):
        # Get Required Data
        data = fetch_friends(node=node, count=1000)

        # Get Labels
        if not data.empty:
            model_nd = joblib.load('../classifiers/classifier_hotel_nd.sav')
            data['label'] = model_nd.predict(data)

            count = len(data[data['label'] == 1])
        else:
            count = 0

        counts.append(count)

    return counts

## Calculate Hotel Friends Count

### Validation Set

In [21]:
validation_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-validation-set.csv')
validation_set = validation_set.replace(np.nan, '')
validation_set.head()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,recent_100_statuses,hotel
0,SophiaSuites,Sophia Collection Santorini,"Luxury Suites, hotels and villas Santorini com...",513,41,127,"""The tans will fade but the memories will las...",1
1,AnthiMariaApart,AnthiMariaApartments,Anthi Maria Beach Apartments is a self-caterin...,102,25,110,Our fantastic New and Improved abc online web...,1
2,wEndowproject,wEndow project,WEndow Escape Resort & Villas | Tailor-made Ad...,350,344,103,https://t.co/DHuXrG8G6o For those who still d...,1
3,paphotels,paphotels,"The best of Greek hospitality! Follow us, visi...",975,1182,475,@AlbertBourla 💯💯💯💯💯 Happy Easter !!!🐣 @paphot...,1
4,medpalace,Mediterranean Palace,A cozy 5 star hotel in the city center with an...,269,543,381,https://t.co/WPCR6KSnw2 New era!\nNew Brand! ...,1


In [22]:
hotel_friends = calculate_hotel_friends(validation_set.screen_name)

  0%|          | 0/50 [00:00<?, ?it/s]

Rate limit reached. Sleeping for: 650
Rate limit reached. Sleeping for: 760
Rate limit reached. Sleeping for: 792
Rate limit reached. Sleeping for: 736


In [26]:
validation_set['friends_hotel_count_1000'] = hotel_friends
validation_set.to_csv('../../../../datasets/Hotels/classification/hotels-validation-set-enhanced.csv', index=False)
del validation_set

### Training Set

In [28]:
training_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-training-set.csv')
training_set = training_set.replace(np.nan, '')
training_set.head()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,recent_100_statuses,hotel
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,1832,1569,2229,Summer vacation is meant to make you feel ⛱ r...,1
1,AquaVistaHotels,Aqua Vista Hotels,A compilation of extraordinary hotels catering...,5924,1650,2116,Thank you Greek Travel Pages for highlighting...,1
2,Eurobank_Group,Eurobank,Καλωσήρθατε στην επίσημη σελίδα της Eurobank σ...,3284,0,2691,"Η Eurobank ενημερώνει ότι τα συστήματά της, κ...",0
3,white_suites,White Suites Resort,White Suites Resort is a luxury beach hotel in...,2,93,18,"Sea side holidays in Afytos, Halikidiki White...",1
4,KarenMillen,Karen Millen,"Timeless, elevated ready-to-wear style for women.",10908,1409,35679,The future's bright.\nhttps://t.co/XLpskBYi4u...,0


In [29]:
hotel_friends = calculate_hotel_friends(training_set.screen_name)

  0%|          | 0/200 [00:00<?, ?it/s]

Rate limit reached. Sleeping for: 457
Rate limit reached. Sleeping for: 604
Rate limit reached. Sleeping for: 674
Rate limit reached. Sleeping for: 730
Rate limit reached. Sleeping for: 645
Rate limit reached. Sleeping for: 759
Rate limit reached. Sleeping for: 693
Rate limit reached. Sleeping for: 756
Rate limit reached. Sleeping for: 795
Rate limit reached. Sleeping for: 743
Rate limit reached. Sleeping for: 733
Rate limit reached. Sleeping for: 711
Rate limit reached. Sleeping for: 673


In [31]:
training_set['friends_hotel_count_1000'] = hotel_friends
training_set.to_csv('../../../../datasets/Hotels/classification/hotels-training-set-enhanced.csv', index=False)
del training_set

# Dataset
---

In [6]:
# Read Training Set
training_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-training-set-enhanced.csv')
training_set = training_set.replace(np.nan, '')
training_set.tail()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,recent_100_statuses,hotel,friends_hotel_count_1000
195,AlexanderHaus,Alexander Haus,"#Studio #Rooms to Let, in #Halkidiki, #Sithoni...",492,232,277,"Though is winter, summer is coming! https://t...",1,14
196,artsoundgr,ArtSound & Lights,Art Sound & Lights Professional Audio/Video Se...,443,497,191,Προσφορά STROBE 1500W DMX ARTLIGHT ST1500W μό...,0,13
197,JOHNMARYRHODES,JOHNMARY FALIRAKI,John Mary is a famly hotel and is located at F...,2,15,7,http://t.co/DNz6I3s0Sh http://t.co/9eqsMrL4MK,1,2
198,THEMETHOTEL,THE MET HOTEL,https://t.co/fi814NlnxK\r\nhttp://t.co/AlYUMI5...,2816,181,1136,Let the LOVE sparkle at The MET Hotel!!\n\n#T...,1,9
199,EvitaResort,SunConnect Evita,,25,24,37,Zumba time @evitaresort @evitaresort #sunconn...,1,1


# Only Counts

In [7]:
X = training_set
y = training_set.hotel

In [8]:
# Function to select the data
def get_data_hotel_friends_(df):
    data = df['friends_hotel_count_1000'].to_numpy()
    return data.reshape(-1,1)


get_data = FunctionTransformer(get_data_hotel_friends_)


print('------------------ Support Vector Machine -------------------\n')

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('svm', svm.SVC())
])

# Paramters for optimization
parameters = {'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']}

grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('knn', KNeighborsClassifier())
])

# Paramters for optimization
parameters = {'knn__n_neighbors': [i for i in range(20)],
              'knn__weights': ['uniform', 'distance']}

grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

print('\n\n------------------ Logistic Regression -------------------\n')

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('lr', LogisticRegression(max_iter=1000))
])

# Paramters for optimization
parameters = {'lr__penalty': ['l1', 'l2', 'elasticnet'],
              'lr__C': [0.1, 0.5, 1, 5, 10]}

grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

------------------ Support Vector Machine -------------------

 Best Params: {'svm__C': 0.1, 'svm__kernel': 'sigmoid'}.
 Score: 0.6799999999999999


------------------ kNN -------------------



 0.635 0.62  0.64  0.62  0.655 0.63  0.62  0.62  0.615 0.63  0.62  0.625
 0.61  0.625 0.625 0.625 0.625 0.635 0.615 0.625 0.63  0.625 0.635 0.625
 0.63  0.625 0.655 0.625]


 Best Params: {'knn__n_neighbors': 8, 'knn__weights': 'uniform'}.
 Score: 0.655


------------------ Logistic Regression -------------------

 Best Params: {'lr__C': 0.1, 'lr__penalty': 'l2'}.
 Score: 0.65


  nan]


### Export Model

In [35]:
X = training_set
y = training_set.hotel

# Function to select the data
def get_data_hotel_fr(df):
    data = df['friends_hotel_count_1000'].to_numpy()
    return data.reshape(-1,1)


get_data = FunctionTransformer(get_data_hotel_fr)

# The pipeline
pipeline = Pipeline([
    ('selector', get_data),
    ('svm', svm.SVC(C=0.1, kernel='sigmoid'))
])

pipeline.fit(X,y)

Pipeline(steps=[('selector',
                 FunctionTransformer(func=<function get_data_hotel_fr at 0x7fb269cd8d30>)),
                ('svm', SVC(C=0.1, kernel='sigmoid'))])

In [36]:
filename = 'classifier_hotel_fr.sav'
joblib.dump(pipeline, filename)

['classifier_hotel_fr.sav']

# Name Description Tweets and Counts

## Without NLP

In [9]:
train = training_set.copy()
train['textdata'] = clean_text(train['name'] + ' ' + train['description'] + ' ' + train['recent_100_statuses'])
X = train
y = train.hotel

In [10]:
def get_text_data_(df):
    
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)

def get_numeric_data_(df):
    data = df['friends_hotel_count_1000'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.825 0.56  0.645 0.535 0.8   0.56  0.64  0.535 0.83  0.565 0.645 0.665
 0.895 0.56  0.65  0.565 0.88  0.56  0.66  0.535 0.855 0.565 0.645 0.535
 0.82  0.565 0.64  0.535 0.81  0.565 0.645 0.665 0.885 0.56  0.65  0.565
 0.885 0.56  0.66  0.535 0.845 0.56  0.645 0.535 0.815 0.565 0.64  0.535
 0.725 0.565 0.645 0.665 0.87  0.56  0.65  0.565 0.88  0.56  0.655 0.535
 0.835 0.56  0.645 0.535 0.79  0.56  0.64  0.535 0.81  0.565 0.645 0.665
 0.88  0.56  0.65  0.565 0.88  0.56  0.66  0.535 0.845 0.565 0.645 0.535
 0.825 0.56  0.64  0.535 0.81  0.565 0.645 0.665 0.885 0.56  0.65  0.565
 0.885 0.56  0.66  0.535 0.845 0.56  0.645 0.535 0.815 0.565 0.64  0.535
 0.69  0.565 0.645 0.665 0.88  0.56  0.65  0.565 0.885 0.56  0.655 0.535
 0.86  0.555 0.645 0.535 0.86  0.555 0.635 0.535 0.805 0.565 0.645 0.665
 0.885 0.56  0.65  0.565 0.88  0.56  0.66  0.535 0.84  0.565 0.645 0.535
 0.825 0.56  0.64  0.535 0.81  0.565 0.645 0.665 0.885 0.56  0.65  0.565
 0.885 0.56  0.66  0.535 0.845 0.56  0.645 0.535 0.

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'svm__C': 0.5, 'svm__kernel': 'linear'}.
 Score: 0.8949999999999999


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.7   0.705 0.685 0.69  0.675 0.7   0.685 0.69  0.66  0.66  0.595 0.66
 0.66  0.66  0.665 0.67  0.69  0.695 0.68  0.68  0.705 0.71  0.7   0.7
 0.675 0.7   0.68  0.7   0.67  0.67  0.61  0.67  0.675 0.67  0.675 0.69
 0.685 0.69  0.665 0.675 0.695 0.705 0.7   0.695 0.68  0.7   0.67  0.7
 0.62  0.62  0.555 0.62  0.615 0.615 0.66  0.62  0.675 0.685 0.67  0.67
 0.695 0.705 0.69  0.7   0.675 0.705 0.675 0.695 0.65  0.65  0.605 0.665
 0.665 0.665 0.665 0.665 0.685 0.69  0.685 0.685 0.705 0.71  0.695 0.705
 0.68  0.705 0.685 0.7   0.67  0.67  0.61  0.67  0.675 0.67  0.675 0.69
 0.685 0.69  0.665 0.675 0.695 0.705 0.7   0.695 0.68  0.7   0.67  0.7
 0.715 0.715 0.6   0.715 0.675 0.67  0.665 0.665 0.685 0.685 0.665 0.665
 0.7   0.7   0.675 0.665 0.67  0.675 0.67  0.675 0.65  0.65  0.605 0.665
 0.66  0.66  0.67  0.665 0.685 0.69  0.685 0.685 0.705 0.71  0.695 0.705
 0.68  0.705 0.685 0.7   0.67  0.67  0.61  0.67  0.675 0.67  0.675 0.69
 0.685 0.69  0.665 0.675 0.695 0.705 0.7   0.695 0.68  0.7   0

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 10, 'knn__n_neighbors': 7, 'knn__weights': 'distance'}.
 Score: 0.7449999999999999


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.845   nan 0.845   nan 0.865   nan 0.845   nan 0.76    nan 0.86
   nan 0.84    nan 0.86    nan 0.86    nan 0.86    nan 0.865   nan 0.86
   nan 0.875   nan 0.86    nan 0.77    nan 0.845   nan 0.835   nan 0.845
   nan 0.855   nan 0.845   nan 0.86    nan 0.845   nan 0.865   nan 0.845
   nan 0.74    nan 0.83    nan 0.835   nan 0.83    nan 0.835   nan 0.83
   nan 0.85    nan 0.83    nan 0.855   nan 0.83    nan 0.765   nan 0.86
   nan 0.835   nan 0.86    nan 0.86    nan 0.86    nan 0.86    nan 0.86
   nan 0.865   nan 0.86    nan 0.77    nan 0.845   nan 0.835   nan 0.845
   nan 0.855   nan 0.845   nan 0.86    nan 0.845   nan 0.865   nan 0.845
   nan 0.72    nan 0.825   nan 0.825   nan 0.825   nan 0.865   nan 0.825
   nan 0.865   nan 0.825   nan 0.87    nan 0.825   nan 0.765   nan 0.85
   nan 0.835   nan 0.85    nan 0.86    nan 0.85    nan 0.86    nan 0.85
   nan 0.865   nan 0.85    nan 0.77    nan 0.845   nan 0.835   nan 0.845
   nan 0.855   nan 0.845   nan 0.86    nan 0.845   nan 0.8

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 10, 'lr__penalty': 'l2'}.
 Score: 0.875


## Stop Word Removal

In [11]:
train = training_set.copy()
train['textdata'] = clean_text(train['name'] + ' ' + train['description'] + ' ' + train['recent_100_statuses'])
train['textdata'] = train['textdata'].apply(lambda row: remove_stopwords(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))

X = train
y = train.hotel

In [12]:
def get_text_data_(df):
    
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df['friends_hotel_count_1000'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.86  0.56  0.645 0.565 0.86  0.56  0.64  0.555 0.745 0.565 0.645 0.665
 0.89  0.56  0.65  0.565 0.89  0.56  0.655 0.535 0.875 0.565 0.645 0.535
 0.83  0.56  0.645 0.535 0.785 0.565 0.645 0.665 0.885 0.56  0.65  0.565
 0.875 0.56  0.655 0.535 0.85  0.56  0.645 0.535 0.84  0.56  0.645 0.535
 0.705 0.565 0.645 0.665 0.875 0.56  0.65  0.565 0.875 0.56  0.655 0.535
 0.865 0.56  0.645 0.535 0.865 0.56  0.64  0.535 0.735 0.565 0.645 0.665
 0.89  0.56  0.65  0.565 0.885 0.56  0.655 0.535 0.88  0.56  0.645 0.535
 0.85  0.56  0.645 0.535 0.785 0.565 0.645 0.665 0.885 0.56  0.65  0.565
 0.875 0.56  0.655 0.535 0.85  0.56  0.645 0.535 0.84  0.56  0.645 0.535
 0.68  0.565 0.645 0.665 0.865 0.56  0.65  0.575 0.87  0.56  0.655 0.57
 0.865 0.555 0.645 0.565 0.865 0.555 0.64  0.555 0.735 0.565 0.645 0.665
 0.89  0.56  0.65  0.565 0.885 0.56  0.655 0.535 0.88  0.56  0.645 0.535
 0.85  0.56  0.645 0.535 0.785 0.565 0.645 0.665 0.885 0.56  0.65  0.565
 0.875 0.56  0.655 0.535 0.85  0.56  0.645 0.535 0.8

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 1, 'svm__C': 1, 'svm__kernel': 'linear'}.
 Score: 0.8949999999999999


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.705 0.715 0.695 0.69  0.675 0.685 0.69  0.69  0.58  0.58  0.555 0.6
 0.605 0.6   0.67  0.62  0.685 0.685 0.68  0.655 0.71  0.72  0.705 0.7
 0.685 0.69  0.69  0.69  0.585 0.585 0.545 0.605 0.62  0.615 0.665 0.615
 0.68  0.68  0.665 0.655 0.715 0.715 0.705 0.695 0.69  0.695 0.685 0.695
 0.575 0.575 0.55  0.595 0.59  0.59  0.665 0.62  0.675 0.685 0.68  0.66
 0.72  0.735 0.7   0.695 0.68  0.69  0.7   0.69  0.615 0.615 0.565 0.635
 0.625 0.625 0.675 0.635 0.68  0.68  0.68  0.67  0.72  0.725 0.705 0.695
 0.69  0.695 0.695 0.7   0.585 0.585 0.545 0.605 0.62  0.615 0.665 0.615
 0.68  0.68  0.665 0.655 0.715 0.715 0.705 0.695 0.69  0.695 0.685 0.695
 0.71  0.71  0.595 0.705 0.665 0.66  0.675 0.675 0.705 0.705 0.695 0.68
 0.72  0.72  0.7   0.68  0.685 0.685 0.69  0.7   0.615 0.615 0.565 0.635
 0.625 0.625 0.675 0.635 0.68  0.68  0.68  0.67  0.72  0.725 0.705 0.695
 0.69  0.695 0.695 0.7   0.585 0.585 0.545 0.605 0.62  0.615 0.665 0.615
 0.68  0.68  0.665 0.655 0.715 0.715 0.705 0.695 0.69  0.

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 2000, 'features__text_features__vectorizer__min_df': 1, 'knn__n_neighbors': 7, 'knn__weights': 'distance'}.
 Score: 0.735


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.89    nan 0.88    nan 0.88    nan 0.88    nan 0.73    nan 0.89
   nan 0.855   nan 0.89    nan 0.895   nan 0.89    nan 0.89    nan 0.89
   nan 0.89    nan 0.89    nan 0.76    nan 0.88    nan 0.86    nan 0.88
   nan 0.885   nan 0.88    nan 0.89    nan 0.88    nan 0.88    nan 0.88
   nan 0.71    nan 0.845   nan 0.83    nan 0.845   nan 0.855   nan 0.845
   nan 0.865   nan 0.845   nan 0.865   nan 0.845   nan 0.725   nan 0.88
   nan 0.835   nan 0.88    nan 0.88    nan 0.88    nan 0.885   nan 0.88
   nan 0.89    nan 0.88    nan 0.76    nan 0.88    nan 0.86    nan 0.88
   nan 0.885   nan 0.88    nan 0.89    nan 0.88    nan 0.88    nan 0.88
   nan 0.69    nan 0.855   nan 0.815   nan 0.855   nan 0.835   nan 0.855
   nan 0.87    nan 0.855   nan 0.87    nan 0.855   nan 0.725   nan 0.88
   nan 0.835   nan 0.88    nan 0.88    nan 0.88    nan 0.885   nan 0.88
   nan 0.89    nan 0.88    nan 0.76    nan 0.88    nan 0.86    nan 0.88
   nan 0.885   nan 0.88    nan 0.89    nan 0.88    nan 0.88   

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 1, 'lr__penalty': 'l2'}.
 Score: 0.8950000000000001


## Lemmatization and Stop Word Removal

In [13]:
train = training_set.copy()
train['textdata'] = clean_text(train['name'] + ' ' + train['description'] + ' ' + train['recent_100_statuses'])
train['textdata'] = train['textdata'].apply(lambda row: tokenize_lemmatize(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
train['textdata'] = train['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
train['textdata'] = train['textdata'].apply(lambda row: remove_stopwords(row))
train['textdata'] = train['textdata'].apply(lambda row: ' '.join(row))
X = train
y = train.hotel

In [14]:
def get_text_data_(df):
    return df.textdata

get_text_data = FunctionTransformer(get_text_data_)


def get_numeric_data_(df):
    data = df['friends_hotel_count_1000'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_numeric_data_)




print('------------------ Support Vector Machine -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('svm', svm.SVC())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'svm__C' : [0.1,0.5,1,5,10],
              'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ kNN -------------------\n')

pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('knn', KNeighborsClassifier())
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
              'knn__weights': ['uniform', 'distance']
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')


print('\n\n------------------ Logistic Regression -------------------\n')


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer()),
            ]))
         ])),
     ('lr', LogisticRegression(max_iter=1000))
])


# Paramters for optimization
parameters = {'features__text_features__vectorizer__max_df': [0.5, 0.75, 1],
              'features__text_features__vectorizer__min_df': [1, 5, 10],
              'features__text_features__vectorizer__max_features': [1000, 2000, None],
              'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
              'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

grid = GridSearchCV(pipeline, parameters, n_jobs = 4, verbose=1)
grid.fit(X, y)
    
print(f' Best Params: {grid.best_params_}.\n Score: {grid.best_score_}')

del train

------------------ Support Vector Machine -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.86  0.56  0.645 0.535 0.86  0.56  0.64  0.535 0.755 0.565 0.645 0.665
 0.885 0.56  0.65  0.565 0.885 0.56  0.655 0.535 0.87  0.555 0.645 0.535
 0.855 0.56  0.645 0.535 0.8   0.565 0.645 0.665 0.89  0.56  0.65  0.565
 0.885 0.56  0.655 0.535 0.88  0.56  0.645 0.535 0.855 0.56  0.645 0.535
 0.71  0.565 0.645 0.665 0.885 0.56  0.65  0.565 0.865 0.56  0.655 0.535
 0.865 0.56  0.645 0.535 0.87  0.56  0.64  0.535 0.74  0.565 0.645 0.665
 0.88  0.56  0.65  0.565 0.895 0.56  0.655 0.535 0.87  0.555 0.645 0.535
 0.855 0.56  0.645 0.535 0.8   0.565 0.645 0.665 0.89  0.56  0.65  0.565
 0.885 0.56  0.655 0.535 0.88  0.56  0.645 0.535 0.855 0.56  0.645 0.535
 0.685 0.565 0.645 0.665 0.865 0.56  0.65  0.56  0.88  0.56  0.655 0.535
 0.88  0.555 0.645 0.535 0.885 0.555 0.64  0.535 0.74  0.565 0.645 0.665
 0.88  0.56  0.65  0.565 0.895 0.56  0.655 0.535 0.87  0.555 0.645 0.535
 0.855 0.56  0.645 0.535 0.8   0.565 0.645 0.665 0.89  0.56  0.65  0.565
 0.885 0.56  0.655 0.535 0.88  0.56  0.645 0.535 0.

 Best Params: {'features__text_features__vectorizer__max_df': 0.75, 'features__text_features__vectorizer__max_features': 2000, 'features__text_features__vectorizer__min_df': 1, 'svm__C': 10, 'svm__kernel': 'linear'}.
 Score: 0.9


------------------ kNN -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


 0.7   0.715 0.7   0.715 0.68  0.695 0.685 0.7   0.59  0.59  0.6   0.64
 0.665 0.655 0.68  0.675 0.685 0.68  0.685 0.66  0.705 0.71  0.71  0.705
 0.69  0.69  0.685 0.695 0.59  0.59  0.595 0.64  0.665 0.655 0.67  0.675
 0.685 0.685 0.68  0.66  0.715 0.71  0.705 0.7   0.695 0.685 0.695 0.69
 0.605 0.605 0.59  0.65  0.665 0.67  0.68  0.675 0.675 0.68  0.685 0.65
 0.7   0.71  0.715 0.695 0.685 0.69  0.685 0.69  0.605 0.605 0.61  0.655
 0.67  0.66  0.67  0.69  0.685 0.685 0.695 0.665 0.715 0.725 0.72  0.695
 0.69  0.69  0.685 0.69  0.59  0.59  0.595 0.64  0.665 0.655 0.67  0.675
 0.685 0.685 0.68  0.66  0.715 0.71  0.705 0.7   0.695 0.685 0.695 0.69
 0.71  0.71  0.595 0.71  0.67  0.665 0.68  0.675 0.695 0.695 0.7   0.69
 0.72  0.725 0.7   0.685 0.68  0.68  0.685 0.69  0.605 0.605 0.61  0.655
 0.67  0.66  0.67  0.69  0.685 0.685 0.695 0.665 0.715 0.725 0.72  0.695
 0.69  0.69  0.685 0.69  0.59  0.59  0.595 0.64  0.665 0.655 0.67  0.675
 0.685 0.685 0.68  0.66  0.715 0.71  0.705 0.7   0.695 0

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 2000, 'features__text_features__vectorizer__min_df': 5, 'knn__n_neighbors': 7, 'knn__weights': 'distance'}.
 Score: 0.725


------------------ Logistic Regression -------------------

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


   nan 0.885   nan 0.875   nan 0.885   nan 0.875   nan 0.73    nan 0.87
   nan 0.86    nan 0.87    nan 0.885   nan 0.87    nan 0.89    nan 0.87
   nan 0.9     nan 0.87    nan 0.77    nan 0.845   nan 0.865   nan 0.845
   nan 0.89    nan 0.845   nan 0.885   nan 0.845   nan 0.89    nan 0.845
   nan 0.72    nan 0.84    nan 0.835   nan 0.84    nan 0.855   nan 0.84
   nan 0.87    nan 0.84    nan 0.87    nan 0.84    nan 0.725   nan 0.875
   nan 0.85    nan 0.875   nan 0.88    nan 0.875   nan 0.885   nan 0.875
   nan 0.89    nan 0.875   nan 0.77    nan 0.845   nan 0.865   nan 0.845
   nan 0.89    nan 0.845   nan 0.885   nan 0.845   nan 0.89    nan 0.845
   nan 0.695   nan 0.86    nan 0.825   nan 0.86    nan 0.845   nan 0.86
   nan 0.865   nan 0.86    nan 0.87    nan 0.86    nan 0.725   nan 0.875
   nan 0.85    nan 0.875   nan 0.88    nan 0.875   nan 0.885   nan 0.875
   nan 0.89    nan 0.875   nan 0.77    nan 0.845   nan 0.865   nan 0.845
   nan 0.89    nan 0.845   nan 0.885   nan 0.845   nan 

 Best Params: {'features__text_features__vectorizer__max_df': 0.5, 'features__text_features__vectorizer__max_features': 1000, 'features__text_features__vectorizer__min_df': 5, 'lr__C': 10, 'lr__penalty': 'l2'}.
 Score: 0.9


### Export Model

In [15]:
X = training_set
y = training_set.hotel

In [16]:
def get_hotel_text_data_(df):
    df = df.copy()
    df['textdata'] = clean_text(df['name']+ ' ' + df['description'] + ' ' + df['recent_100_statuses'])
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    return df.textdata

get_text_data = FunctionTransformer(get_hotel_text_data_)


def get_hotel_numeric_data_(df):
    data = df['friends_hotel_count_1000'].to_numpy()
    return data.reshape(-1,1)

get_numeric_data = FunctionTransformer(get_hotel_numeric_data_)


pipeline = Pipeline([
    ('features', FeatureUnion([
            ('numeric_features', Pipeline([
                ('selector_num', get_numeric_data)
            ])),
             ('text_features', Pipeline([
                ('selector_text', get_text_data),
                ('vectorizer', TfidfVectorizer(max_df=0.75, max_features=2000, min_df=1)),
            ]))
         ])),
     ('svm', svm.SVC(kernel='linear', C=10))
])


pipeline.fit(X, y)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('numeric_features',
                                                 Pipeline(steps=[('selector_num',
                                                                  FunctionTransformer(func=<function get_hotel_numeric_data_ at 0x7ff070c15dc0>))])),
                                                ('text_features',
                                                 Pipeline(steps=[('selector_text',
                                                                  FunctionTransformer(func=<function get_hotel_text_data_ at 0x7ff070c15940>)),
                                                                 ('vectorizer',
                                                                  TfidfVectorizer(max_df=0.75,
                                                                                  max_features=2000))]))])),
                ('svm', SVC(C=10, kernel='linear'))])

In [17]:
filename = '../classifiers/classifier_hotel_ndtfr.sav'
joblib.dump(pipeline, filename)

['../classifiers/classifier_hotel_ndtfr.sav']