# Classification Experiment: Name + Description
---
This Notebook, includes a series of experiments, on using a node's name and description for classification.

Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tweepy
import time

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import joblib

Twitter API Authentication:

In [2]:
twitter_credentials = []
with open('../../../../twitter_credentials.json', 'r') as f:
    twitter_credentials = json.load(f)

auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token_key'],twitter_credentials['access_token_secret'])
API = tweepy.API(auth)

Functions:

In [3]:
# Function For Text Normalization
def clean_text(data):
    urls = r'http\S+'
    non_unicode_char = r'\W'
    numbers = r'[0-9_]'
    fix_whitespace = r'\s+'
    single_whitespace = ' '
    
    data = (data.replace([urls], single_whitespace, regex=True)
                    .replace([non_unicode_char, numbers], single_whitespace, regex=True)
                    .replace(fix_whitespace, single_whitespace, regex=True))
    data = data.apply(lambda s: s.lower() if type(s) == str else s)
    return data

# Function For Support Vector Machine
def classification_svm(X, y, vect):
    if vect == 'TF-IDF':
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('svm', svm.SVC())
        ]
        )
    elif vect == 'BoW':
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('svm', svm.SVC())
        ]
        )

    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'svm__C' : [0.1,0.5,1,5,10],
                  'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }
    
    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)
    
    print(grid.best_params_)
    return grid.best_score_

# Function For Logistic Regression
def classification_lr(X, y, vect):
    if vect == 'TF-IDF':
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('lr', LogisticRegression(max_iter=1000))
        ]
        )
    elif vect == 'BoW':
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('lr', LogisticRegression(max_iter=1000))
        ]
        )

    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
                  'lr__C': [0.1, 0.5, 1, 5, 10]
                  }
    
    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)
    
    print(grid.best_params_)
    return grid.best_score_

# Function For kNN
def classification_knn(X, y, vect):
    if vect == 'TF-IDF':
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('knn', KNeighborsClassifier())
        ]
        )
    elif vect == 'BoW':
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('knn', KNeighborsClassifier())
        ]
        )

    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
                  'knn__weights': ['uniform', 'distance']
                  }
    
    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)
    
    print(grid.best_params_)
    return grid.best_score_


# Dataset
---

To begin with, we read our datasets, and fetch some tweets for each node creating 3 new fields:
- recent_tweet
- recent_10_tweets
- recent_100_tweets

In [4]:
# Read Training Set
training_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-training-set.csv', 
                           usecols=['screen_name', 'name', 'description', 'hotel'])
training_set = training_set.replace(np.nan, '')

In [5]:
training_set.head()

Unnamed: 0,screen_name,name,description,hotel
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,1
1,AquaVistaHotels,Aqua Vista Hotels,A compilation of extraordinary hotels catering...,1
2,Eurobank_Group,Eurobank,Καλωσήρθατε στην επίσημη σελίδα της Eurobank σ...,0
3,white_suites,White Suites Resort,White Suites Resort is a luxury beach hotel in...,1
4,KarenMillen,Karen Millen,"Timeless, elevated ready-to-wear style for women.",0


# Case 1: name + description 
---
In this case, we use a node's name and description  as a single feature to classify the node.

## Text Normalization
We start by creating a new field:
- textdata : name + description

In [9]:
data = training_set.copy()
data['textdata_1'] = data['name'] + ' ' + data['description']
data = data.drop(['name', 'description'], axis = 1)
data.head(3)

Unnamed: 0,screen_name,hotel,textdata_1
0,aldemar_resorts,1,Aldemar Resorts Guest satisfaction is our top ...
1,AquaVistaHotels,1,Aqua Vista Hotels A compilation of extraordina...
2,Eurobank_Group,0,Eurobank Καλωσήρθατε στην επίσημη σελίδα της E...


Next normalize our text by taking the following actions:

- remove URLs
- remove anything that isn't a unicode character (e.g emojis, punctuation)
- remove numbers and _
- fix whitespace
- convert to lower case

In [10]:
data['textdata_1'] = clean_text(data['textdata_1'])

## Model Selection

In [11]:
svm_tfidf = pd.DataFrame()

lr_tfidf = pd.DataFrame()

knn_tfidf = pd.DataFrame()

### Without NLP

In [12]:
textdatas = ['textdata_1']

#### SVM

In [14]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9


#### Logistic Regression

In [15]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 10, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.895


#### kNN

In [16]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 3, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.89


### Stop Word Removal

In [13]:
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    row = [str(token) for token in nlp_el(row)]
    return [w for w in row if w not in STOPWORDS]

In [18]:
df = data.copy()

df['textdata_1'] = df['textdata_1'].apply(lambda row: remove_stopwords(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))

#### SVM

In [19]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9
Stopword Removal,0.915


#### Logistic Regression

In [20]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.895
Stopword Removal,0.905


#### kNN

In [21]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 4, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.89
Stopword Removal,0.89


### Lemmatization

In [9]:
def tokenize_lemmatize(row):
    return [str(token.lemma_) for token in nlp_el(row)]

def tokenize_lemmatize_en(row):
    return [str(token.lemma_) for token in nlp_en(row)]

In [23]:
df = data.copy()

df['textdata_1'] = df['textdata_1'].apply(lambda row: tokenize_lemmatize(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: tokenize_lemmatize_en(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: remove_stopwords(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))

#### SVM

In [24]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9
Stopword Removal,0.915
Lemmatization,0.92


#### Logistic Regression

In [25]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.895
Stopword Removal,0.905
Lemmatization,0.905


#### kNN

In [26]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 4, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.89
Stopword Removal,0.89
Lemmatization,0.885


### Remove named entities (Country Names and Geographic Locations)

In [38]:
def remove_named_entities_en(row):
    return [str(token) for token in nlp_en(row) if token.ent_type_ not in set(['NORP', 'GPE'])]

def remove_named_entities_el(row):
    return [str(token) for token in nlp_el(row) if token.ent_type_ not in set(['NORP', 'GPE'])]

In [41]:
df = data.copy()

df['textdata_1'] = df['textdata_1'].apply(lambda row: remove_named_entities_en(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: remove_named_entities_el(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))

df['textdata_1'] = df['textdata_1'].apply(lambda row: tokenize_lemmatize(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: tokenize_lemmatize_en(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: remove_stopwords(row))
df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(row))

#### SVM

In [42]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df_2[textdata]
    results[textdata] = classification_svm(X, df_2['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Named Entities Removal']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9
Stopword Removal,0.915
Lemmatization,0.92
Named Entities Removal,0.92
Named Entities Removal,0.92


#### Logistic Regression

In [43]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df_2[textdata]
    results[textdata] = classification_lr(X, df_2['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Named Entities Removal']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.895
Stopword Removal,0.905
Lemmatization,0.905
Named Entities Removal,0.9
Named Entities Removal,0.9


#### kNN

In [44]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df_2[textdata]
    results[textdata] = classification_knn(X, df_2['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Named Entities Removal']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 4, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.89
Stopword Removal,0.89
Lemmatization,0.885
Named Entities Removal,0.89
Named Entities Removal,0.89


## Export Model


The best model we found is: SVM-TF-IDF
- vectorizer__max_df: 0.5
- vectorizer__max_features: 1000
- vectorizer__min_df: 1
- svm__C: 1
- svm_kernel: rbf<br>

with the following NLP steps:
- Lemmatization
- Stop Word Removal


In [14]:
def get_text_data_hotel_nd(df):
    df['textdata'] = clean_text(df['name'] + ' ' + df['description'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    
    return df.textdata


get_text = FunctionTransformer(get_text_data_hotel_nd)

In [15]:
pipeline = Pipeline([
    ('selector', get_text),
    ('tfidf', TfidfVectorizer(max_df=0.5, max_features=1000, min_df=1)),
    ('svm', svm.SVC(kernel='rbf', C=1, probability=True))
])

In [16]:
X = training_set
y = training_set['hotel']
pipeline.fit(X,y)

Pipeline(steps=[('selector',
                 FunctionTransformer(func=<function get_text_data_hotel_nd at 0x7fa6b5ebad30>)),
                ('tfidf', TfidfVectorizer(max_df=0.5, max_features=1000)),
                ('svm', SVC(C=1, probability=True))])

In [17]:
filename = 'classifier_hotel_nd.sav'
joblib.dump(pipeline, filename)

['classifier_hotel_nd.sav']