# Classification Experiment: Tweets
---
This Notebook, includes a series of experiments, on using a node's tweets for classification.

Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tweepy
import time

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import joblib

Twitter API Authentication:

In [2]:
twitter_credentials = []
with open('../../../../twitter_credentials.json', 'r') as f:
    twitter_credentials = json.load(f)

auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token_key'],twitter_credentials['access_token_secret'])
API = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True, timeout=600)

Functions:

In [4]:
# Function For Text Normalization
def clean_text(data):
    urls = r'http\S+'
    non_unicode_char = r'\W'
    numbers = r'[0-9_]'
    fix_whitespace = r'\s+'
    single_whitespace = ' '
    
    data = (data.replace([urls], single_whitespace, regex=True)
                    .replace([non_unicode_char, numbers], single_whitespace, regex=True)
                    .replace(fix_whitespace, single_whitespace, regex=True))
    data = data.apply(lambda s: s.lower() if type(s) == str else s)
    return data

nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    row = [str(token) for token in nlp_el(row)]
    return [w for w in row if w not in STOPWORDS]

def tokenize_lemmatize(row):
    return [str(token.lemma_) for token in nlp_el(row)]

def tokenize_lemmatize_en(row):
    return [str(token.lemma_) for token in nlp_en(row)]

# Function For Support Vector Machine
def classification_svm(X, y, vect):
    if vect == 'TF-IDF':
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('svm', svm.SVC())
        ]
        )
    elif vect == 'BoW':
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('svm', svm.SVC())
        ]
        )

    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'svm__C' : [0.1,0.5,1,5,10],
                  'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }
    
    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)
    
    print(grid.best_params_)
    return grid.best_score_

# Function For Logistic Regression
def classification_lr(X, y, vect):
    if vect == 'TF-IDF':
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('lr', LogisticRegression(max_iter=1000))
        ]
        )
    elif vect == 'BoW':
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('lr', LogisticRegression(max_iter=1000))
        ]
        )

    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
                  'lr__C': [0.1, 0.5, 1, 5, 10]
                  }
    
    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)
    
    print(grid.best_params_)
    return grid.best_score_

# Function For kNN
def classification_knn(X, y, vect):
    if vect == 'TF-IDF':
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('knn', KNeighborsClassifier())
        ]
        )
    elif vect == 'BoW':
        pipeline = Pipeline([
            ('vectorizer', CountVectorizer()),
            ('knn', KNeighborsClassifier())
        ]
        )

    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
                  'knn__weights': ['uniform', 'distance']
                  }
    
    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)
    
    print(grid.best_params_)
    return grid.best_score_


def get_text_data_nd(df):
    df['textdata'] = clean_text(df['name'] + ' ' + df['description'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    
    return df.textdata


# Dataset
---

In [15]:
# Read Training Set
training_set = pd.read_csv('../../../../datasets/Hotels/classification/hotels-training-set.csv',
                          usecols=['screen_name', 'name', 'description', 'recent_100_statuses', 'hotel'])
training_set = training_set.replace(np.nan, '')

In [16]:
training_set.head()

Unnamed: 0,screen_name,name,description,recent_100_statuses,hotel
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,Summer vacation is meant to make you feel ⛱ r...,1
1,AquaVistaHotels,Aqua Vista Hotels,A compilation of extraordinary hotels catering...,Thank you Greek Travel Pages for highlighting...,1
2,Eurobank_Group,Eurobank,Καλωσήρθατε στην επίσημη σελίδα της Eurobank σ...,"Η Eurobank ενημερώνει ότι τα συστήματά της, κ...",0
3,white_suites,White Suites Resort,White Suites Resort is a luxury beach hotel in...,"Sea side holidays in Afytos, Halikidiki White...",1
4,KarenMillen,Karen Millen,"Timeless, elevated ready-to-wear style for women.",The future's bright.\nhttps://t.co/XLpskBYi4u...,0


# Case 1: name + description + tweets
---
In this case, we use a node's name, description and tweets as a single feature to classify the node.

## Text Normalization
We start by creating 3 new fields:
- textdata_1 : name + description + recent_tweet
- textdata_2 : name + description + recent_10_tweets
- textdata_3 : name + description + recent_100_tweets

In [17]:
data = training_set.copy()
data['textdata_3'] = data['name'] + ' ' + data['description'] + ' ' + data['recent_100_statuses']
data = data.drop(['name', 'description', 'recent_100_statuses'], axis = 1)
data.head(3)

Unnamed: 0,screen_name,hotel,textdata_3
0,aldemar_resorts,1,Aldemar Resorts Guest satisfaction is our top ...
1,AquaVistaHotels,1,Aqua Vista Hotels A compilation of extraordina...
2,Eurobank_Group,0,Eurobank Καλωσήρθατε στην επίσημη σελίδα της E...


Next normalize our text by taking the following actions:

- remove URLss
- remove anything that isn't a unicode character (e.g emojis, punctuation)
- remove numbers and _
- fix whitespace
- convert to lower case

In [18]:
data['textdata_3'] = clean_text(data['textdata_3'])

## Model Selection

In [19]:
svm_tfidf = pd.DataFrame()
lr_tfidf = pd.DataFrame()
knn_tfidf = pd.DataFrame()

### Without NLP

In [20]:
textdatas = ['textdata_3']

#### SVM

In [21]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 10}


Unnamed: 0,textdata_3
Without NLP,0.905


#### Logistic Regression

In [22]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_3
Without NLP,0.875


#### kNN

In [23]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'knn__n_neighbors': 8, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_3
Without NLP,0.865


### Stop Word Removal

In [24]:
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    row = [str(token) for token in nlp_el(row)]
    return [w for w in row if w not in STOPWORDS]

In [25]:
df = data.copy()

df['textdata_3'] = df['textdata_3'].apply(lambda row: remove_stopwords(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))

#### SVM

In [26]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_3
Without NLP,0.905
Stopword Removal,0.91


#### Logistic Regression

In [27]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 25}


Unnamed: 0,textdata_3
Without NLP,0.875
Stopword Removal,0.9


#### kNN

In [28]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'knn__n_neighbors': 8, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_3
Without NLP,0.865
Stopword Removal,0.875


### Lemmatization

In [29]:
def tokenize_lemmatize(row):
    return [str(token.lemma_) for token in nlp_el(row)]

In [30]:
df = data.copy()

df['textdata_3'] = df['textdata_3'].apply(lambda row: tokenize_lemmatize(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: tokenize_lemmatize_en(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: remove_stopwords(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))

#### SVM

In [31]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 100, 'vectorizer__min_df': 10}


Unnamed: 0,textdata_3
Without NLP,0.905
Stopword Removal,0.91
Lemmatization,0.92


#### Logistic Regression

In [32]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 25}


Unnamed: 0,textdata_3
Without NLP,0.875
Stopword Removal,0.9
Lemmatization,0.91


#### kNN

In [33]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:
{'knn__n_neighbors': 8, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_3
Without NLP,0.865
Stopword Removal,0.875
Lemmatization,0.87


### Named Entity Removal

In [34]:
def remove_named_entities_en(row):
    return [str(token) for token in nlp_en(row) if token.ent_type_ not in set(['NORP', 'GPE'])]

def remove_named_entities_el(row):
    return [str(token) for token in nlp_el(row) if token.ent_type_ not in set(['NORP', 'GPE'])]

In [35]:
df = data.copy()

df['textdata_3'] = df['textdata_3'].apply(lambda row: remove_named_entities_en(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: remove_named_entities_el(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))

df['textdata_3'] = df['textdata_3'].apply(lambda row: tokenize_lemmatize(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: tokenize_lemmatize_en(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: remove_stopwords(row))
df['textdata_3'] = df['textdata_3'].apply(lambda row: ' '.join(row))

#### SVM

In [36]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Named Entities Removal']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 100, 'vectorizer__min_df': 10}


Unnamed: 0,textdata_3
Without NLP,0.905
Stopword Removal,0.91
Lemmatization,0.92
Named Entities Removal,0.92


#### Logistic Regression

In [37]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Named Entities Removal']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 25}


Unnamed: 0,textdata_3
Without NLP,0.875
Stopword Removal,0.9
Lemmatization,0.91
Named Entities Removal,0.91


#### kNN

In [38]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Named Entities Removal']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_3:
{'knn__n_neighbors': 8, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_3
Without NLP,0.865
Stopword Removal,0.875
Lemmatization,0.87
Named Entities Removal,0.87


### Export Model
- TF-IDF outperformed BoW with in every case.
- Highest Accuracy was achieved using recent 100 tweets
- Support Vector Machine and Logistic Regression have a better performance.
- Logistic Regression Reached the highest achieved accuracy with less NLP steps than Support Vector Machines

The best models we found are: Support Vector Machines - TF-IDF.

- vectorizer__max_df: 0.75
- vectorizer__max_features: 100
- vectorizer__min_df: 10
- svm__C: 1
- svm__kernel: rbf<br>

with the following NLP steps:
- Lemmatization
- Stop Word Removal


In [68]:
def get_text_data_hotel_ndt(df):
    df['textdata'] = clean_text(df['name'] + ' ' + df['description'] + ' ' + df['recent_100_statuses'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    return df.textdata


get_text_ndt = FunctionTransformer(get_text_data_hotel_ndt)

In [70]:
pipeline = Pipeline([
    ('selector_ndt', get_text_ndt),
    ('tfidf', TfidfVectorizer(max_df=0.75, max_features=100, min_df=10)),
    ('svm', svm.SVC(kernel='rbf', C=1))
])

In [71]:
X = training_set
y = training_set.hotel
pipeline.fit(X,y)

Pipeline(steps=[('selector_ndt',
                 FunctionTransformer(func=<function get_text_data_hotel_ndt at 0x7fbb1cb24040>)),
                ('tfidf',
                 TfidfVectorizer(max_df=0.75, max_features=100, min_df=10)),
                ('svm', SVC(C=1))])

In [72]:
filename = '../classifiers/classifier_hotel_ndt.sav'
joblib.dump(pipeline, filename)

['../classifiers/classifier_hotel_ndt.sav']

# Case 2: Tweets Only
---
In this case we fetch 100 tweets if possible for each node, and try to classify them using only their tweets.

## Text Normalization

In [45]:
data = training_set.copy()
data.head(3)

Unnamed: 0,screen_name,name,description,recent_100_statuses,hotel
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,Summer vacation is meant to make you feel ⛱ r...,1
1,AquaVistaHotels,Aqua Vista Hotels,A compilation of extraordinary hotels catering...,Thank you Greek Travel Pages for highlighting...,1
2,Eurobank_Group,Eurobank,Καλωσήρθατε στην επίσημη σελίδα της Eurobank σ...,"Η Eurobank ενημερώνει ότι τα συστήματά της, κ...",0


We normalize our text by taking the following actions:

- remove URLs
- remove anything that isn't a unicode character (e.g emojis, punctuation)
- remove numbers and _
- fix whitespace
- convert to lower case

In [46]:
data['recent_100_statuses'] = clean_text(data['recent_100_statuses'])

## Model Selection

In [47]:
textdatas = ['recent_100_statuses']

In [48]:
svm_tfidf = pd.DataFrame()
lr_tfidf = pd.DataFrame()
knn_tfidf = pd.DataFrame()

#### SVM

In [49]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 10}


Unnamed: 0,recent_100_statuses
Without NLP,0.895


#### Logistic Regression

In [50]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:




{'lr__C': 10, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}


Unnamed: 0,recent_100_statuses
Without NLP,0.85


#### kNN

In [51]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:




{'knn__n_neighbors': 4, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 100, 'vectorizer__min_df': 25}


Unnamed: 0,recent_100_statuses
Without NLP,0.825


### Stop Word Removal

In [52]:
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    row = [str(token) for token in nlp_el(row)]
    return [w for w in row if w not in STOPWORDS]

In [53]:
df = data.copy()

df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: remove_stopwords(row))
df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: ' '.join(row))


#### SVM

In [54]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:




{'svm__C': 5, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}


Unnamed: 0,recent_100_statuses
Without NLP,0.895
Stopword Removal,0.895


#### Logistic Regression

In [55]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:




{'lr__C': 10, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,recent_100_statuses
Without NLP,0.85
Stopword Removal,0.88


#### kNN

In [56]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:
{'knn__n_neighbors': 4, 'knn__weights': 'distance', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 10, 'vectorizer__min_df': 1}




Unnamed: 0,recent_100_statuses
Without NLP,0.825
Stopword Removal,0.815


### Lemmatization

In [57]:
def tokenize_lemmatize(row):
    return [str(token.lemma_) for token in nlp_el(row)]

In [59]:
df = data.copy()

df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: tokenize_lemmatize(row))
df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: ' '.join(row))
df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: tokenize_lemmatize_en(row))
df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: ' '.join(row))
df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: remove_stopwords(row))
df['recent_100_statuses'] = df['recent_100_statuses'].apply(lambda row: ' '.join(row))

#### SVM

In [60]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
svm_tfidf = svm_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,recent_100_statuses
Without NLP,0.895
Stopword Removal,0.895
Lemmatization,0.905


#### Logistic Regression

In [61]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
lr_tfidf = lr_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:
{'lr__C': 10, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 10}




Unnamed: 0,recent_100_statuses
Without NLP,0.85
Stopword Removal,0.88
Lemmatization,0.88


#### kNN

In [62]:
# TF-IDF 
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['hotel'], 'TF-IDF').round(4)
    print("============================")
    
knn_tfidf = knn_tfidf.append(
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T)

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for recent_100_statuses:
{'knn__n_neighbors': 9, 'knn__weights': 'distance', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 10, 'vectorizer__min_df': 25}




Unnamed: 0,recent_100_statuses
Without NLP,0.825
Stopword Removal,0.815
Lemmatization,0.835


### Export Model
- TF-IDF outperformed BoW with in every case except with Logistic Regression, where they had similar performances
- Logistic Regression had the best performance, and kNN the worst

The best model we found is: SVM-TF-IDF
- vectorizer__max_df: 0.75
- vectorizer__max_features: 2000
- vectorizer__min_df: 5
- svm__C: 1
- svm__kernel: rbf<br>

with the following NLP steps:
- Stop Word Removal
- Lemmatization

In [64]:
def get_text_data_hotel_t(df):
    df['textdata'] = clean_text(df['recent_100_statuses'])
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: tokenize_lemmatize_en(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    df['textdata'] = df['textdata'].apply(lambda row: remove_stopwords(row))
    df['textdata'] = df['textdata'].apply(lambda row: ' '.join(row))
    
    return df.textdata


get_text_t = FunctionTransformer(get_text_data_hotel_t)

In [65]:
pipeline = Pipeline([
    ('selector_t', get_text_t),
    ('tfidf', TfidfVectorizer(max_df=0.75, max_features=None, min_df=5)),
    ('svm', svm.SVC(kernel='rbf', C=1))
])

In [66]:
X = training_set
y = training_set.hotel
pipeline.fit(X,y)

Pipeline(steps=[('selector_t',
                 FunctionTransformer(func=<function get_text_data_hotel_t at 0x7fbb3dba0670>)),
                ('tfidf', TfidfVectorizer(max_df=0.75, min_df=5)),
                ('svm', SVC(C=1))])

In [67]:
filename = '../classifiers/classifier_hotel_t.sav'
joblib.dump(pipeline, filename)

['../classifiers/classifier_hotel_t.sav']