# Classification Experiment: Tweets
---
This notebook includes a series of experiments on using a node's tweets for classification.

Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tweepy
import time

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import joblib

Twitter API Authentication:

In [2]:
# Read the Twitter API credentials from a JSON file kept outside the repository.
twitter_credentials = []
with open('../../../../twitter_credentials.json', 'r') as credentials_file:
    twitter_credentials = json.load(credentials_file)

# Build the OAuth handler from the consumer and access-token pairs in that file.
auth = tweepy.OAuthHandler(
    twitter_credentials['consumer_key'],
    twitter_credentials['consumer_secret'],
)
auth.set_access_token(
    twitter_credentials['access_token_key'],
    twitter_credentials['access_token_secret'],
)

# Shared API client used to fetch tweets.
# NOTE(review): `wait_on_rate_limit_notify` is presumably a tweepy 3.x option - confirm the pinned version.
API = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
    timeout=600,
)

Functions:

In [2]:
# Function For Text Normalization
def clean_text(data):
    """Normalize a pandas Series of raw text.

    Steps, in order: URLs are replaced by a space, every non-word character,
    digit and underscore is replaced by a space, runs of whitespace are
    collapsed to one space, and string values are lower-cased. Non-string
    values are passed through the lower-casing step unchanged.

    NOTE(review): '@mentions' are not removed as a unit; the non-word rule
    only strips the '@' and leaves the handle text in place.

    Parameters
    ----------
    data : pandas.Series
        Raw text, one document per element.

    Returns
    -------
    pandas.Series
        The normalized text, same index as ``data``.
    """
    blank = ' '

    # 1) URLs go first, while their punctuation is still intact.
    without_urls = data.replace([r'http\S+'], blank, regex=True)
    # 2) Non-word characters (punctuation, emojis, ...) plus digits and '_'.
    words_only = without_urls.replace([r'\W', r'[0-9_]'], blank, regex=True)
    # 3) Collapse the whitespace runs left behind by the substitutions above.
    collapsed = words_only.replace(r'\s+', blank, regex=True)

    def _lower(value):
        # Only plain strings are lower-cased; anything else is left as is.
        if type(value) == str:
            return value.lower()
        return value

    return collapsed.apply(_lower)

# spaCy pipelines used for tokenization and lemmatization (Greek and English).
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')

# Union of spaCy's English and Greek stop-word lists.
STOPWORDS = set(spacy.lang.en.STOP_WORDS) | set(spacy.lang.el.STOP_WORDS)

def remove_stopwords(row):
    """Tokenize ``row`` with the Greek spaCy pipeline and drop stop words.

    Returns the remaining tokens as a list of strings, in their original order.
    """
    words = (str(token) for token in nlp_el(row))
    return [word for word in words if word not in STOPWORDS]

def tokenize_lemmatize(row):
    """Return the lemma of every token of ``row`` as produced by the Greek spaCy pipeline."""
    lemmas = []
    for token in nlp_el(row):
        lemmas.append(str(token.lemma_))
    return lemmas

# Function For Support Vector Machine
def classification_svm(X, y, vect):
    """Grid-search a vectorizer + SVM pipeline and return the best CV score.

    Parameters
    ----------
    X : iterable of str
        Text documents, one per sample.
    y : array-like
        Class label of each document.
    vect : {'TF-IDF', 'BoW'}
        Vectorizer placed in front of the classifier.

    Returns
    -------
    float
        ``GridSearchCV.best_score_`` of the search. The best parameter
        combination is printed as a side effect.

    Raises
    ------
    ValueError
        If ``vect`` is neither 'TF-IDF' nor 'BoW'.
    """
    if vect == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    elif vect == 'BoW':
        vectorizer = CountVectorizer()
    else:
        # Previously an unknown value fell through and failed later on an
        # unbound `pipeline`; fail early with a clear message instead.
        raise ValueError(f"vect must be 'TF-IDF' or 'BoW', got {vect!r}")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('svm', svm.SVC())
    ])

    # max_df must be the float 1.0: scikit-learn reads an integer 1 as an
    # absolute count ("keep terms found in at most one document"), which is
    # not the intended 100% and is invalid together with min_df > 1.
    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'svm__C' : [0.1,0.5,1,5,10],
                  'svm__kernel':['linear', 'poly', 'rbf', 'sigmoid']
                  }

    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)

    print(grid.best_params_)
    return grid.best_score_

# Function For Logistic Regression
def classification_lr(X, y, vect):
    """Grid-search a vectorizer + logistic-regression pipeline and return the best CV score.

    Parameters
    ----------
    X : iterable of str
        Text documents, one per sample.
    y : array-like
        Class label of each document.
    vect : {'TF-IDF', 'BoW'}
        Vectorizer placed in front of the classifier.

    Returns
    -------
    float
        ``GridSearchCV.best_score_`` of the search. The best parameter
        combination is printed as a side effect.

    Raises
    ------
    ValueError
        If ``vect`` is neither 'TF-IDF' nor 'BoW'.
    """
    if vect == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    elif vect == 'BoW':
        vectorizer = CountVectorizer()
    else:
        # Previously an unknown value fell through and failed later on an
        # unbound `pipeline`; fail early with a clear message instead.
        raise ValueError(f"vect must be 'TF-IDF' or 'BoW', got {vect!r}")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('lr', LogisticRegression(max_iter=1000))
    ])

    # max_df must be the float 1.0: scikit-learn reads an integer 1 as an
    # absolute count ("keep terms found in at most one document"), which is
    # not the intended 100% and is invalid together with min_df > 1.
    # NOTE(review): with the default solver the 'l1' and 'elasticnet'
    # candidates are expected to fail to fit and be scored as NaN, and newer
    # scikit-learn releases spell 'none' as None - confirm against the
    # installed version (solver='saga' supports every penalty listed here).
    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
                  'lr__C': [0.1, 0.5, 1, 5, 10]
                  }

    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)

    print(grid.best_params_)
    return grid.best_score_

# Function For kNN
def classification_knn(X, y, vect):
    """Grid-search a vectorizer + k-nearest-neighbours pipeline and return the best CV score.

    Parameters
    ----------
    X : iterable of str
        Text documents, one per sample.
    y : array-like
        Class label of each document.
    vect : {'TF-IDF', 'BoW'}
        Vectorizer placed in front of the classifier.

    Returns
    -------
    float
        ``GridSearchCV.best_score_`` of the search. The best parameter
        combination is printed as a side effect.

    Raises
    ------
    ValueError
        If ``vect`` is neither 'TF-IDF' nor 'BoW'.
    """
    if vect == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    elif vect == 'BoW':
        vectorizer = CountVectorizer()
    else:
        # Previously an unknown value fell through and failed later on an
        # unbound `pipeline`; fail early with a clear message instead.
        raise ValueError(f"vect must be 'TF-IDF' or 'BoW', got {vect!r}")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('knn', KNeighborsClassifier())
    ])

    # max_df must be the float 1.0: scikit-learn reads an integer 1 as an
    # absolute count ("keep terms found in at most one document"), which is
    # not the intended 100% and is invalid together with min_df > 1.
    parameters = {'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
                  'vectorizer__min_df': [1, 5, 10, 25],
                  'vectorizer__max_features': [10, 100, 1000, 2000, None],
                  'knn__n_neighbors': [1,2,3,4,5,6,7,8,9,10],
                  'knn__weights': ['uniform', 'distance']
                  }

    grid = GridSearchCV(pipeline, parameters, n_jobs = 4)
    grid.fit(X, y)

    print(grid.best_params_)
    return grid.best_score_


def get_text_data_nd(df):
    """Build the normalized "name + description" text feature.

    The concatenated text is cleaned with ``clean_text``, lemmatized with
    ``tokenize_lemmatize`` and stripped of stop words with
    ``remove_stopwords``; after each step the tokens are re-joined with
    single spaces.

    The input frame is not modified: earlier versions stored the
    intermediate result in a ``textdata`` column of ``df``, silently
    changing the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the ``name`` and ``description`` columns.

    Returns
    -------
    pandas.Series
        One normalized string per row, named ``textdata``.
    """
    text = clean_text(df['name'] + ' ' + df['description'])
    text = text.apply(lambda row: ' '.join(tokenize_lemmatize(row)))
    text = text.apply(lambda row: ' '.join(remove_stopwords(row)))
    # Keep the Series name the previous implementation returned.
    return text.rename('textdata')


# Dataset
---

To begin with, we read our datasets, and fetch some tweets for each node creating 3 new fields:
- recent_tweet
- recent_10_tweets
- recent_100_tweets

In [3]:
# Read Training Set
# NaN cells are replaced by empty strings right after loading; the text columns
# are concatenated with `+` further down.
training_set = (
    pd.read_csv('../../../../datasets/Four-categories/four-categories-training-set.csv')
    .replace(np.nan, '')
)

In [4]:
# Preview the first rows of the loaded training set.
training_set.head()

Unnamed: 0,Username,Profile name,Description,Recent Tweet,Recent 10 tweets,Recent 50 tweets,Recent 100 tweets,Tweets count,Favourites count,Followers count,Following count,Lists count,Created at,Category
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,A hotel’s operation is much more than what gue...,✨2021 Season's Greetings from all of us ✨We w...,✨2021 Season's Greetings from all of us ✨We w...,✨2021 Season's Greetings from all of us ✨We w...,1810,956,2237,1558,113,2009-07-20 08:56:13,Tourism
1,IasonFotilas,Iasonas Fotilas,Βουλευτής ΝΔ Αχαΐας,Τι να κάνουμε για να στηρίξουμε οικονομικά τον...,Απάντησή μου για την επίσκεψη του Προέδρου τη...,Απάντησή μου για την επίσκεψη του Προέδρου τη...,Απάντησή μου για την επίσκεψη του Προέδρου τη...,5570,3251,4667,1512,51,2015-02-26 07:45:34,Politics
2,hellenictourism,Tourism Society,"We promote the Greek Tourism Industry, we brin...","Ultimately, it’s not about who you know ... bu...","Ultimately, it’s not about who you know ... b...","Ultimately, it’s not about who you know ... b...","Ultimately, it’s not about who you know ... b...",140,0,1318,84,28,2009-10-13 07:50:27,Tourism
3,atsipras,Αλέξης Τσίπρας - Alexis Tsipras,Πρόεδρος του ΣΥΡΙΖΑ - @syriza_gr Ι Internation...,Η κατάσταση στη βόρεια Ελλάδα είναι δραματική....,"Πιστεύω βαθιά, ότι η ειλικρίνεια και η αφοσίω...","Πιστεύω βαθιά, ότι η ειλικρίνεια και η αφοσίω...","Πιστεύω βαθιά, ότι η ειλικρίνεια και η αφοσίω...",7692,7,552743,183,1734,2011-07-13 11:08:10,Politics
4,Bistro45Bexhill,Bistro 45,Family Run Bistro on The Marina in Bexhill-on-...,MPs have voted for a fantasy. It’s an indictme...,MPs have voted for a fantasy. It’s an indictm...,MPs have voted for a fantasy. It’s an indictm...,MPs have voted for a fantasy. It’s an indictm...,245,35,753,1067,19,2010-11-09 16:22:45,Foodservice


# Case 1: name + description + tweets
---
In this case, we use a node's name, description and tweets as a single feature to classify the node.

## Text Normalization
We start by creating 3 new fields:
- textdata_1 : name + description + recent_tweet
- textdata_2 : name + description + recent_10_tweets
- textdata_3 : name + description + recent_100_tweets

In [16]:
# Integer ids for the four categories.
# NOTE(review): id 3 is skipped (Education -> 4), and a category missing from
# this mapping would become NaN - confirm both are intended.
codes = {'Tourism':0, 'Foodservice':1, 'Politics':2, 'Education': 4}

# textdata_N = profile name + description + tweets (1, 10 or 100 most recent).
# `assign` works on a new frame, so `training_set` itself is left untouched.
data = (
    training_set
    .assign(
        textdata_1=lambda d: d['Profile name'] + ' ' + d['Description'] + ' ' + d['Recent Tweet'],
        textdata_2=lambda d: d['Profile name'] + ' ' + d['Description'] + ' ' + d['Recent 10 tweets'],
        textdata_3=lambda d: d['Profile name'] + ' ' + d['Description'] + ' ' + d['Recent 100 tweets'],
        Category=lambda d: d['Category'].map(codes),
    )
    .drop(columns=['Profile name', 'Description', 'Recent Tweet', 'Recent 10 tweets', 'Recent 100 tweets'])
)
data.head(3)

Unnamed: 0,Username,Recent 50 tweets,Tweets count,Favourites count,Followers count,Following count,Lists count,Created at,Category,textdata_1,textdata_2,textdata_3
0,aldemar_resorts,✨2021 Season's Greetings from all of us ✨We w...,1810,956,2237,1558,113,2009-07-20 08:56:13,0,Aldemar Resorts Guest satisfaction is our top ...,Aldemar Resorts Guest satisfaction is our top ...,Aldemar Resorts Guest satisfaction is our top ...
1,IasonFotilas,Απάντησή μου για την επίσκεψη του Προέδρου τη...,5570,3251,4667,1512,51,2015-02-26 07:45:34,2,Iasonas Fotilas Βουλευτής ΝΔ Αχαΐας Τι να κάνο...,Iasonas Fotilas Βουλευτής ΝΔ Αχαΐας Απάντησή ...,Iasonas Fotilas Βουλευτής ΝΔ Αχαΐας Απάντησή ...
2,hellenictourism,"Ultimately, it’s not about who you know ... b...",140,0,1318,84,28,2009-10-13 07:50:27,0,Tourism Society We promote the Greek Tourism I...,Tourism Society We promote the Greek Tourism I...,Tourism Society We promote the Greek Tourism I...


Next, we normalize our text by taking the following actions:

- remove URLs
- strip the `@` from mentions (the handle text itself is kept)
- remove anything that isn't a Unicode word character (e.g. emojis, punctuation)
- remove numbers and _
- fix whitespace
- convert to lower case

In [17]:
# Normalize each of the three text fields in place with the same cleaning routine.
for field in ['textdata_1', 'textdata_2', 'textdata_3']:
    data[field] = clean_text(data[field])

## Model Selection

In [18]:
# One (initially empty) results table per classifier / vectorizer pair;
# each experiment below adds one row of scores to the matching table.
svm_tfidf, svm_bow = pd.DataFrame(), pd.DataFrame()
lr_tfidf, lr_bow = pd.DataFrame(), pd.DataFrame()
knn_tfidf, knn_bow = pd.DataFrame(), pd.DataFrame()

### Without NLP

In [19]:
# Names of the three text feature columns that every experiment loops over.
textdatas = ['textdata_1', 'textdata_2', 'textdata_3']

#### SVM

In [20]:
# TF-IDF
# Grid-search the SVM pipeline on each text field and keep the best CV score.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
svm_tfidf = pd.concat(
    [svm_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

svm_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 5, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 10, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 25}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9567,0.9467,0.9533


In [21]:
# BoW
# Grid-search the SVM pipeline on each text field and keep the best CV score.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
svm_bow = pd.concat(
    [svm_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

svm_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 0.1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 10, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9167,0.9267,0.9133


#### Logistic Regression

In [22]:
# TF-IDF
# Grid-search the logistic-regression pipeline on each text field and keep the best CV score.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
lr_tfidf = pd.concat(
    [lr_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

lr_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 25}




Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9667,0.9467,0.95


In [23]:
# BoW
# Grid-search the logistic-regression pipeline on each text field and keep the best CV score.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
lr_bow = pd.concat(
    [lr_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

lr_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.94,0.9333,0.9567


#### kNN

In [24]:
# TF-IDF
# Grid-search the kNN pipeline on each text field and keep the best CV score.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
knn_tfidf = pd.concat(
    [knn_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

knn_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 7, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 9, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.93,0.92,0.9133


In [25]:
# BoW
# Grid-search the kNN pipeline on each text field and keep the best CV score.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
knn_bow = pd.concat(
    [knn_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

knn_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 5, 'knn__weights': 'distance', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 6, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 10}
Best params for textdata_3:




{'knn__n_neighbors': 4, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.6667,0.6433,0.82


### Stop Word Removal

In [5]:
# spaCy pipelines used for tokenization and lemmatization (Greek and English).
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')

# Union of spaCy's English and Greek stop-word lists.
STOPWORDS = set(spacy.lang.en.STOP_WORDS) | set(spacy.lang.el.STOP_WORDS)

def remove_stopwords(row):
    """Tokenize ``row`` with the Greek spaCy pipeline and drop stop words.

    Returns the remaining tokens as a list of strings, in their original order.
    """
    words = (str(token) for token in nlp_el(row))
    return [word for word in words if word not in STOPWORDS]

In [27]:
# Work on a copy so the cleaned text in `data` stays available to later experiments.
df = data.copy()

# Drop stop words from every text field, then rebuild each document as one string.
for field in ['textdata_1', 'textdata_2', 'textdata_3']:
    df[field] = df[field].apply(remove_stopwords).apply(' '.join)

#### SVM

In [28]:
# TF-IDF
# Grid-search the SVM pipeline on each stop-word-filtered text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
svm_tfidf = pd.concat(
    [svm_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

svm_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 0.5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9567,0.9467,0.9533
Stopword Removal,0.9733,0.9567,0.9567


In [29]:
# BoW
# Grid-search the SVM pipeline on each stop-word-filtered text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
svm_bow = pd.concat(
    [svm_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

svm_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 0.1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 0.5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9167,0.9267,0.9133
Stopword Removal,0.9333,0.9267,0.92


#### Logistic Regression

In [30]:
# TF-IDF
# Grid-search the logistic-regression pipeline on each stop-word-filtered text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
lr_tfidf = pd.concat(
    [lr_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

lr_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 5}




Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9667,0.9467,0.95
Stopword Removal,0.9767,0.9633,0.96


In [31]:
# BoW
# Grid-search the logistic-regression pipeline on each stop-word-filtered text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
lr_bow = pd.concat(
    [lr_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

lr_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 10}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.94,0.9333,0.9567
Stopword Removal,0.96,0.9533,0.94


#### kNN

In [32]:
# TF-IDF
# Grid-search the kNN pipeline on each stop-word-filtered text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
knn_tfidf = pd.concat(
    [knn_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

knn_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 7, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 9, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'knn__n_neighbors': 7, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.93,0.92,0.9133
Stopword Removal,0.96,0.9367,0.94


In [33]:
# BoW
# Grid-search the kNN pipeline on each stop-word-filtered text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
knn_bow = pd.concat(
    [knn_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

knn_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 2, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 1, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 10}
Best params for textdata_3:




{'knn__n_neighbors': 5, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.6667,0.6433,0.82
Stopword Removal,0.7567,0.71,0.83


### Lemmatization

In [6]:
def tokenize_lemmatize(row):
    """Return the lemma of every token of ``row`` as produced by the Greek spaCy pipeline."""
    lemmas = []
    for token in nlp_el(row):
        lemmas.append(str(token.lemma_))
    return lemmas

def tokenize_lemmatize_en(row):
    """Return the lemma of every token of ``row`` as produced by the English spaCy pipeline."""
    lemmas = []
    for token in nlp_en(row):
        lemmas.append(str(token.lemma_))
    return lemmas

In [35]:
# Work on a copy so the cleaned text in `data` stays available to later experiments.
df = data.copy()

# Same three steps for every text field; tokens are re-joined into a single
# string after each step because the next step tokenizes the text again.
for field in ['textdata_1', 'textdata_2', 'textdata_3']:
    # 1) Greek lemmatization
    lemmatized_el = df[field].apply(tokenize_lemmatize).apply(' '.join)
    # 2) stop-word removal
    without_stopwords = lemmatized_el.apply(remove_stopwords).apply(' '.join)
    # 3) English lemmatization
    df[field] = without_stopwords.apply(tokenize_lemmatize_en).apply(' '.join)

#### SVM

In [36]:
# TF-IDF
# Grid-search the SVM pipeline on each lemmatized text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
svm_tfidf = pd.concat(
    [svm_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

svm_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 0.5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9567,0.9467,0.9533
Stopword Removal,0.9733,0.9567,0.9567
Lemmatization,0.9733,0.9633,0.9567


In [37]:
# BoW
# Grid-search the SVM pipeline on each lemmatized text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
svm_bow = pd.concat(
    [svm_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

svm_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 0.5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9167,0.9267,0.9133
Stopword Removal,0.9333,0.9267,0.92
Lemmatization,0.9433,0.9333,0.9267


#### Logistic Regression

In [38]:
# TF-IDF
# Grid-search the logistic-regression pipeline on each lemmatized text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
lr_tfidf = pd.concat(
    [lr_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

lr_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}




Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.9667,0.9467,0.95
Stopword Removal,0.9767,0.9633,0.96
Lemmatization,0.9733,0.9633,0.9633


In [39]:
# BoW
# Grid-search the logistic-regression pipeline on each lemmatized text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
lr_bow = pd.concat(
    [lr_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

lr_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 10}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.94,0.9333,0.9567
Stopword Removal,0.96,0.9533,0.94
Lemmatization,0.9633,0.9433,0.95


#### kNN

In [40]:
# TF-IDF
# Grid-search the kNN pipeline on each lemmatized text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
knn_tfidf = pd.concat(
    [knn_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

knn_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 10, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.93,0.92,0.9133
Stopword Removal,0.96,0.9367,0.94
Lemmatization,0.96,0.9367,0.9367


In [41]:
# BoW
# Grid-search the kNN pipeline on each lemmatized text field.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'BoW').round(4)
    print("============================")

# DataFrame.append was removed in pandas 2.0; pd.concat adds the same row.
knn_bow = pd.concat(
    [knn_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])
knn_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 3, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 3, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 5}
Best params for textdata_3:




{'knn__n_neighbors': 4, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 25}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.6667,0.6433,0.82
Stopword Removal,0.7567,0.71,0.83
Lemmatization,0.7667,0.71,0.8367


### Export Model
- TF-IDF outperformed BoW in almost every case.
- Highest Accuracy was achieved using recent 100 tweets
- Support Vector Machine and Logistic Regression have a better performance.
- Logistic Regression Reached the highest achieved accuracy with less NLP steps than Support Vector Machines

The best models we found are Logistic Regression with TF-IDF and Support Vector Machines with TF-IDF.
We chose logistic regression as it required fewer preprocessing steps.

- vectorizer__max_df: 0.75
- vectorizer__max_features: 2000
- vectorizer__min_df: 5
- lr__penalty: none<br>

with the following NLP steps:
- Lemmatization
- Stop Word Removal

In [7]:
def get_text_data_ndt(df):
    """Build the model input text from profile name, description and tweets.

    The three text fields are concatenated, normalized with ``clean_text`` and
    then passed through ``tokenize_lemmatize``, ``tokenize_lemmatize_en`` and
    ``remove_stopwords`` (in that order), re-joining the tokens with single
    spaces after every step.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Profile name', 'Description' and
        'Recent 100 tweets' columns. It is NOT modified.

    Returns
    -------
    pandas.Series
        One processed string per input row, named 'textdata' and sharing
        ``df``'s index.
    """
    # A missing field would turn the whole concatenation into NaN and break the
    # spaCy steps below, so treat missing values as empty strings.
    raw_text = (df['Profile name'].fillna('') + ' '
                + df['Description'].fillna('') + ' '
                + df['Recent 100 tweets'].fillna(''))

    # Work on a standalone Series instead of writing a 'textdata' column into
    # the caller's frame: a transformer should not have side effects on X.
    textdata = clean_text(raw_text)
    for step in (tokenize_lemmatize, tokenize_lemmatize_en, remove_stopwords):
        textdata = textdata.apply(lambda text, step=step: ' '.join(step(text)))
    return textdata.rename('textdata')


# Wrap the function so it can be used as the first step of a Pipeline.
get_text_ndt = FunctionTransformer(get_text_data_ndt)

In [8]:
# Text extraction -> TF-IDF features -> unpenalized logistic regression.
# NOTE(review): the markdown summary above this cell lists max_features=None and
# min_df=1, while this pipeline uses max_features=2000 and min_df=5 - confirm
# which values are intended.
pipeline = Pipeline(steps=[
    ('selector_ndt', get_text_ndt),
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.75, max_features=2000)),
    ('lr', LogisticRegression(penalty='none', max_iter=1000)),
])

In [9]:
# The whole frame is passed as X; the pipeline's first step pulls the text
# columns out of it.
X, y = training_set, training_set['Category']
pipeline.fit(X, y)

Pipeline(steps=[('selector_ndt',
                 FunctionTransformer(func=<function get_text_data_ndt at 0x7f82f73eeca0>)),
                ('tfidf',
                 TfidfVectorizer(max_df=0.75, max_features=2000, min_df=5)),
                ('lr', LogisticRegression(max_iter=1000, penalty='none'))])

In [10]:
# Persist the fitted pipeline.
# NOTE(review): the pipeline embeds get_text_data_ndt through FunctionTransformer;
# pickle stores functions by reference, so the loading code presumably needs the
# same function (and its helpers) importable - confirm.
filename = '../classifiers/classifier_fourcateg_ndt.sav'
joblib.dump(value=pipeline, filename=filename)

['../classifiers/classifier_fourcateg_ndt.sav']

# Case 2: Tweets Only
---
In this case we fetch up to 100 tweets for each node and try to classify the nodes using only their tweets.

## Text Normalization

In [42]:
# Integer label for every category.
# NOTE(review): code 3 is skipped ('Education' maps to 4) - harmless for the
# classifiers, but confirm it is intentional.
codes = {'Tourism':0, 'Foodservice':1, 'Politics':2, 'Education': 4}

# Tweets-only view of the training set: the three tweet windows become
# textdata_1..3, the raw text columns are dropped and Category is encoded.
data = (
    training_set
    .assign(
        textdata_1=training_set['Recent Tweet'],
        textdata_2=training_set['Recent 10 tweets'],
        textdata_3=training_set['Recent 100 tweets'],
    )
    .drop(columns=['Profile name', 'Description', 'Recent Tweet', 'Recent 10 tweets', 'Recent 100 tweets'])
    .assign(Category=lambda frame: frame['Category'].map(codes))
)
data.head(3)

Unnamed: 0,Username,Recent 50 tweets,Tweets count,Favourites count,Followers count,Following count,Lists count,Created at,Category,textdata_1,textdata_2,textdata_3
0,aldemar_resorts,✨2021 Season's Greetings from all of us ✨We w...,1810,956,2237,1558,113,2009-07-20 08:56:13,0,A hotel’s operation is much more than what gue...,✨2021 Season's Greetings from all of us ✨We w...,✨2021 Season's Greetings from all of us ✨We w...
1,IasonFotilas,Απάντησή μου για την επίσκεψη του Προέδρου τη...,5570,3251,4667,1512,51,2015-02-26 07:45:34,2,Τι να κάνουμε για να στηρίξουμε οικονομικά τον...,Απάντησή μου για την επίσκεψη του Προέδρου τη...,Απάντησή μου για την επίσκεψη του Προέδρου τη...
2,hellenictourism,"Ultimately, it’s not about who you know ... b...",140,0,1318,84,28,2009-10-13 07:50:27,0,"Ultimately, it’s not about who you know ... bu...","Ultimately, it’s not about who you know ... b...","Ultimately, it’s not about who you know ... b..."


We normalize our text by taking the following actions:

- remove URLs
- remove anything that isn't a word character (e.g. emojis, punctuation)
- remove numbers and _
- fix whitespace
- convert to lower case

In [43]:
# Normalize every text variant in place with the shared clean_text helper.
for column in ('textdata_1', 'textdata_2', 'textdata_3'):
    data[column] = clean_text(data[column])

## Model Selection

In [44]:
# Column names of the three text variants evaluated in every experiment below.
textdatas = [f'textdata_{i}' for i in (1, 2, 3)]

In [45]:
# Empty result tables, one per (classifier, vectorizer) pair; each experiment
# cell below adds one row to the matching table.
svm_tfidf, svm_bow = pd.DataFrame(), pd.DataFrame()
lr_tfidf, lr_bow = pd.DataFrame(), pd.DataFrame()
knn_tfidf, knn_bow = pd.DataFrame(), pd.DataFrame()

#### SVM

In [46]:
# TF-IDF
# Run classification_svm on each cleaned text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
svm_tfidf = pd.concat([
    svm_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T,
])

svm_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 1, 'svm__kernel': 'poly', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 10, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.53,0.8833,0.93


In [47]:
# BoW
# Run classification_svm on each cleaned text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
svm_bow = pd.concat([
    svm_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T,
])

svm_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 0.1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.4833,0.8333,0.91


#### Logistic Regression

In [48]:
# TF-IDF
# Run classification_lr on each cleaned text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
lr_tfidf = pd.concat([
    lr_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T,
])

lr_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}




Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.5333,0.88,0.93


In [49]:
# BoW
# Run classification_lr on each cleaned text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
lr_bow = pd.concat([
    lr_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T,
])

lr_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.5033,0.8467,0.9533


#### kNN

In [50]:
# TF-IDF
# Run classification_knn on each cleaned text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
knn_tfidf = pd.concat([
    knn_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T,
])

knn_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 9, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 10, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'knn__n_neighbors': 9, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.4633,0.74,0.89


In [51]:
# BoW
# Run classification_knn on each cleaned text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
knn_bow = pd.concat([
    knn_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T,
])

knn_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 7, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 10, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}
Best params for textdata_3:




{'knn__n_neighbors': 4, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 25}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.3933,0.52,0.8133


### Stop Word Removal

In [52]:
# Greek and English spaCy pipelines. nlp_el is used by remove_stopwords and
# tokenize_lemmatize in this section; nlp_en is presumably used by
# tokenize_lemmatize_en - TODO confirm.
# NOTE(review): the same three statements also appear in the notebook's
# "Functions" cell, so the models are loaded twice when both cells run.
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
# Union of spaCy's English and Greek stop-word lists.
STOPWORDS = set(list(spacy.lang.en.STOP_WORDS) + list(spacy.lang.el.STOP_WORDS))

def remove_stopwords(row):
    """Tokenize ``row`` and drop every token found in ``STOPWORDS``.

    Parameters
    ----------
    row : str
        Text to filter.

    Returns
    -------
    list of str
        The remaining token texts, in their original order.
    """
    # Only the token texts are needed, so run just the tokenizer (make_doc)
    # instead of the full pipeline; the tokens are the same and the remaining
    # pipeline components are skipped.
    tokens = [token.text for token in nlp_el.make_doc(row)]
    return [w for w in tokens if w not in STOPWORDS]

In [53]:
# Start from the cleaned text and strip stop words from each variant; the
# surviving tokens are re-joined into a single space-separated string.
df = data.copy()

for column in ('textdata_1', 'textdata_2', 'textdata_3'):
    df[column] = df[column].apply(lambda text: ' '.join(remove_stopwords(text)))

#### SVM

In [54]:
# TF-IDF
# Run classification_svm on each stop-word-filtered text variant and keep its
# result rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
svm_tfidf = pd.concat([
    svm_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T,
])

svm_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 0.5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.53,0.8833,0.93
Stopword Removal,0.5233,0.89,0.9467


In [55]:
# BoW
# Run classification_svm on each stop-word-filtered text variant and keep its
# result rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
svm_bow = pd.concat([
    svm_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T,
])

svm_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 0.5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 0.5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.4833,0.8333,0.91
Stopword Removal,0.4767,0.86,0.9167


#### Logistic Regression

In [56]:
# TF-IDF
# Run classification_lr on each stop-word-filtered text variant and keep its
# result rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
lr_tfidf = pd.concat([
    lr_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T,
])

lr_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 10, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.5333,0.88,0.93
Stopword Removal,0.53,0.89,0.9467


In [57]:
# BoW
# Run classification_lr on each stop-word-filtered text variant and keep its
# result rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
lr_bow = pd.concat([
    lr_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T,
])

lr_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.5033,0.8467,0.9533
Stopword Removal,0.5,0.86,0.9367


#### kNN

In [58]:
# TF-IDF
# Run classification_knn on each stop-word-filtered text variant and keep its
# result rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
knn_tfidf = pd.concat([
    knn_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T,
])

knn_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 8, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 10, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'knn__n_neighbors': 9, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': None, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.4633,0.74,0.89
Stopword Removal,0.4633,0.76,0.9233


In [59]:
# BoW
# Run classification_knn on each stop-word-filtered text variant and keep its
# result rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
knn_bow = pd.concat([
    knn_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T,
])

knn_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 1, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 5}
Best params for textdata_2:




{'knn__n_neighbors': 3, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 10}
Best params for textdata_3:




{'knn__n_neighbors': 7, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.3933,0.52,0.8133
Stopword Removal,0.35,0.52,0.82


### Lemmatization

In [60]:
def tokenize_lemmatize(row):
    """Run ``row`` through the Greek spaCy pipeline and return each token's lemma.

    Returns a list of strings, one lemma per token, in token order.
    """
    lemmas = []
    for token in nlp_el(row):
        lemmas.append(str(token.lemma_))
    return lemmas

In [61]:
# Start again from the cleaned text. For every variant apply, in this order:
# Greek lemmatization, stop-word removal, English lemmatization. After each
# step the tokens are re-joined with spaces, so the next step re-tokenizes
# the joined string.
df = data.copy()

for column in ('textdata_1', 'textdata_2', 'textdata_3'):
    for step in (tokenize_lemmatize, remove_stopwords, tokenize_lemmatize_en):
        df[column] = df[column].apply(lambda text: ' '.join(step(text)))

#### SVM

In [62]:
# TF-IDF
# Run classification_svm on each lemmatized text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
svm_tfidf = pd.concat([
    svm_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T,
])

svm_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 0.5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.53,0.8833,0.93
Stopword Removal,0.5233,0.89,0.9467
Lemmatization,0.5767,0.8967,0.9467


In [63]:
# BoW
# Run classification_svm on each lemmatized text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
svm_bow = pd.concat([
    svm_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T,
])

svm_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'svm__C': 0.1, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.4833,0.8333,0.91
Stopword Removal,0.4767,0.86,0.9167
Lemmatization,0.5067,0.87,0.92


#### Logistic Regression

In [64]:
# TF-IDF
# Run classification_lr on each lemmatized text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
lr_tfidf = pd.concat([
    lr_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T,
])

lr_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 0.5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.75, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.5333,0.88,0.93
Stopword Removal,0.53,0.89,0.9467
Lemmatization,0.5867,0.8933,0.95


In [65]:
# BoW
# Run classification_lr on each lemmatized text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
lr_bow = pd.concat([
    lr_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T,
])

lr_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'lr__C': 5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'lr__C': 0.5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': None, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': None, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.5033,0.8467,0.9533
Stopword Removal,0.5,0.86,0.9367
Lemmatization,0.5467,0.8833,0.9467


#### kNN

In [66]:
# TF-IDF
# Run classification_knn on each lemmatized text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
knn_tfidf = pd.concat([
    knn_tfidf,
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T,
])

knn_tfidf

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 10, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}
Best params for textdata_3:




{'knn__n_neighbors': 9, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 5}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.4633,0.74,0.89
Stopword Removal,0.4633,0.76,0.9233
Lemmatization,0.49,0.78,0.9167


In [67]:
# BoW
# Run classification_knn on each lemmatized text variant and keep its result
# rounded to 4 decimals.
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'BoW').round(4)
    print("============================")

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
knn_bow = pd.concat([
    knn_bow,
    pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T,
])
knn_bow

  0%|          | 0/3 [00:00<?, ?it/s]

Best params for textdata_1:




{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}
Best params for textdata_2:




{'knn__n_neighbors': 4, 'knn__weights': 'distance', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 5}
Best params for textdata_3:




{'knn__n_neighbors': 5, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.5, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}


Unnamed: 0,textdata_1,textdata_2,textdata_3
Without NLP,0.3933,0.52,0.8133
Stopword Removal,0.35,0.52,0.82
Lemmatization,0.35,0.5533,0.8333


### Export Model
- TF-IDF outperformed BoW in every case except Logistic Regression, where the two performed similarly
- Logistic Regression had the best performance, and kNN the worst

The best model we found is: Logistic Regression-TF-IDF
- vectorizer__max_df: 0.75
- vectorizer__max_features: 2000
- vectorizer__min_df: 5
- lr__C: 1
- lr__penalty: l2<br>

with the following NLP steps:
- Lemmatization
- Stop Word Removal


In [11]:
def get_text_data_t(df):
    """Build the model input text from a node's recent tweets only.

    The 'Recent 100 tweets' column is normalized with ``clean_text`` and then
    passed through ``tokenize_lemmatize``, ``tokenize_lemmatize_en`` and
    ``remove_stopwords`` (in that order), re-joining the tokens with single
    spaces after every step.

    NOTE(review): the "Lemmatization" experiment cells apply stop-word removal
    *between* the two lemmatizers, whereas this function applies it last -
    confirm the difference is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'Recent 100 tweets' column. It is NOT modified.

    Returns
    -------
    pandas.Series
        One processed string per input row, named 'textdata' and sharing
        ``df``'s index.
    """
    # Missing tweets would stay NaN through clean_text and break the spaCy
    # steps below, so treat them as empty strings.
    # Work on a standalone Series instead of writing a 'textdata' column into
    # the caller's frame: a transformer should not have side effects on X.
    textdata = clean_text(df['Recent 100 tweets'].fillna(''))
    for step in (tokenize_lemmatize, tokenize_lemmatize_en, remove_stopwords):
        textdata = textdata.apply(lambda text, step=step: ' '.join(step(text)))
    return textdata.rename('textdata')


# Wrap the function so it can be used as the first step of a Pipeline.
get_text_t = FunctionTransformer(get_text_data_t)

In [12]:
# Text extraction -> TF-IDF features -> L2-regularized logistic regression.
pipeline = Pipeline(steps=[
    ('selector_t', get_text_t),
    ('tfidf', TfidfVectorizer(min_df=5, max_df=0.75, max_features=2000)),
    ('lr', LogisticRegression(C=1, penalty='l2', max_iter=1000)),
])

In [14]:
# The whole frame is passed as X; the pipeline's first step pulls the tweet
# text out of it.
X, y = training_set, training_set['Category']
pipeline.fit(X, y)

Pipeline(steps=[('selector_t',
                 FunctionTransformer(func=<function get_text_data_t at 0x7f82f71ace50>)),
                ('tfidf',
                 TfidfVectorizer(max_df=0.75, max_features=2000, min_df=5)),
                ('lr', LogisticRegression(C=1, max_iter=1000))])

In [15]:
# Persist the fitted pipeline.
# NOTE(review): the pipeline embeds get_text_data_t through FunctionTransformer;
# pickle stores functions by reference, so the loading code presumably needs the
# same function (and its helpers) importable - confirm.
filename = '../classifiers/classifier_fourcateg_t.sav'
joblib.dump(value=pipeline, filename=filename)

['../classifiers/classifier_fourcateg_t.sav']