# Classification Experiment: Name + Description
---
This notebook includes a series of experiments on using a node's name and description for classification.

Libraries:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import json
import tweepy
import time

from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import joblib

Twitter API Authentication:

In [2]:
# Load the Twitter API credentials from the JSON file four directories up.
twitter_credentials = []
with open('../../../../twitter_credentials.json', 'r') as f:
    twitter_credentials = json.load(f)

# Build an OAuth handler from the consumer key pair, attach the access token
# pair, and create the tweepy client from it.
auth = tweepy.OAuthHandler(
    twitter_credentials['consumer_key'],
    twitter_credentials['consumer_secret'],
)
auth.set_access_token(
    twitter_credentials['access_token_key'],
    twitter_credentials['access_token_secret'],
)
API = tweepy.API(auth)

Functions:

In [3]:
# Function For Text Normalization
def clean_text(data):
    """Normalize a pandas Series of text.

    Each of the following is replaced by a single space, in this order:
    URLs (``http`` followed by non-whitespace), non-word characters,
    digits and underscores, and finally any run of whitespace. String
    values are then lower-cased; non-string values are passed through
    unchanged. Leading/trailing spaces are not stripped.
    """
    space = ' '
    # Order matters: URLs must go before punctuation is stripped, and the
    # whitespace collapse must run last to tidy up the inserted spaces.
    patterns = (
        r'http\S+',   # URLs
        r'\W',        # anything that is not a (Unicode) word character
        r'[0-9_]',    # digits and underscores
        r'\s+',       # runs of whitespace
    )

    cleaned = data
    for pattern in patterns:
        cleaned = cleaned.replace(pattern, space, regex=True)

    return cleaned.apply(lambda value: value.lower() if type(value) == str else value)

# Function For Support Vector Machine
def classification_svm(X, y, vect):
    """Grid-search a vectorizer + SVM pipeline and return its best CV score.

    Parameters
    ----------
    X : sequence of str
        Raw text documents handed to the vectorizer.
    y : array-like
        Class label of each document.
    vect : {'TF-IDF', 'BoW'}
        Text representation: ``TfidfVectorizer`` or ``CountVectorizer``.

    Returns
    -------
    float
        ``best_score_`` of the fitted ``GridSearchCV``. The best parameter
        combination is printed as a side effect.

    Raises
    ------
    ValueError
        If ``vect`` is neither 'TF-IDF' nor 'BoW'.
    """
    if vect == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    elif vect == 'BoW':
        vectorizer = CountVectorizer()
    else:
        # Previously an unknown value left `pipeline` unbound and failed later
        # with an unrelated error.
        raise ValueError(f"vect must be 'TF-IDF' or 'BoW', got {vect!r}")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('svm', svm.SVC())
    ])

    parameters = {
        # max_df: floats are proportions of documents, ints are absolute
        # counts. The last value must be the float 1.0 ("no upper limit");
        # the int 1 would mean "keep only terms found in a single document".
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__min_df': [1, 5, 10, 25],
        'vectorizer__max_features': [10, 100, 1000, 2000, None],
        'svm__C': [0.1, 0.5, 1, 5, 10],
        'svm__kernel': ['linear', 'poly', 'rbf', 'sigmoid']
    }

    grid = GridSearchCV(pipeline, parameters, n_jobs=4)
    grid.fit(X, y)

    print(grid.best_params_)
    return grid.best_score_

# Function For Logistic Regression
def classification_lr(X, y, vect):
    """Grid-search a vectorizer + Logistic Regression pipeline and return its best CV score.

    Parameters
    ----------
    X : sequence of str
        Raw text documents handed to the vectorizer.
    y : array-like
        Class label of each document.
    vect : {'TF-IDF', 'BoW'}
        Text representation: ``TfidfVectorizer`` or ``CountVectorizer``.

    Returns
    -------
    float
        ``best_score_`` of the fitted ``GridSearchCV``. The best parameter
        combination is printed as a side effect.

    Raises
    ------
    ValueError
        If ``vect`` is neither 'TF-IDF' nor 'BoW'.
    """
    if vect == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    elif vect == 'BoW':
        vectorizer = CountVectorizer()
    else:
        # Previously an unknown value left `pipeline` unbound and failed later
        # with an unrelated error.
        raise ValueError(f"vect must be 'TF-IDF' or 'BoW', got {vect!r}")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('lr', LogisticRegression(max_iter=1000))
    ])

    parameters = {
        # max_df: floats are proportions of documents, ints are absolute
        # counts. The last value must be the float 1.0 ("no upper limit");
        # the int 1 would mean "keep only terms found in a single document".
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__min_df': [1, 5, 10, 25],
        'vectorizer__max_features': [10, 100, 1000, 2000, None],
        # NOTE(review): the solver is left at its default here; if that is
        # lbfgs, the 'l1' and 'elasticnet' candidates cannot be fitted and are
        # only scored as failures by the grid search. Newer scikit-learn
        # releases also replace the string 'none' with None. Confirm against
        # the installed version before trimming this list.
        'lr__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'lr__C': [0.1, 0.5, 1, 5, 10]
    }

    grid = GridSearchCV(pipeline, parameters, n_jobs=4)
    grid.fit(X, y)

    print(grid.best_params_)
    return grid.best_score_

# Function For kNN
def classification_knn(X, y, vect):
    """Grid-search a vectorizer + k-nearest-neighbours pipeline and return its best CV score.

    Parameters
    ----------
    X : sequence of str
        Raw text documents handed to the vectorizer.
    y : array-like
        Class label of each document.
    vect : {'TF-IDF', 'BoW'}
        Text representation: ``TfidfVectorizer`` or ``CountVectorizer``.

    Returns
    -------
    float
        ``best_score_`` of the fitted ``GridSearchCV``. The best parameter
        combination is printed as a side effect.

    Raises
    ------
    ValueError
        If ``vect`` is neither 'TF-IDF' nor 'BoW'.
    """
    if vect == 'TF-IDF':
        vectorizer = TfidfVectorizer()
    elif vect == 'BoW':
        vectorizer = CountVectorizer()
    else:
        # Previously an unknown value left `pipeline` unbound and failed later
        # with an unrelated error.
        raise ValueError(f"vect must be 'TF-IDF' or 'BoW', got {vect!r}")

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('knn', KNeighborsClassifier())
    ])

    parameters = {
        # max_df: floats are proportions of documents, ints are absolute
        # counts. The last value must be the float 1.0 ("no upper limit");
        # the int 1 would mean "keep only terms found in a single document".
        'vectorizer__max_df': [0.25, 0.5, 0.75, 1.0],
        'vectorizer__min_df': [1, 5, 10, 25],
        'vectorizer__max_features': [10, 100, 1000, 2000, None],
        'knn__n_neighbors': list(range(1, 11)),
        'knn__weights': ['uniform', 'distance']
    }

    grid = GridSearchCV(pipeline, parameters, n_jobs=4)
    grid.fit(X, y)

    print(grid.best_params_)
    return grid.best_score_


# Dataset
---

To begin with, we read our datasets, and fetch some tweets for each node creating 3 new fields:
- recent_tweet
- recent_10_tweets
- recent_100_tweets

In [11]:
# Read Training Set
# Load only the columns used in this notebook and turn missing values into
# empty strings so name and description can be concatenated safely.
training_set = pd.read_csv(
    '../../../../datasets/Four-categories/four-categories-training-set.csv',
    usecols=['Username', 'Profile name', 'Description', 'Category'],
).replace(np.nan, '')

In [12]:
# Preview the first rows of the training set.
training_set.head()

Unnamed: 0,Username,Profile name,Description,Category
0,aldemar_resorts,Aldemar Resorts,Guest satisfaction is our top priority! *Luxur...,Tourism
1,IasonFotilas,Iasonas Fotilas,Βουλευτής ΝΔ Αχαΐας,Politics
2,hellenictourism,Tourism Society,"We promote the Greek Tourism Industry, we brin...",Tourism
3,atsipras,Αλέξης Τσίπρας - Alexis Tsipras,Πρόεδρος του ΣΥΡΙΖΑ - @syriza_gr Ι Internation...,Politics
4,Bistro45Bexhill,Bistro 45,Family Run Bistro on The Marina in Bexhill-on-...,Foodservice


# Case 1: name + description 
---
In this case, we use a node's name and description as a single feature to classify the node.

## Text Normalization
We start by creating a new field:
- textdata_1 : name + description

In [10]:
# Merge name and description into one text column and drop the two source
# columns; training_set itself is left untouched.
data = (
    training_set
    .assign(textdata_1=lambda frame: frame['Profile name'] + ' ' + frame['Description'])
    .drop(['Profile name', 'Description'], axis=1)
)
data.head(3)

Unnamed: 0,Username,Category,textdata_1
0,aldemar_resorts,Tourism,Aldemar Resorts Guest satisfaction is our top ...
1,IasonFotilas,Politics,Iasonas Fotilas Βουλευτής ΝΔ Αχαΐας
2,hellenictourism,Tourism,Tourism Society We promote the Greek Tourism I...


Next, we normalize our text by taking the following actions:

- remove URLs
- strip the `@` from Mentions (the handle text itself is kept)
- remove anything that isn't a Unicode word character (e.g. emojis, punctuation)
- remove numbers and _
- fix whitespace
- convert to lower case

In [11]:
# Replace the raw text column with its normalized version.
data = data.assign(textdata_1=clean_text(data['textdata_1']))

In [13]:
# Integer class label for each category name. The codes only need to be
# distinct (3 is unused).
codes = {'Tourism':0, 'Foodservice':1, 'Politics':2, 'Education': 4}
# Map from the untouched names in training_set (same index as data) so that
# re-running this cell is idempotent; mapping data['Category'] onto itself
# would turn the already-encoded integers into NaN on a second run.
data['Category'] = training_set['Category'].map(codes)
# Series.map yields NaN for any name missing from `codes`; fail here instead
# of inside the grid search.
assert data['Category'].notna().all(), 'Category contains labels missing from `codes`'

## Model Selection

In [15]:
# One initially empty results table per classifier / vectorizer pair.
svm_tfidf, svm_bow = pd.DataFrame(), pd.DataFrame()
lr_tfidf, lr_bow = pd.DataFrame(), pd.DataFrame()
knn_tfidf, knn_bow = pd.DataFrame(), pd.DataFrame()

### Without NLP

In [16]:
# Text columns to evaluate; the experiment cells loop over this list.
textdatas = ['textdata_1']

#### SVM

In [17]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Without NLP' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
svm_tfidf = pd.concat(
    [svm_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9533


In [18]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_svm(X, data['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Without NLP' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
svm_bow = pd.concat(
    [svm_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

svm_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 0.5, 'svm__kernel': 'linear', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9233


#### Logistic Regression

In [19]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Without NLP' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
lr_tfidf = pd.concat(
    [lr_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.96


In [20]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_lr(X, data['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Without NLP' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
lr_bow = pd.concat(
    [lr_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

lr_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9467


#### kNN

In [22]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Without NLP' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_tfidf = pd.concat(
    [knn_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 10, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9433


In [21]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = data[textdata]
    results[textdata] = classification_knn(X, data['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Without NLP' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_bow = pd.concat(
    [knn_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Without NLP']).T])

knn_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 3, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.7967


### Stop Word Removal

In [5]:
# spaCy pipelines: Greek (medium) and English (small).
nlp_el = spacy.load('el_core_news_md')
nlp_en = spacy.load('en_core_web_sm')
# Union of spaCy's English and Greek stop-word lists.
STOPWORDS = set(spacy.lang.en.STOP_WORDS) | set(spacy.lang.el.STOP_WORDS)

def remove_stopwords(row):
    """Tokenize ``row`` with ``nlp_el`` and return the tokens that are not in ``STOPWORDS``.

    The result is a list of token strings in their original order.
    """
    kept = []
    for token in nlp_el(row):
        word = str(token)
        if word not in STOPWORDS:
            kept.append(word)
    return kept

In [24]:
# Stop-word variant: start from the normalized text, drop stop words and
# join the remaining tokens back into one space-separated string.
df = data.copy()

df['textdata_1'] = df['textdata_1'].apply(
    lambda text: ' '.join(remove_stopwords(text))
)

#### SVM

In [25]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Stopword Removal' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
svm_tfidf = pd.concat(
    [svm_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9533
Stopword Removal,0.9733


In [26]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Stopword Removal' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
svm_bow = pd.concat(
    [svm_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

svm_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 5, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9233
Stopword Removal,0.93


#### Logistic Regression

In [27]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Stopword Removal' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
lr_tfidf = pd.concat(
    [lr_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.5, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.96
Stopword Removal,0.9733


In [28]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Stopword Removal' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
lr_bow = pd.concat(
    [lr_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

lr_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9467
Stopword Removal,0.9567


#### kNN

In [29]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Stopword Removal' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_tfidf = pd.concat(
    [knn_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 9, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9433
Stopword Removal,0.9667


In [30]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Stopword Removal' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_bow = pd.concat(
    [knn_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Stopword Removal']).T])

knn_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 4, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.7967
Stopword Removal,0.8333


### Lemmatization

In [4]:
def tokenize_lemmatize(row):
    """Run ``row`` through ``nlp_el`` and return each token's lemma as a list of strings."""
    lemmas = []
    for token in nlp_el(row):
        lemmas.append(str(token.lemma_))
    return lemmas

def tokenize_lemmatize_en(row):
    """Run ``row`` through ``nlp_en`` and return each token's lemma as a list of strings."""
    lemmas = []
    for token in nlp_en(row):
        lemmas.append(str(token.lemma_))
    return lemmas

In [32]:
# Lemmatization variant: lemmatize with the Greek helper, then the English
# helper, then drop stop words. After every step the token list is joined
# back into a space-separated string, which is the input of the next step.
df = data.copy()

for token_step in (tokenize_lemmatize, tokenize_lemmatize_en, remove_stopwords):
    df['textdata_1'] = df['textdata_1'].apply(lambda row: ' '.join(token_step(row)))

#### SVM

In [33]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Lemmatization' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
svm_tfidf = pd.concat(
    [svm_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

svm_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'rbf', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9533
Stopword Removal,0.9733
Lemmatization,0.9833


In [35]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_svm(X, df['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Lemmatization' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
svm_bow = pd.concat(
    [svm_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

svm_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'svm__C': 1, 'svm__kernel': 'sigmoid', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9233
Stopword Removal,0.93
Lemmatization,0.9433


#### Logistic Regression

In [34]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Lemmatization' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
lr_tfidf = pd.concat(
    [lr_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

lr_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.1, 'lr__penalty': 'l2', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.96
Stopword Removal,0.9733
Lemmatization,0.9867


In [36]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_lr(X, df['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Lemmatization' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
lr_bow = pd.concat(
    [lr_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

lr_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'lr__C': 0.1, 'lr__penalty': 'none', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 2000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9467
Stopword Removal,0.9567
Lemmatization,0.9633


#### kNN

In [37]:
# TF-IDF
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'TF-IDF').round(4)
    print("============================")

# Add one row labelled 'Lemmatization' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_tfidf = pd.concat(
    [knn_tfidf, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

knn_tfidf

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 7, 'knn__weights': 'uniform', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 1000, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.9433
Stopword Removal,0.9667
Lemmatization,0.96


In [38]:
# BoW
results = {}
for textdata in tqdm(textdatas):
    print(f'Best params for {textdata}:')
    X = df[textdata]
    results[textdata] = classification_knn(X, df['Category'], 'BoW').round(4)
    print("============================")

# Add one row labelled 'Lemmatization' (one column per text field).
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
knn_bow = pd.concat(
    [knn_bow, pd.DataFrame.from_dict(results, orient='index', columns=['Lemmatization']).T])

knn_bow

  0%|          | 0/1 [00:00<?, ?it/s]

Best params for textdata_1:
{'knn__n_neighbors': 3, 'knn__weights': 'distance', 'vectorizer__max_df': 0.25, 'vectorizer__max_features': 100, 'vectorizer__min_df': 1}




Unnamed: 0,textdata_1
Without NLP,0.7967
Stopword Removal,0.8333
Lemmatization,0.8467


## Export Model


The best model we found is: LR-TF-IDF (Logistic Regression, best cross-validation score: 0.9867)
- vectorizer__max_df: 0.25
- vectorizer__max_features: 2000
- vectorizer__min_df: 1
- lr__C: 0.1
- lr__penalty: l2<br>

with the following NLP steps:
- Lemmatization
- Stop Word Removal


In [7]:
def get_text_data_fourcateg_nd(df):
    """Build the normalized name + description text for every row of ``df``.

    The 'Profile name' and 'Description' columns are joined with a space,
    normalized with ``clean_text``, lemmatized with the Greek and then the
    English helper, and finally stripped of stop words.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Profile name' and 'Description'.

    Returns
    -------
    pandas.Series
        One processed string per row, named 'textdata'. ``df`` itself is not
        modified (the previous version added a 'textdata' column to the
        caller's frame as a side effect of fit/predict).
    """
    # Missing names/descriptions become '' so the concatenation stays a
    # string; NaN + str would give NaN, which the tokenizers cannot process.
    raw_text = df['Profile name'].fillna('') + ' ' + df['Description'].fillna('')
    textdata = clean_text(raw_text)

    # Each helper returns a token list; join it back into a string so the
    # next helper receives plain text again.
    for token_step in (tokenize_lemmatize, tokenize_lemmatize_en, remove_stopwords):
        textdata = textdata.apply(lambda row: ' '.join(token_step(row)))

    return textdata.rename('textdata')


# Wrap the function so it can serve as the first step of a scikit-learn Pipeline.
get_text = FunctionTransformer(get_text_data_fourcateg_nd)

In [9]:
# Export pipeline: text extraction -> TF-IDF -> Logistic Regression, with
# the hyper-parameters fixed explicitly.
pipeline = Pipeline(steps=[
    ('selector', get_text),
    ('tfidf', TfidfVectorizer(min_df=1, max_df=0.25, max_features=2000)),
    ('lr', LogisticRegression(C=0.1, penalty='l2', max_iter=1000)),
])

In [13]:
# Fit on the full training frame; the pipeline's first step extracts the
# text itself, and the targets are the original category names.
X, y = training_set, training_set['Category']
pipeline.fit(X, y)

Pipeline(steps=[('selector',
                 FunctionTransformer(func=<function get_text_data_fourcateg_nd at 0x7fef42382d30>)),
                ('tfidf', TfidfVectorizer(max_df=0.25, max_features=2000)),
                ('lr', LogisticRegression(C=0.1, max_iter=1000))])

In [14]:
# Persist the fitted pipeline to disk with joblib.
# NOTE(review): the pipeline holds a reference to get_text_data_fourcateg_nd,
# which is defined in this notebook; presumably the loading process must make
# that function (and the helpers it calls) available under the same module
# name before joblib.load succeeds. Confirm before deploying.
filename = 'classifier_fourcateg_nd.sav'
joblib.dump(pipeline, filename)

['classifier_fourcateg_nd.sav']