In [1]:
import csv
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.svm import SVC, LinearSVC # Last one performs better for high number of samples
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter

#### read in data

In [2]:
# Tab-separated input file, resolved relative to the notebook's working directory.
path = 'hatespeech_text_label_vote_RESTRICTED_100K.csv'

In [3]:
# Parallel lists: tweet[i] is the text and label[i] its annotation.
tweet = []
label = []
# newline='' is what the csv module asks for so that it can handle line
# endings (including newlines embedded in quoted fields) itself. The encoding
# is pinned so the read does not depend on the platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the data source.
with open(path, newline='', encoding='utf-8') as fi:
    data = csv.reader(fi, delimiter='\t')
    for row in data:
        # Column 0 holds the tweet text, column 1 its label.
        tweet.append(row[0])
        label.append(row[1])

In [4]:
# Sanity check: the two lists must have the same length.
len(tweet) == len(label)

True

#### count occurrences of labels

In [5]:
# Number of tweets per label.
Counter(label)

Counter({'abusive': 27150, 'hateful': 4965, 'normal': 53851, 'spam': 14030})

In [6]:
# Hold out 20% of the tweets for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the label counts above are imbalanced and the split is not stratified;
# passing stratify=label may be worth considering, but it would change every reported number.
X_train, X_test, y_train, y_test = train_test_split(tweet, label, test_size=0.2, random_state=2020)

#### some functions

In [6]:
def get_reports(results, report_name, y_true, y_test_predicted):
    """Append the metrics of one classifier run to ``results``.

    The text returned by ``metrics.classification_report`` is parsed line by
    line; every line that carries a label plus precision, recall, f1 and
    support becomes one record. The overall accuracy is repeated on each
    record.

    Parameters
    ----------
    results : pandas.DataFrame
        Rows collected so far (may be empty). It is not modified in place.
    report_name : str
        Value stored in the ``classifier`` column of every new row.
    y_true : sequence
        Gold labels of the evaluated samples.
    y_test_predicted : sequence
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    pandas.DataFrame
        ``results`` followed by the new rows, concatenated with ``sort=True``.
    """
    acc = metrics.accuracy_score(y_true, y_test_predicted, normalize=True, sample_weight=None)
    # Score against the labels that were passed in, not the notebook-level
    # ``y_test`` variable, so the function works for any evaluation set.
    report = metrics.classification_report(y_true, y_test_predicted)

    report_data = []
    # The first two lines of the report are the column header and a blank line.
    for line in report.split('\n')[2:]:
        row_data = [item for item in line.split('    ') if len(item) > 1]
        # A usable row has five fields: label, precision, recall, f1, support.
        # Blank separators and summary lines with fewer columns (for example a
        # lone "accuracy" line) are skipped instead of raising IndexError.
        if len(row_data) < 5:
            continue
        report_data.append({
            'classifier': report_name,
            'class': row_data[0].strip(),
            'precision': float(row_data[1]),
            'recall': float(row_data[2]),
            'f1_score': float(row_data[3]),
            'support': float(row_data[4]),
            'accuracy': float(acc),
        })

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    return pd.concat([results, pd.DataFrame(report_data)], sort=True)

def run_pipeline(pipeline, report_name, results,
                 y_train, X_train,
                 y_test, X_test):
    """Fit one pipeline, predict the test set and record its metrics.

    ``pipeline`` is the list of ``(name, step)`` pairs handed to ``Pipeline``.
    The fitted pipeline predicts ``X_test``; the predictions are passed with
    ``y_test`` to ``get_reports``, whose return value (``results`` extended by
    the rows labelled ``report_name``) is returned unchanged.
    """
    fitted = Pipeline(pipeline).fit(X_train, y_train)
    predictions = fitted.predict(X_test)
    return get_reports(results, report_name, y_test, predictions)

In [7]:
# Accumulator for the metric rows (and error rows) of every classifier run.
results = pd.DataFrame()

In [9]:
# Classifiers to benchmark, keyed by the name used in the report labels.
# Every estimator is left at its library defaults.
algos = {
    'LogisticRegression': LogisticRegression(), 
    'SGDClassifier':  SGDClassifier(),
    'SVC': SVC(),
    # Fails on the vectorizer output: the results table records
    # "A sparse matrix was passed, but dense data is ..." for this entry.
    'GaussianNB' : GaussianNB(),
    'BernoulliNB' : BernoulliNB(),
    'MultinomialNB' : MultinomialNB(),
}

In [10]:
# Benchmark every classifier on raw term counts and on TF-IDF weighted counts.
for algo_name, algo in algos.items():
    print(algo_name)
    try:
        # (report suffix, pipeline steps) for the two feature variants, built
        # fresh for each classifier. A tuple keeps the Count -> tfidf order.
        variants = (
            ('_Count', [('vect', CountVectorizer()),
                        ('clf', algo)]),
            ('_tfidf', [('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', algo)]),
        )
        for suffix, pipeline in variants:
            report_name = algo_name + suffix
            results = run_pipeline(pipeline, report_name, results, y_train, X_train, y_test, X_test)
    except Exception as e:
        # Best effort by design: a classifier that cannot be fitted is recorded
        # as an error row so the remaining classifiers still run. As before, a
        # failure in the first variant skips the second one.
        error_row = pd.DataFrame([{'classifier': algo_name, 'error': str(e)}])
        # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
        results = pd.concat([results, error_row], sort=True)

GaussianNB
MultinomialNB
SVC


  'precision', 'predicted', average, warn_for)


BernoulliNB


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


LogisticRegression




SGDClassifier




#### print all results

In [11]:
# Show the accumulated rows: one per class / average per classifier run, plus error rows.
results

Unnamed: 0,accuracy,class,classifier,error,f1_score,precision,recall,support
0,,,GaussianNB,"A sparse matrix was passed, but dense data is ...",,,,
0,0.77470,abusive,MultinomialNB_Count,,0.84,0.81,0.86,5438.0
1,0.77470,hateful,MultinomialNB_Count,,0.10,0.76,0.05,1023.0
2,0.77470,normal,MultinomialNB_Count,,0.84,0.78,0.91,10746.0
3,0.77470,spam,MultinomialNB_Count,,0.45,0.63,0.35,2793.0
4,0.77470,micro avg,MultinomialNB_Count,,0.77,0.77,0.77,20000.0
5,0.77470,macro avg,MultinomialNB_Count,,0.56,0.74,0.54,20000.0
6,0.77470,weighted avg,MultinomialNB_Count,,0.75,0.76,0.77,20000.0
0,0.69840,abusive,MultinomialNB_tfidf,,0.70,0.83,0.60,5438.0
1,0.69840,hateful,MultinomialNB_tfidf,,0.06,0.89,0.03,1023.0


#### get sorted results per classifier

In [13]:
# Weighted-average precision per classifier run, sorted ascending.
# Rule of thumb for comparing the averages shown in the following cells:
# - micro average < macro average: misclassifications are in the most populated labels
# - micro average > macro average: misclassifications are in the least populated labels
results[results['class'] == 'weighted avg'].groupby(['classifier', 'class'])['precision'].max().sort_values()

classifier                class       
SVC_Count                 weighted avg    0.29
SVC_tfidf                 weighted avg    0.29
BernoulliNB_Count         weighted avg    0.72
BernoulliNB_tfidf         weighted avg    0.72
MultinomialNB_tfidf       weighted avg    0.73
MultinomialNB_Count       weighted avg    0.76
LogisticRegression_Count  weighted avg    0.79
SGDClassifier_Count       weighted avg    0.79
SGDClassifier_tfidf       weighted avg    0.79
LogisticRegression_tfidf  weighted avg    0.80
Name: precision, dtype: float64

In [14]:
# Macro-average precision per classifier run, sorted ascending.
results[results['class'] == 'macro avg'].groupby(['classifier', 'class'])['precision'].max().sort_values()

classifier                class    
SVC_Count                 macro avg    0.13
SVC_tfidf                 macro avg    0.13
BernoulliNB_Count         macro avg    0.56
BernoulliNB_tfidf         macro avg    0.56
LogisticRegression_Count  macro avg    0.73
MultinomialNB_Count       macro avg    0.74
LogisticRegression_tfidf  macro avg    0.76
SGDClassifier_Count       macro avg    0.76
MultinomialNB_tfidf       macro avg    0.78
SGDClassifier_tfidf       macro avg    0.80
Name: precision, dtype: float64

In [15]:
# Micro-average precision per classifier run, sorted ascending.
results[results['class'] == 'micro avg'].groupby(['classifier', 'class'])['precision'].max().sort_values()

classifier                class    
SVC_Count                 micro avg    0.54
SVC_tfidf                 micro avg    0.54
MultinomialNB_tfidf       micro avg    0.70
BernoulliNB_Count         micro avg    0.77
BernoulliNB_tfidf         micro avg    0.77
MultinomialNB_Count       micro avg    0.77
SGDClassifier_tfidf       micro avg    0.79
LogisticRegression_Count  micro avg    0.81
LogisticRegression_tfidf  micro avg    0.81
SGDClassifier_Count       micro avg    0.81
Name: precision, dtype: float64

In [16]:
# Show the full results table again for reference.
results

Unnamed: 0,accuracy,class,classifier,error,f1_score,precision,recall,support
0,,,GaussianNB,"A sparse matrix was passed, but dense data is ...",,,,
0,0.77470,abusive,MultinomialNB_Count,,0.84,0.81,0.86,5438.0
1,0.77470,hateful,MultinomialNB_Count,,0.10,0.76,0.05,1023.0
2,0.77470,normal,MultinomialNB_Count,,0.84,0.78,0.91,10746.0
3,0.77470,spam,MultinomialNB_Count,,0.45,0.63,0.35,2793.0
4,0.77470,micro avg,MultinomialNB_Count,,0.77,0.77,0.77,20000.0
5,0.77470,macro avg,MultinomialNB_Count,,0.56,0.74,0.54,20000.0
6,0.77470,weighted avg,MultinomialNB_Count,,0.75,0.76,0.77,20000.0
0,0.69840,abusive,MultinomialNB_tfidf,,0.70,0.83,0.60,5438.0
1,0.69840,hateful,MultinomialNB_tfidf,,0.06,0.89,0.03,1023.0


# Check why the SVC performed so poorly

Maybe we chose the wrong kernel?

- SVC uses an RBF kernel by default
- We could have set `kernel="linear"`
- But that still takes a long time to run, so let's use an optimized implementation, `LinearSVC()`, instead (see https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC )

In [7]:
# Term-count features restricted to terms that occur in at least 5 tweets
# and in at most half of them.
# NOTE(review): the benchmark pipelines use CountVectorizer() with defaults,
# so this run is not directly comparable with the scores in the results table.
vec = CountVectorizer(min_df=5, max_df=.5)
X_features_train = vec.fit_transform(X_train)
# The test set is only transformed, using the vocabulary fitted on the training set.
X_features_test = vec.transform(X_test)

print("Start!")
# Seed the solver so repeated runs give the same model; 2020 matches the
# random_state already used for the train/test split.
myclf = LinearSVC(random_state=2020)
myclf.fit(X_features_train, y_train)
y_pred = myclf.predict(X_features_test)



Start!




In [9]:
# Per-class precision / recall / f1 of the LinearSVC predictions on the held-out set.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     abusive       0.85      0.89      0.87      5438
     hateful       0.46      0.30      0.36      1023
      normal       0.81      0.86      0.84     10746
        spam       0.51      0.41      0.45      2793

   micro avg       0.78      0.78      0.78     20000
   macro avg       0.66      0.61      0.63     20000
weighted avg       0.76      0.78      0.77     20000

