In [35]:
import csv
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from collections import Counter

#### read in data

In [None]:
path = 'ICWSM-2020-Twitter-Inappropriate-Speech/hatespeech_text_label_vote_RESTRICTED_100K.csv'

In [20]:
tweet = []
label = []
with open(path) as fi:
    data = csv.reader(fi, delimiter='\t')
    for row in data:
        tweet.append(row[0])
        label.append(row[1])

In [21]:
len(tweet) == len(label)

True

#### count occurences of labels

In [79]:
Counter(label)

Counter({'spam': 14030, 'abusive': 27150, 'normal': 53851, 'hateful': 4965})

In [81]:
X_train, X_test, y_train, y_test = train_test_split(tweet, label, test_size=0.2)

#### some functions

In [83]:
def get_reports(results, report_name, y_true, y_test_predicted):
    acc = metrics.accuracy_score(y_true, y_test_predicted, normalize=True, sample_weight=None)
    report = metrics.classification_report(y_test, y_test_predicted)
    report_data = []
    lines = report.split('\n')
    for line in lines[2:]:
        row = {}
        row_data = line.split('    ')
        row_data = [item for item in row_data if len(item) > 1]
        if len(row_data) > 2:
            row['classifier'] = report_name
            row['class'] = row_data[0].strip()
            row['precision'] = float(row_data[1])
            row['recall'] = float(row_data[2])
            row['f1_score'] = float(row_data[3])
            row['support'] = float(row_data[4])
            row['accuracy'] = float(acc)
            report_data.append(row)
        
    results = results.append(pd.DataFrame(report_data), sort=True)
    
    return results

def run_pipeline(pipeline, report_name, results, 
                 y_train, X_train,
                y_test, X_test):
    
    text_clf = Pipeline(pipeline)
    text_clf = text_clf.fit(X_train, y_train)
    y_test_predicted = text_clf.predict(X_test)
    results = get_reports(results, report_name, y_test, y_test_predicted)
    
    return results

In [85]:
results = pd.DataFrame()

In [86]:
algos = {
    'LogisticRegression': LogisticRegression(), 
    'SGDClassifier':  SGDClassifier(),
    'SVC': SVC(),
    'GaussianNB' : GaussianNB(),
    'BernoulliNB' : BernoulliNB(),
    'MultinomialNB' : MultinomialNB(),
}

In [48]:
for algo_name, algo in algos.items():
    print(algo_name)
    try:
        pipeline = [('vect', CountVectorizer()),
                    ('clf', algo)]
        report_name = algo_name + '_Count'
        
        results = run_pipeline(pipeline, report_name, results, y_train, X_train, y_test, X_test) 

        pipeline = [('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', algo)]
        
        report_name = algo_name + '_tfidf'
        results = run_pipeline(pipeline, report_name, results, y_train, X_train, y_test, X_test) 
    except Exception as e:
        results = results.append(pd.DataFrame([{'classifier': algo_name, 'error': str(e)},]), sort=True)

LogisticRegression




SGDClassifier




#### print all results

In [90]:
results

#### get sorted results per classifier

In [63]:
# If the micro average < macro average one:  misclassification in the most populated labels
# if the micro average > macro average one:  misclassification in the least populated labels
results[results['class'] == 'weighted avg'].groupby(['classifier', 'class'])['precision'].max().sort_values()
results[results['class'] == 'macro avg'].groupby(['classifier', 'class'])['precision'].max().sort_values()


classifier                    class    
LogisticRegression_BoW        macro avg    0.73
SGDClassifier_BoW             macro avg    0.73
LogisticRegression_BoW_tfidf  macro avg    0.75
SGDClassifier_BoW_tfidf       macro avg    0.78
Name: precision, dtype: float64