<a href="https://colab.research.google.com/github/VictorZwart/ScriptieOffensiveLanguage/blob/main/LinearSVC/scriptie_code_linearsvc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Thesis code for LinearSVC
Author: Victor Zwart
Date: 26/05/2021


# Import
First import all the necessary packages and functions

In [21]:
import csv
import nltk.classify
import numpy as np
import os
import pandas as pd
import random
import re
import sys
import warnings
import collections
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from collections import defaultdict
from nltk.metrics import precision, recall
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from os import listdir
from os.path import isfile, join
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from nltk.corpus import stopwords, reuters
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The following code was extracted from "featx.py" Made by Toral but updated by me:

In [3]:
def bag_of_words(words):
	'''
	>>> bag_of_words(['the', 'quick', 'brown', 'fox'])
	{'quick': True, 'brown': True, 'the': True, 'fox': True}
	'''
	return dict([(word, True) for word in words])

def bag_of_words_not_in_set(words, badwords):
	'''
	>>> bag_of_words_not_in_set(['the', 'quick', 'brown', 'fox'], ['the'])
	{'quick': True, 'brown': True, 'fox': True}
	'''
	return bag_of_words(set(words) - set(badwords))

def bag_of_non_stopwords(words, stopfile='english'):
	'''
	>>> bag_of_non_stopwords(['the', 'quick', 'brown', 'fox'])
	{'quick': True, 'brown': True, 'fox': True}
	'''
	badwords = stopwords.words(stopfile)
	return bag_of_words_not_in_set(words, badwords)

def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
	'''
	>>> bag_of_bigrams_words(['the', 'quick', 'brown', 'fox'])
	{'brown': True, ('brown', 'fox'): True, ('the', 'quick'): True, 'quick': True, ('quick', 'brown'): True, 'the': True, 'fox': True}
	'''
	try:
		bigram_finder = BigramCollocationFinder.from_words(words)
		bigrams = bigram_finder.nbest(score_fn, n)
	except:
		bigrams = []
	return bag_of_words(words + bigrams)

def bag_of_bigrams_non_stopwords(words, stopfile='stopfile', score_fn=BigramAssocMeasures.chi_sq, n=200):
	'''
	>>> bag_of_bigrams_non_stopwords(['the', 'quick', 'brown', 'fox'])
	{'quick': True, ('quick', 'brown'): True, 'fox': True, ('the', 'quick'): True, ('brown', 'fox'): True, 'brown': True}
	Removes the stopwords from the unigrams but NOT from the bigrams
	'''
	badwords = stopwords.words(stopfile)
	try:
		bigram_finder = BigramCollocationFinder.from_words(words)
		bigrams = bigram_finder.nbest(score_fn, n)
	except:
		bigrams = []
	words = set(words + bigrams)
	return bag_of_words_not_in_set(words, badwords)


def bag_of_words_in_set(words, goodwords):
	return bag_of_words(set(words) & set(goodwords))

def label_feats_from_corpus(corp, feature_detector=bag_of_words):
	label_feats = collections.defaultdict(list)
	
	for label in corp.categories():
		for fileid in corp.fileids(categories=[label]):
			feats = feature_detector(corp.words(fileids=[fileid]))
			label_feats[label].append(feats)
	
	return label_feats

def split_label_feats(lfeats, split=0.75):
	train_feats = []
	test_feats = []
	
	for label, feats in lfeats.items():
		cutoff = int(len(feats) * split)
		train_feats.extend([(feat, label) for feat in feats[:cutoff]])
		test_feats.extend([(feat, label) for feat in feats[cutoff:]])
	
	return train_feats, test_feats

def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()
	
	for label, words in labelled_words:
		for word in words:
			word_fd[word] += 1
			label_word_fd[label][word] += 1
	
	n_xx = label_word_fd.N()
	high_info_words = set()
	
	for label in label_word_fd.conditions():
		n_xi = label_word_fd[label].N()
		word_scores = collections.defaultdict(int)
		
		for word, n_ii in label_word_fd[label].items():
			n_ix = word_fd[word]
			score = score_fn(n_ii, (n_ix, n_xi), n_xx)
			word_scores[word] = score
		
		bestwords = [word for word, score in word_scores.items() if score >= min_score]
		high_info_words |= set(bestwords)
	
	return high_info_words

def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
	labeled_words = []
	
	for label in reuters.categories():
		labeled_words.append((label, reuters.words(categories=[label])))
	
	return high_information_words(labeled_words, score_fn=score_fn)

def reuters_train_test_feats(feature_detector=bag_of_words):
	train_feats = []
	test_feats = []
	
	for fileid in reuters.fileids():
		if fileid.startswith('training'):
			featlist = train_feats
		else: # fileid.startswith('test')
			featlist = test_feats
		
		feats = feature_detector(reuters.words(fileid))
		labels = reuters.categories(fileid)
		featlist.append((feats, labels))
	
	return train_feats, test_feats

In [1]:
# upload data
from google.colab import files
train_csv = files.upload() # Train set
dev_csv= files.upload() # Dev set
test_csv = files.upload() # Test set
wrong_csv = files.upload() # CSV files with all wrongly annotated from the scruptie_code.ipynb

Saving train_final.csv to train_final.csv


Saving dev_final.csv to dev_final.csv


Saving test_final.csv to test_final.csv


Saving wrong.csv to wrong.csv


# Read Dataset

In [8]:
def preprocess(line):
    """
    returns line: A string containing the preprocessed line
    """
    # first lowercase:
    line = line.lower()

    # remove mentions:
    line = re.sub(r'(@\w+)', 'MENTION', line)

    # remove url's:
    line = re.sub(r'(https\S+)', 'URL', line)

    # remove all numbers
    line = re.sub(r'[0-9]+', 'NUMBER', line)

    # remove all hashtags
    line = re.sub(r'#', '', line)

    return line

In [9]:
def read_dataset(dataset_csv, genres_dict):
    """
    returns feats: a list containing tuples
    each tuple follows the structure (bag, explicitness), where bag is a bag of words dictionary

    reads the dataset csv file with the explicitness for all the tweets and puts their contents in bags of words
    """

    feats = list()
    text_list = list()
    offensiveness = ['NOT', 'EXPLICIT', 'IMPLICIT']

    with open(dataset_csv, 'r', encoding='UTF-8') as d:
        reader = csv.reader(d, delimiter='\t')
        headers = next(d)  # Skip the headers

        # Make dictionary to count how many reviews each genre has
        offensive_dict = defaultdict()

        # Go through the rows in the dataset and collect a bag of words for all the genres:
        for row in reader:
            # print(row)
            id, text, explicitness = row[0], row[1], offensiveness.index(row[6])
            text = preprocess(text)
            text_list.append(text)
            tokens = word_tokenize(text)

            # Remove punctuation from the tokens:
            punctuation = '"!?/.,()[]{}<>@#$-_=+;:' + "'"
            table = str.maketrans('', '', punctuation)
            tokens = [w.translate(table) for w in tokens]
            tokens = list(filter(None, tokens))

            # Applying a combination of taking only non-stopwords bigrams, while all words being in lowercase:
            bag = bag_of_bigrams_words(tokens)
            feats.append((bag, explicitness))

            # Increase review count for genre
            offensive_dict[explicitness] = offensive_dict.get(explicitness, 0) + 1

        for explicitness in offensive_dict:
            print("  Genre {:10} {:5} reviews".format(genres_dict[explicitness], offensive_dict[explicitness]))

    print("  Total: {} reviews read".format(len(feats)))
    # print(feats)
    return feats, text_list

In [10]:
# Initialize the genres dict and read the train, test and dev set
genres_dict = {0: "NOT", 1: "IMPLICIT", 2: "EXPLICIT"}
print("\n##### Reading training data:")
train_feats, train_text = read_dataset('train_final.csv', genres_dict)
print("\n##### Reading development data:")
dev_feats, dev_text = read_dataset('dev_final.csv', genres_dict)
print("\n##### Reading test data:")
test_feats, test_text = read_dataset('test_final.csv', genres_dict)


##### Reading training data:
  Genre EXPLICIT    1163 reviews
  Genre IMPLICIT    1425 reviews
  Genre NOT         5176 reviews
  Total: 7764 reviews read

##### Reading development data:
  Genre NOT          361 reviews
  Genre IMPLICIT     104 reviews
  Genre EXPLICIT      82 reviews
  Total: 547 reviews read

##### Reading test data:
  Genre EXPLICIT     334 reviews
  Genre IMPLICIT     702 reviews
  Genre NOT         2072 reviews
  Total: 3108 reviews read
[({'wilders': True, 'uit': True, 'kritiek': True, 'op': True, 'baudet': True, '‘': True, 'islamstandpunt': True, 'fvd': True, 'is': True, 'gevaarlijk': True, '’': True, 'URL': True, ('baudet', '‘'): True, ('fvd', 'is'): True, ('gevaarlijk', '’'): True, ('is', 'gevaarlijk'): True, ('islamstandpunt', 'fvd'): True, ('kritiek', 'op'): True, ('op', 'baudet'): True, ('uit', 'kritiek'): True, ('wilders', 'uit'): True, ('‘', 'islamstandpunt'): True, ('’', 'URL'): True}, 0), ({'MENTION': True, 'eens': True, 'al': True, 'is': True, 'het':

#Extra

In [11]:
def high_info_feats(feats, genres_dict):
    """
    returns hi_feats, a list containing tuples (bag_dict, category_string)

    makes sure the feats contain bags of words with only high info words in them
    """

    hi_feats = list()

    # Convert the formatting of our features to that required by high_information_words
    words = defaultdict(list)
    for genre in genres_dict:
        words[genre] = list()

    for feat in feats:
        genre = feat[1]
        bag = feat[0]
        for w in bag.keys():
            words[genre].append(w)

    # Calculate high information words
    labelled_words = [(genre, words[genre]) for genre in genres_dict]
    high_info_words = set(high_information_words(labelled_words, min_score=7))

    # Use the high information words to create high information features
    for feat in feats:
        category = feat[1]
        bag = feat[0]
        hi_bag = dict()
        for w in bag.keys():
            if w in high_info_words:  # ensure the words in each bag are only high info words
                hi_bag[w] = bag[w]
        hi_feats.append((hi_bag, category))  # add the new bag dict and category to the features list

    return hi_feats

In [12]:
    # Use high information words & high information feats:
    train_hifeats = high_info_feats(train_feats, genres_dict)
    dev_hifeats = high_info_feats(dev_feats, genres_dict)
    test_hifeats = high_info_feats(test_feats, genres_dict)

#Train classifier

In [13]:
    classifier = SklearnClassifier(LinearSVC(C=0.1))
    classifier._vectorizer.sort = False  # This step is necessary when working with bigrams
    classifier.train(train_feats)

<SklearnClassifier(LinearSVC(C=0.1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0))>

#Evaluate

In [14]:
def evaluation(classifier, test_feats, genres_dict):
    """
    Calculates and prints evaluation measures
    """
    print("\n##### Evaluation...")
    print("  Accuracy: %f" % nltk.classify.accuracy(classifier, test_feats))
    precisions, recalls = precision_recall(classifier, test_feats)
    f_measures = calculate_f(precisions, recalls)

    print(" |-----------|-----------|-----------|-----------|")
    print(" |%-11s|%-11s|%-11s|%-11s|" % ("genre", "precision", "recall", "F-measure"))
    print(" |-----------|-----------|-----------|-----------|")
    for genre in genres_dict:
        if precisions[genre] is None:
            print(" |%-11s|%-11s|%-11s|%-11s|" % (genres_dict[genre], "NA", "NA", "NA"))
        else:
            print(" |%-11s|%-11f|%-11f|%-11f|" % (
            genres_dict[genre], precisions[genre], recalls[genre], f_measures[genre]))
    print(" |-----------|-----------|-----------|-----------|")


def calculate_f(precisions, recalls):
    """
    Calculates and returns a dict with the f measure for each genre, using as input the precisions and recalls
    """
    f_measures = {}

    for gen in precisions:  # loop over all the genres
        p = precisions[gen]
        r = recalls[gen]
        if p is None:
            f_measures[gen] = "NA"
        elif p == 0.0 and r == 0.0:  # preventing division by zero
            f_measures[gen] = 0.0
        else:
            f_measures[gen] = (2 * (p * r)) / (p + r)

    return f_measures


def print_confusion_matrix(classifier, test_feats, genres_dict):
    """
    Prints a confusion matrix with predicted values on the X-axis and gold labels on the Y-axis
    """
    predictions = classifier.classify_many([fs for (fs, l) in test_feats])
    gold_labels = [l for (fs, l) in test_feats]

    print("\n##### Confusion matrix\nX-axis = predicted; Y-axis = gold:")
    print("\n  " + " ".join([genre[:3] for genre in genres_dict.values()]))
    print(confusion_matrix(gold_labels, predictions, labels=[int(g) for g in genres_dict.keys()]))

def precision_recall(classifier, testfeats):
    """
    Returns precisions and recalls for evaluation
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}

    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])

    return precisions, recalls

In [15]:
# Print either the evaluation of the dev or the test set

# evaluation(classifier.train(train_hifeats), dev_feats, genres_dict)
# print_confusion_matrix(classifier, dev_feats, genres_dict)

evaluation(classifier.train(train_hifeats), test_feats,genres_dict)
print_confusion_matrix(classifier, test_feats, genres_dict)


##### Evaluation...
  Accuracy: 0.776705
 |-----------|-----------|-----------|-----------|
 |genre      |precision  |recall     |F-measure  |
 |-----------|-----------|-----------|-----------|
 |NOT        |0.779961   |0.976834   |0.867367   |
 |IMPLICIT   |0.847255   |0.505698   |0.633363   |
 |EXPLICIT   |0.372340   |0.104790   |0.163551   |
 |-----------|-----------|-----------|-----------|

##### Confusion matrix
X-axis = predicted; Y-axis = gold:

  NOT IMP EXP
[[2024   21   27]
 [ 315  355   32]
 [ 256   43   35]]


In [28]:
# Prints out a classification report
predictions = []
predictions = classifier.classify_many([fs for (fs, l) in test_feats])
real = [l for (fs, l) in test_feats]

print(classification_report(real, predictions))

[0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2, 1, 0, 0, 0, 0, 2, 0, 1, 0, 1, 2, 0, 0, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 

#Wrong classifications

In [None]:
def wrong_classified(classifier, feats, text_list):
    """
    Downloads a file named final_wrong.csv which contains all wrongly annotated tweets. 
    It contains: text, golden label, BiLSTM, CNN, and LinearSVC predictions
    """
    wrong_dict = {}
    with open('wrong.csv', 'r', encoding='UTF-8') as d:
        reader = csv.reader(d, delimiter='\t')
        headers = next(d)  # Skip the headers
        for row in reader:

            text, golden, bilstm, cnn, LinearSVC = row
            wrong_dict[text] = [golden, bilstm, cnn, LinearSVC]
    print(wrong_dict)


    predictions = classifier.classify_many([fs for (fs, l) in feats])
    gold_labels = [l for (fs, l) in feats]
    index = 0
    for pred, real in zip(predictions, gold_labels):
        if pred != real:
            text = text_list[index]
            if text in wrong_dict: # If the text is already in the dictionary
                other_preds = wrong_dict[text]
                other_preds[3] = pred # Linearsvc is on index 3
                wrong_dict[text] = other_preds
            else: # if the key doesnt exist yet then the other classifiers got it right.
                wrong_dict[text] = [real, real, real, pred]
            # print(text[index], pred, real)

        index += 1

    fields = ["text", "golden", 'bilstm', 'cnn', 'LinearSVC']
    headers = {'text','golden','bilstm', 'cnn', 'LinearSVC' }
    with open('final_wrong.csv', 'w') as final_file:
        writer = csv.writer(final_file, delimiter='\t')
        writer.writerow(fields)
        for text, pred in wrong_dict.items():
            row = [text, pred[0], pred[1], pred[2], pred[3]]
            print(row)
            writer.writerow(row)
    files.download('final_wrong.csv')   

In [None]:
wrong_classified(classifier, dev_feats, dev_text)

{'slotdebat wat een drama dat groenlinks': ['1', '0', '0', '1'], 'MENTION MENTION de criminele oorlogsmisdadige nato had op NUMBER maart NUMBER (één dag voor de illegale bombardementen op servië) al opgeheven moeten worden. URL': ['1', '0', '2', '1'], 'MENTION MENTION MENTION vind het heel dom als mensen altijd denken dat blank en donkere mensen haten .ikzelf heb dam in thailand texas rusland australië indonesië want heb negers in men familie en klikkers en vrienden uit marokko en turkye. schuld is silvana simons en die zwarten piet hater': ['1', '0', '2', '1'], 'die betalen niks, dat betaald de staat wel, MENTION en MENTION doen alles om de nederlanders te verdrijven en door armoede om te komen, maar miljarden naar dit soort idiote dingen. MENTION MENTION MENTION nederland is van de nederlanders! 🆘🇳🇱🌳🆘': ['2', '0', '0', '2'], 'MENTION goh.. moslims hebben het er maar druk mee om iedereen te bestrijden.': ['2', '0', '0', '2'], 'je zal toch gezellig naar een concert gaan.. godsallemacht

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>