In [1]:
import numpy as np
import pandas as pd
from sklearn import svm, metrics
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import os
from config import *
from data_loader import *

from torchtext import data
from torchtext import datasets as nlp_dset
import nltk
from torchtext.vocab import Vectors
import random
from torch.nn import init
import torch.nn as nn
import torch


In [2]:
SEED = 2000
embedding_dim = 200

### Loading Data

In [3]:
dl = Data_Loader()
train_df = dl.get_data()
# dl.save_csv()

  self.dataframe = pd.DataFrame.from_csv(file, sep='\t', header=0)
[nltk_data] Downloading package punkt to /Users/shawngung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shawngung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def transfrom_for_scikit(task_header, text_field, label_field, embedding, train):
    """
    task_header is one of subtask_a, subtask_b, subtask_c
    """
    tokenised_train = [example.cleaned_s for example in train]
    labels = np.array(
      label_field.process(
          [getattr(example, task_header) for example in train]
      )
    )

    word_idxs = text_field.process(tokenised_train)
    embeddings = torch.mean(embedding(word_idxs).detach(), dim=1)
    return embeddings.numpy(), labels

### Task A

In [5]:
#Create fields
BATCH_SIZE = 128
fix_length = 48
TEXT = data.Field(
    sequential=True, use_vocab=True, lower=True,
    tokenize=nltk.word_tokenize, batch_first=True,
    is_target=False, fix_length=fix_length)

LABEL = data.LabelField(sequential=False, use_vocab=True, batch_first = True,is_target=True)
ID = data.LabelField(sequential=False, use_vocab=False, batch_first=True)

data_fields = {
                "cleaned_s": ('cleaned_s', TEXT),
                'subtask_a': ('subtask_a',LABEL),
                'subtask_b': ('subtask_b',LABEL),
                'subtask_c': ('subtask_c',LABEL),
              }


train = data.TabularDataset(os.path.join(DATA_DIR,PROCESSED_DATA_FILE), format='csv', fields = 
                            data_fields)

train, valid = train.split(split_ratio=0.9, random_state=random.seed(SEED))

print(f'Train size: {len(train)}')
print(f'Validation size: {len(valid)}')

vectors = Vectors(name='glove.6B.200d.txt', cache=GLOVE_DIR)
#Now build vocab (using only the training set)
TEXT.build_vocab(train, vectors=vectors) #USE "glove.840B.300d" or glove.twitter.27B.200d


LABEL.build_vocab(train.subtask_a)

output_dim = len(LABEL.vocab)

#Create iterators
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid),
                        batch_sizes=(BATCH_SIZE, len(valid)),  
                        sort_key=lambda x: len(x.cleaned_s))

Train size: 11916
Validation size: 1324


In [6]:
print('first tweet :', train[100].cleaned_s)
print('first label :', train[100].subtask_a)
# print(TEXT.vocab.stoi) # word to index
# print(LABEL.vocab.stoi) # label to index

first tweet : ['thats', 'call', 'gun', 'control']
first label : NOT


In [7]:
embedding = nn.Embedding(len(TEXT.vocab), embedding_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors) # copies pre-trained word vectors

training_embeddings, training_labels = transfrom_for_scikit('subtask_a', TEXT, LABEL, embedding, train)

In [8]:
from sklearn import svm
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=100, tol=None, class_weight={1: 2})
clf.fit(training_embeddings, training_labels)



SGDClassifier(alpha=0.001, average=False, class_weight={1: 2},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=100,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
val_embeddings, val_labels = transfrom_for_scikit('subtask_a', TEXT, LABEL, embedding, valid)

In [10]:
preds = clf.predict(val_embeddings)
target_names = ['NOT OFFENSIVE','OFFENSIVE']
print(metrics.confusion_matrix(val_labels, preds))
print(metrics.classification_report(val_labels, preds,target_names = target_names))
print("Accuracy:", metrics.accuracy_score(val_labels, preds))

[[753 152]
 [178 241]]
               precision    recall  f1-score   support

NOT OFFENSIVE       0.81      0.83      0.82       905
    OFFENSIVE       0.61      0.58      0.59       419

    micro avg       0.75      0.75      0.75      1324
    macro avg       0.71      0.70      0.71      1324
 weighted avg       0.75      0.75      0.75      1324

Accuracy: 0.7507552870090635


### Task B

In [11]:
#Create fields
BATCH_SIZE = 128
fix_length = 48
TEXT = data.Field(
    sequential=True, use_vocab=True, lower=True,
    tokenize=nltk.word_tokenize, batch_first=True,
    is_target=False, fix_length=fix_length)

LABEL = data.LabelField(sequential=False, use_vocab=True, batch_first = True,is_target=True)

data_fields = {
                "cleaned_s": ('cleaned_s', TEXT),
                'subtask_a': ('subtask_a',LABEL),
                'subtask_b': ('subtask_b',LABEL),
              }


train = data.TabularDataset(os.path.join(DATA_DIR,PROCESSED_DATA_FILE),
                            format='csv',
                            fields=data_fields,
                            filter_pred=lambda d: d.subtask_a == 'OFF')

train, valid = train.split(split_ratio=0.8, random_state=random.seed(SEED))

print(f'Train size: {len(train)}')
print(f'Validation size: {len(valid)}')

#Now build vocab (using only the training set)
# This is where tokenization is performed on train

vectors = Vectors(name='glove.6B.200d.txt', cache=GLOVE_DIR)
#Now build vocab (using only the training set)
TEXT.build_vocab(train, vectors=vectors) #USE "glove.840B.300d" or glove.twitter.27B.200d
LABEL.build_vocab(train.subtask_b)

output_dim = len(LABEL.vocab)

print(LABEL.vocab.stoi)

#Create iterators
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid),
                        batch_sizes=(BATCH_SIZE, len(valid)),  
                        sort_key=lambda x: len(x.tweet))

Train size: 3520
Validation size: 880
defaultdict(<function _default_unk_index at 0x1a224f2048>, {'TIN': 0, 'UNT': 1})


In [12]:
embedding_dim = 200
embedding = nn.Embedding(len(TEXT.vocab), embedding_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors) # copies pre-trained word vectors

training_embeddings, training_labels = transfrom_for_scikit('subtask_b', TEXT, LABEL, embedding, train)

In [13]:
from sklearn import svm
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=100, tol=None, class_weight={1: 6.8})
clf.fit(training_embeddings, training_labels)



SGDClassifier(alpha=0.001, average=False, class_weight={1: 6.8},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=100,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [14]:
val_embeddings, val_labels = transfrom_for_scikit('subtask_b', TEXT, LABEL, embedding, valid)

In [15]:
preds = clf.predict(val_embeddings)
target_names = ['TARGET','UNTARGET']
print(metrics.confusion_matrix(val_labels, preds))
print(metrics.classification_report(val_labels, preds,target_names=target_names))
print("Accuracy:", metrics.accuracy_score(val_labels, preds))

[[396 389]
 [ 23  72]]
              precision    recall  f1-score   support

      TARGET       0.95      0.50      0.66       785
    UNTARGET       0.16      0.76      0.26        95

   micro avg       0.53      0.53      0.53       880
   macro avg       0.55      0.63      0.46       880
weighted avg       0.86      0.53      0.61       880

Accuracy: 0.5318181818181819


### Task C

In [16]:
#Create fields
BATCH_SIZE = 128
TEXT = data.Field(
    sequential=True, use_vocab=True, lower=True,
    tokenize=nltk.word_tokenize, batch_first=True,
    is_target=False)

LABEL = data.LabelField(sequential=False, use_vocab=True, batch_first = True,is_target=True)


data_fields = {
                "cleaned_s": ('cleaned_s', TEXT),
                'subtask_a': ('subtask_a',LABEL),
                'subtask_b': ('subtask_b',LABEL),
                'subtask_c': ('subtask_c',LABEL)
              }

train = data.TabularDataset(os.path.join(DATA_DIR,PROCESSED_DATA_FILE),
                            format='csv',
                            fields=data_fields,
                            filter_pred=lambda d: d.subtask_a == 'OFF' and d.subtask_b == 'TIN')

train, valid = train.split(split_ratio=0.8, random_state=random.seed(SEED))

print(f'Train size: {len(train)}')
print(f'Validation size: {len(valid)}')

#Now build vocab (using only the training set)

vectors = Vectors(name='glove.6B.200d.txt', cache=GLOVE_DIR)
#Now build vocab (using only the training set)
TEXT.build_vocab(train, vectors=vectors) #USE "glove.840B.300d" or glove.twitter.27B.200d
LABEL.build_vocab(train.subtask_c)

output_dim = len(LABEL.vocab)

print(LABEL.vocab.stoi)

#Create iterators
train_iterator, valid_iterator = data.BucketIterator.splits((train, valid),
                        batch_sizes=(BATCH_SIZE, len(valid)),  
                        sort_key=lambda x: len(x.tweet))

Train size: 3101
Validation size: 775
defaultdict(<function _default_unk_index at 0x1a224f2048>, {'IND': 0, 'GRP': 1, 'OTH': 2})


In [17]:
embedding = nn.Embedding(len(TEXT.vocab), embedding_dim)
embedding.weight.data.copy_(TEXT.vocab.vectors) # copies pre-trained word vectors

embeddings, training_labels = transfrom_for_scikit('subtask_c', TEXT, LABEL, embedding, train)

In [18]:
val_embeddings, val_labels = transfrom_for_scikit('subtask_c', TEXT, LABEL, embedding, valid)


In [19]:
from sklearn import svm
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=100, tol=None, class_weight={0:1.6, 1:3.7, 2:8.4})

clf.fit(embeddings, training_labels)



SGDClassifier(alpha=0.001, average=False,
       class_weight={0: 1.6, 1: 3.7, 2: 8.4}, early_stopping=False,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=100, n_iter=None,
       n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
       random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [20]:
preds = clf.predict(val_embeddings)
target_names = ['INDIVIDUAL','GROUP','OTHER']

print(metrics.confusion_matrix(val_labels, preds))
print(metrics.classification_report(val_labels, preds,target_names= target_names))
print("Accuracy:", metrics.accuracy_score(val_labels, preds))

[[382  55  11]
 [105 104  26]
 [ 50  27  15]]
              precision    recall  f1-score   support

  INDIVIDUAL       0.71      0.85      0.78       448
       GROUP       0.56      0.44      0.49       235
       OTHER       0.29      0.16      0.21        92

   micro avg       0.65      0.65      0.65       775
   macro avg       0.52      0.49      0.49       775
weighted avg       0.62      0.65      0.62       775

Accuracy: 0.6464516129032258
