In [1]:
import numpy as np
import pandas as pd
from sklearn import svm, metrics
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import os
from config import *
from data_loader import *
import nltk

### loading data

In [2]:
dl = Data_Loader()
df = dl.get_data()
df.head(5)

  self.dataframe = pd.DataFrame.from_csv(file, sep='\t', header=0)
[nltk_data] Downloading package punkt to /Users/shawngung/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shawngung/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0_level_0,tweet,subtask_a,subtask_b,subtask_c,cleaned_tweet,stemmed_tweet,cleaned_s
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
86426,@USER She should ask a few native Americans wh...,OFF,UNT,,"[ask, native, americans, take]","[ask, nativ, american, take]",ask native americans take
90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND,"[go, home, drunk, maga]","[go, home, drunk, maga]",go home drunk maga
16820,Amazon is investigating Chinese employees who ...,NOT,,,"[amazon, investigating, chinese, employees, se...","[amazon, investig, chines, employe, sell, inte...",amazon investigating chinese employees selling...
62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,,"[someone, havetaken, piece, shit, volcano]","[someon, havetaken, piec, shit, volcano]",someone havetaken piece shit volcano
43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,,"[obama, wanted, liberals, amp, illegals, move,...","[obama, want, liber, amp, illeg, move, red, st...",obama wanted liberals amp illegals move red st...


In [3]:
total = df.shape[0]

### Task A

In [4]:
training_percent = 0.8
training_size = int(training_percent * total)
validation_size = total - training_size

corpus = df['cleaned_s'].values.copy()
labels = df['subtask_a'].values.copy()
labels[labels == 'OFF'] = 1
labels[labels == 'NOT'] = 0

labels = labels.astype(float)

indices = list(range(total))
np.random.shuffle(indices)
training_sents = corpus[indices[:training_size]]
training_labels = labels[indices[:training_size]]

validation_sents = corpus[indices[training_size:]]
validation_labels = labels[indices[training_size:]]

In [5]:
vectorizer = CountVectorizer(stop_words='english')
vec_training = vectorizer.fit_transform(training_sents)

tf_transformer = TfidfTransformer(use_idf=True)
vec_training = tf_transformer.fit_transform(vec_training)

In [6]:
clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None, class_weight={1.0: 2})

clf.fit(vec_training, training_labels)



SGDClassifier(alpha=0.001, average=False, class_weight={1.0: 2},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [7]:
vec_valid = tf_transformer.transform(vectorizer.transform(validation_sents))
predictions = clf.predict(vec_valid)

In [8]:
target_names = ['NOT OFFENSIVE','OFFENSIVE']
print(metrics.confusion_matrix(validation_labels, predictions))
print(metrics.classification_report(validation_labels, predictions,target_names = target_names))
print("Accuracy:", metrics.accuracy_score(validation_labels, predictions))

[[1669   53]
 [ 656  270]]
               precision    recall  f1-score   support

NOT OFFENSIVE       0.72      0.97      0.82      1722
    OFFENSIVE       0.84      0.29      0.43       926

    micro avg       0.73      0.73      0.73      2648
    macro avg       0.78      0.63      0.63      2648
 weighted avg       0.76      0.73      0.69      2648

Accuracy: 0.7322507552870091


### Task B

In [9]:
total_b = df.count()['subtask_b'].item()
training_percent = 0.8
training_size = int(training_percent * total_b)
validation_size = total_b - training_size

train_b = df[df.subtask_a == 'OFF']
corpus = train_b['cleaned_s'].values.copy()
labels = train_b['subtask_b'].values.copy()
labels[labels == 'TIN'] = 0
labels[labels == 'UNT'] = 1
labels = labels.astype(float)

indices = list(range(total_b))
np.random.shuffle(indices)
training_sents = corpus[indices[:training_size]]
training_labels = labels[indices[:training_size]]

validation_sents = corpus[indices[training_size:]]
validation_labels = labels[indices[training_size:]]

In [10]:
vectorizer = CountVectorizer(stop_words='english')
vec_training = vectorizer.fit_transform(training_sents)

tf_transformer = TfidfTransformer(use_idf=True)
vec_training = tf_transformer.fit_transform(vec_training)

In [11]:
clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None, class_weight={1.0: 6.8})

clf.fit(vec_training, training_labels)

SGDClassifier(alpha=0.001, average=False, class_weight={1.0: 6.8},
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l1',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [12]:
vec_valid = tf_transformer.transform(vectorizer.transform(validation_sents))
predictions = clf.predict(vec_valid)

In [13]:
target_names = ['TARGET','UNTARGET']
print(metrics.confusion_matrix(validation_labels, predictions))
print(metrics.classification_report(validation_labels, predictions,target_names=target_names))
print("Accuracy:", metrics.accuracy_score(validation_labels, predictions))

[[631 129]
 [ 77  43]]
              precision    recall  f1-score   support

      TARGET       0.89      0.83      0.86       760
    UNTARGET       0.25      0.36      0.29       120

   micro avg       0.77      0.77      0.77       880
   macro avg       0.57      0.59      0.58       880
weighted avg       0.80      0.77      0.78       880

Accuracy: 0.7659090909090909


### Task C

In [14]:
total_c = df.count()['subtask_c'].item()
training_percent = 0.8
training_size = int(training_percent * total_c)
validation_size = total_c - training_size

train_c = df[df.subtask_a == 'OFF'][df.subtask_b == 'TIN']
print("Size of dataset", len(train_c))
corpus = train_c['cleaned_s'].values.copy()
labels = train_c['subtask_c'].values.copy()
labels[labels == 'IND'] = 0
labels[labels == 'GRP'] = 1
labels[labels == 'OTH'] = 2
labels = labels.astype(float)

indices = list(range(total_c))
np.random.shuffle(indices)
training_sents = corpus[indices[:training_size]]
training_labels = labels[indices[:training_size]]

validation_sents = corpus[indices[training_size:]]
validation_labels = labels[indices[training_size:]]

Size of dataset 3876


  


In [15]:
vectorizer = CountVectorizer(stop_words='english')
vec_training = vectorizer.fit_transform(training_sents)

print(vec_training.shape)

tf_transformer = TfidfTransformer(use_idf=True)
vec_training = tf_transformer.fit_transform(vec_training)

(3100, 8302)


In [16]:
clf = SGDClassifier(loss='hinge', penalty='l1',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None, class_weight={0:1.6, 1:3.7, 2:8.4})

clf.fit(vec_training, training_labels)

SGDClassifier(alpha=0.001, average=False,
       class_weight={0: 1.6, 1: 3.7, 2: 8.4}, early_stopping=False,
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_iter_no_change=5, n_jobs=None, penalty='l1', power_t=0.5,
       random_state=42, shuffle=True, tol=None, validation_fraction=0.1,
       verbose=0, warm_start=False)

In [17]:
vec_valid = tf_transformer.transform(vectorizer.transform(validation_sents))
predictions = clf.predict(vec_valid)

In [18]:
target_names = ['INDIVIDUAL','GROUP','OTHER']
print(metrics.confusion_matrix(validation_labels, predictions))
print(metrics.classification_report(validation_labels, predictions,target_names= target_names))
print("Accuracy:", metrics.accuracy_score(validation_labels, predictions))

[[358  54  47]
 [ 87 132  24]
 [ 50  16   8]]
              precision    recall  f1-score   support

  INDIVIDUAL       0.72      0.78      0.75       459
       GROUP       0.65      0.54      0.59       243
       OTHER       0.10      0.11      0.10        74

   micro avg       0.64      0.64      0.64       776
   macro avg       0.49      0.48      0.48       776
weighted avg       0.64      0.64      0.64       776

Accuracy: 0.6417525773195877
