In [None]:
import json
import numpy as np
import pandas as pd
#import tensorflow as tf
import string
import re
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Extract data

In [None]:
def _read_json(path):
    """Parse the JSON document stored at ``path``.

    The file is opened inside a ``with`` block so the handle is always closed,
    and decoded as UTF-8 rather than the platform default encoding.
    """
    with open(path, encoding='utf-8') as json_file:
        return json.load(json_file)


# Labelled splits.
train_data = _read_json('train.json')
dev_data = _read_json('dev.json')

# Presumably the unlabelled evaluation split (going by the file name) - verify.
test_data = _read_json('test-unlabelled.json')

# Presumably baseline predictions for the dev split (going by the file name) - verify.
dev_baseline_data = _read_json('dev-baseline-r.json')

# NOTE: 'test-output.json' is also the path the export step of this notebook
# writes to, so after a full run this holds the notebook's own predictions.
result_example = _read_json('test-output.json')

def extract_data(data):
    """Split a dataset mapping into a list of texts and a list of labels.

    Args:
        data: mapping from an event id to a record dict that holds a
            ``'text'`` entry and, optionally, a ``'label'`` entry.

    Returns:
        ``(texts, labels)``. ``texts`` has one item per record, in the
        mapping's iteration order. ``labels`` holds the label of every record
        that has one, so it is shorter than ``texts`` (possibly empty) when
        some records are unlabelled.
    """
    records = [data[event_id] for event_id in data]
    texts = [record['text'] for record in records]
    labels = [record['label'] for record in records if 'label' in record]
    return texts, labels

# Turn each split into a (texts, labels) pair.
# The test file is presumably unlabelled, in which case test_label_list is empty - verify.
(
    (train_text_list, train_label_list),
    (dev_text_list, dev_label_list),
    (test_text_list, test_label_list),
) = (extract_data(split) for split in (train_data, dev_data, test_data))

# preprocessing

In [None]:
import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from nltk.corpus import wordnet
from nltk.tag import pos_tag
from nltk import ne_chunk
from nltk.stem import WordNetLemmatizer 
from nltk.tree import Tree
from nltk.stem import PorterStemmer 

def get_lemmatized(string):
    """Lemmatize a single token with WordNet.

    The token is POS-tagged on its own (as a one-word sentence), the tag is
    mapped to a WordNet POS through ``get_wordnet_pos``, and that POS is passed
    to ``WordNetLemmatizer.lemmatize``.

    Args:
        string: the token to lemmatize.

    Returns:
        The lemma produced by the WordNet lemmatizer.
    """
    _, treebank_tag = pos_tag([string])[0]
    wordnet_tag = get_wordnet_pos(treebank_tag)
    return WordNetLemmatizer().lemmatize(string, wordnet_tag)

def get_wordnet_pos(tag):
    """Map a treebank POS tag to a WordNet POS constant.

    Only the first character of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``R`` -> adverb. ``N`` and every other tag map to noun.
    """
    initial = tag[0]
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and anything unrecognised fall back to noun.
    return wordnet.NOUN

def is_number(n):
    """Return True if ``float(n)`` succeeds, False otherwise.

    Values that ``float`` rejects with ``TypeError`` (e.g. ``None`` or a list)
    are reported as non-numeric instead of raising. Note that strings such as
    ``'nan'`` and ``'inf'`` are accepted by ``float`` and therefore count as
    numbers here.
    """
    try:
        float(n)
    except (TypeError, ValueError):
        return False
    return True


def preprocess_bow(train_text_list):
    """Clean, tokenise and lemmatise raw texts into token lists and bag-of-words counts.

    For every text the function:
      1. removes ``@``-prefixed tokens, digit runs and ``http...`` URLs;
      2. tokenises with ``word_tokenize``;
      3. drops tokens whose lowercase form is an English stopword or one of
         the single punctuation characters listed below;
      4. POS-tags and NE-chunks the remaining tokens and discards every
         named-entity subtree;
      5. discards tokens of length <= 2 (``?`` and ``!`` are kept) and tokens
         longer than 12 characters;
      6. lowercases, strips and lemmatises the rest via ``get_lemmatized`` and
         removes apostrophes from the lemma.

    Args:
        train_text_list: iterable of raw text strings. Despite the name it is
            applied to every split, not only the training data.

    Returns:
        ``(bow_texts, token_lists)``, two lists aligned with the input:
        ``bow_texts[i]`` is a ``defaultdict(int)`` of token counts and
        ``token_lists[i]`` is the ordered list of kept tokens for text ``i``.
    """
    # "\\]" spells out the backslash that the previous "\]" only produced via an
    # invalid escape sequence; the resulting character set is unchanged.
    punctuation_sets = set("\"\"#‘’$“”%&()+-./ :`\'\';<=>@[\\],，^_`{|}~+")
    stopwords_set = set(stopwords.words('english'))
    # Build the union once rather than once per token.
    dropped_tokens = stopwords_set | punctuation_sets

    mention_re = re.compile(r'@[^\s]+')
    digits_re = re.compile(r'\d+')
    url_re = re.compile(r"http\S+")

    preprocessed_train_text = []
    bow_train_text = []

    for text in train_text_list:
        # Order matters: digits are stripped before URLs, as in the original pipeline.
        text = mention_re.sub('', text)
        text = digits_re.sub('', text)
        text = url_re.sub('', text)

        tokens = [tok for tok in word_tokenize(text) if tok.lower() not in dropped_tokens]
        chunk = ne_chunk(pos_tag(tokens))

        text_list = []
        bow_event = defaultdict(int)
        for node in chunk:
            # Named entities come back as subtrees; they are dropped entirely.
            if isinstance(node, Tree):
                continue
            word = node[0]
            # Substring test on '?!' keeps the tokens '?', '!' and '?!'.
            if len(word) <= 2 and word not in '?!':
                continue
            if len(word) > 12:
                continue
            # NOTE(review): node[1] already carries the in-context POS tag, but
            # get_lemmatized re-tags the lowercased word in isolation. Left as is
            # so the extracted features do not change.
            lemma = get_lemmatized(word.lower().strip()).replace("'", '')
            text_list.append(lemma)
            bow_event[lemma] += 1

        bow_train_text.append(bow_event)
        preprocessed_train_text.append(text_list)

    return bow_train_text, preprocessed_train_text

# preprocessing and save result to pickle

from tqdm import tqdm
import pickle

# Run the preprocessing pipeline on every split.
bow_train_text, preprocessed_train_text = preprocess_bow(train_text_list)
bow_dev_text, preprocessed_dev_text = preprocess_bow(dev_text_list)
bow_test_text, preprocessed_test_text = preprocess_bow(test_text_list)

# Persist each artefact to the working directory so later cells can reload
# them without re-running the preprocessing above.
for pickle_name, artefact in (
    ('preprocessed_train_text.pickle', preprocessed_train_text),
    ('bow_train_text.pickle', bow_train_text),
    ('preprocessed_dev_text.pickle', preprocessed_dev_text),
    ('bow_dev_text.pickle', bow_dev_text),
    ('preprocessed_test_text.pickle', preprocessed_test_text),
    ('bow_test_text.pickle', bow_test_text),
):
    with open(pickle_name, 'wb') as handle:
        pickle.dump(artefact, handle, protocol=pickle.HIGHEST_PROTOCOL)

# import preprocessed data from pickle 

In [None]:
import pickle
def _unpickle(path):
    """Load and return the object pickled at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Reload the cached preprocessing results and print the number of documents per split.
preprocessed_train_text = _unpickle('preprocessed_train_text.pickle')
bow_train_text = _unpickle('bow_train_text.pickle')
print(len(preprocessed_train_text))

preprocessed_dev_text = _unpickle('preprocessed_dev_text.pickle')
bow_dev_text = _unpickle('bow_dev_text.pickle')
print(len(preprocessed_dev_text))

preprocessed_test_text = _unpickle('preprocessed_test_text.pickle')
bow_test_text = _unpickle('bow_test_text.pickle')
print(len(preprocessed_test_text))


# word cloud 

# Indices of the dev documents whose label is 0, with their preprocessed and raw texts.
non_ind = [i for i, x in enumerate(dev_label_list) if x == 0]
non_dex_text = [preprocessed_dev_text[i] for i in non_ind]
non_dex_text_raw = [dev_text_list[i] for i in non_ind]

# Join at token level across documents. Concatenating per-document strings
# without a separator would glue the last token of one document to the first
# token of the next and create spurious words in the cloud.
all_word_mis = ' '.join(token for doc in preprocessed_train_text for token in doc)
all_word_nonmis = ' '.join(token for doc in non_dex_text for token in doc)

from wordcloud import WordCloud
import matplotlib.pyplot as plt


def show_word_cloud(text, title=None, max_font_size=60, figsize=(16, 12)):
    """Render ``text`` as a word cloud in a new figure.

    Args:
        text: whitespace-separated words to visualise.
        title: optional figure title; omitted when ``None``.
        max_font_size: forwarded to ``WordCloud``.
        figsize: matplotlib figure size in inches.

    Returns:
        The generated ``WordCloud`` object.
    """
    cloud = WordCloud(max_font_size=max_font_size).generate(text)
    fig, ax = plt.subplots(figsize=figsize)
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    if title is not None:
        ax.set_title(title)
    plt.show()
    return cloud


# all_word_mis is built from the preprocessed training documents,
# all_word_nonmis from the dev documents labelled 0.
cloud = show_word_cloud(all_word_mis, title='Training documents')
cloud = show_word_cloud(all_word_nonmis, title='Dev documents with label 0')

# BOW vectorization

In [None]:
from sklearn.feature_extraction import DictVectorizer

# The feature vocabulary is learned from the training counts only; the dev and
# test counts are then projected onto that same vocabulary.
vectorizer = DictVectorizer()
train_set = vectorizer.fit_transform(bow_train_text)
dev_set, test_set = (
    vectorizer.transform(counts) for counts in (bow_dev_text, bow_test_text)
)

for caption, matrix in (
    ('train set dimension', train_set),
    ('dev set dimension', dev_set),
    ('test set dimension', test_set),
):
    print(caption, matrix.shape)

# one class SVM

In [None]:
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score, classification_report, f1_score
def train_ocsvm(train_set_d2v, dev_set_d2v, nu=0.3, dev_labels=None):
    """Fit a one-class SVM on the training features and report dev-set metrics.

    Args:
        train_set_d2v: feature matrix the model is fitted on.
        dev_set_d2v: feature matrix of the dev split, with the same feature
            space as ``train_set_d2v``.
        nu: the ``nu`` parameter passed to ``OneClassSVM``.
        dev_labels: gold 0/1 labels aligned with ``dev_set_d2v``. Defaults to
            the notebook-level ``dev_label_list`` so existing calls keep working.

    Returns:
        The fitted ``OneClassSVM``. Accuracy, binary F1 (positive label 1) and
        a classification report for the dev split are printed as a side effect.
    """
    if dev_labels is None:
        dev_labels = dev_label_list

    # degree / coef0 are documented as ignored by the rbf kernel; they are kept
    # only so the constructor call matches the rest of the notebook.
    svm_clf = OneClassSVM(kernel="rbf", degree=1, gamma='auto', coef0=0.0,
                          tol=0.0000001, nu=nu, shrinking=True, cache_size=200,
                          max_iter=-1)
    svm_clf.fit(train_set_d2v)

    # predict() yields +1 / -1; remap -1 to 0 so predictions line up with the 0/1 labels.
    dev_predict = svm_clf.predict(dev_set_d2v)
    dev_predict[dev_predict == -1] = 0

    print(accuracy_score(dev_labels, dev_predict))
    print(f1_score(dev_labels, dev_predict, labels=None, pos_label=1, average='binary'))
    print(classification_report(dev_labels, dev_predict))
    return svm_clf


svm_clf = train_ocsvm(train_set, dev_set)

# Tune the nu parameter on the dev set

In [None]:
# Grid-search nu over 0.25 .. 0.29 and keep the value with the best dev F1.
acc_list_svm = []  # dev F1 for each candidate, in loop order
prev_acc = 0       # best F1 seen so far
param = None       # best nu; stays None if no candidate scores above 0
for alpha in range(25, 30):
    nu = alpha / 100
    svm_clf = OneClassSVM(kernel="rbf", degree=1, gamma='auto', coef0=0.0,
                          tol=0.0000001, nu=nu, shrinking=True, cache_size=200,
                          max_iter=-1)
    svm_clf.fit(train_set)
    # predict() yields +1 / -1; remap -1 to 0 to match the 0/1 labels.
    dev_predict = svm_clf.predict(dev_set)
    dev_predict[dev_predict == -1] = 0
    acc = f1_score(dev_label_list, dev_predict, labels=None, pos_label=1, average='binary')
    acc_list_svm.append(acc)
    # Strict '>' keeps the first candidate on ties.
    if acc > prev_acc:
        prev_acc = acc
        param = nu
# NOTE: svm_clf is left holding the last fitted candidate, not necessarily the best one.
print(acc_list_svm)
print(param)

# Dimensionality reduction (TruncatedSVD)

In [None]:
from sklearn.decomposition import TruncatedSVD
SVD_COMPONENTS = 350   # dimensionality of the reduced feature space
SVD_RANDOM_STATE = 0   # fixed seed so the reduction is reproducible across runs

# The projection is fitted on the training matrix only and then applied
# unchanged to the dev and test matrices.
svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=SVD_RANDOM_STATE)
train_set_svd = svd.fit_transform(train_set)
dev_set_svd = svd.transform(dev_set)
test_set_svd = svd.transform(test_set)

# Sum of the explained-variance ratios of the kept components.
print(svd.explained_variance_ratio_.sum())

# From here on svm_clf is the model fitted on the SVD-reduced features.
svm_clf = train_ocsvm(train_set_svd, dev_set_svd)

# export result 

In [None]:
def write_result_json(model, test_set_svd, test_ids=None, output_path='test-output.json'):
    """Predict labels for the test features and write them to a JSON file.

    The output maps every id to ``{'label': 0 or 1}``; a model prediction of
    ``-1`` is written as ``0``. The sum of the predictions (the number of
    documents labelled 1) is printed.

    Args:
        model: fitted estimator whose ``predict`` returns an array of 1 / -1.
        test_set_svd: feature matrix passed to ``model.predict``.
        test_ids: iterable of ids, in the same order as the rows of
            ``test_set_svd``. Defaults to the keys of the notebook-level
            ``test_data`` so existing calls keep working.
        output_path: file to write; defaults to ``'test-output.json'``.

    Raises:
        ValueError: if the number of predictions differs from the number of ids.
    """
    if test_ids is None:
        test_ids = test_data
    test_ids = list(test_ids)

    test_predict = model.predict(test_set_svd)
    test_predict[test_predict == -1] = 0

    # Fail loudly rather than writing a partial or misaligned result file.
    if len(test_predict) != len(test_ids):
        raise ValueError(
            'got %d predictions for %d ids' % (len(test_predict), len(test_ids))
        )

    print(sum(test_predict))
    result = {
        doc_id: {'label': int(label)}
        for doc_id, label in zip(test_ids, test_predict)
    }
    with open(output_path, 'w') as file:
        json.dump(result, file)

# Export predictions for the test features using the current svm_clf.
write_result_json(model=svm_clf, test_set_svd=test_set_svd)

# Error Analysis 

In [None]:
# svm_clf was last fitted on the SVD-reduced features, so it must be scored on
# the reduced dev matrix; the raw BOW matrix has a different number of columns.
dev_pred = svm_clf.predict(dev_set_svd)
dev_pred[dev_pred == -1] = 0

# Only the first ERROR_ANALYSIS_LIMIT dev documents are inspected, capped at
# the size of the dev split.
ERROR_ANALYSIS_LIMIT = 100
n_inspected = min(ERROR_ANALYSIS_LIMIT, len(dev_label_list))
error_ind = [i for i in range(n_inspected) if dev_pred[i] != dev_label_list[i]]

label1_dev = [i for i in error_ind if dev_label_list[i] == 1]  # true label 1, wrongly predicted as 0
label0_dev = [i for i in error_ind if dev_label_list[i] == 0]  # true label 0, wrongly predicted as 1

# Raw texts and raw bag-of-words rows of the misclassified documents.
error_text_l1 = [dev_text_list[i] for i in label1_dev]
error_text_l0 = [dev_text_list[i] for i in label0_dev]
error_bow_vecl1 = [dev_set[i] for i in label1_dev]
error_bow_vecl0 = [dev_set[i] for i in label0_dev]