In [1]:
from pprint import pprint

# Constants/parameters
# Base number for the submission files; run_all() adds the experiment index to it.
SUBMISSION_ID = 73
# Directory prefix where the submission CSV files are written.
OUT_PATH = '../results/'
# Directory prefix holding train.csv, the preprocessed JSON files and sample_submission.csv.
IN_PATH = '../data/'
# Directory prefix and file name of the saved Word2Vec model (the two are concatenated on load).
# NOTE(review): the model name presumably encodes its training hyper-parameters
# (e.g. size_200 = 200-dimensional vectors) -- confirm against the training notebook.
WORD_EMBEDDINGS_PATH = '../word_embeddings/model_1/'
MODEL_NAME = 'model_sg_1_size_200_min_count_2_negative_10_window_10'
# Label columns read from train.csv; one classifier is trained per entry, and the
# same names are used as the output columns of the submission file.
TARGET_CLASSES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

## 1. Load data

In [2]:
import pandas as pd

In [3]:
# Raw training table; only its TARGET_CLASSES label columns are used further down.
train_df = pd.read_csv(IN_PATH + 'train.csv', encoding='utf8')

In [4]:
import json

def load_preprocessed_data(filename):
    """Load a preprocessed data set from a JSON file.

    The file is decoded explicitly as UTF-8 (the encoding also used for
    ``train.csv``) so the result does not depend on the platform's default
    locale encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The deserialized JSON content, exactly as produced by ``json.load``.

    Raises:
        IOError / OSError: If the file cannot be opened.
        ValueError: If the file does not contain valid JSON.
    """
    # Local import keeps the cell self-contained; ``io.open`` accepts the
    # ``encoding`` argument on both Python 2 and Python 3.
    import io

    with io.open(filename, encoding='utf-8') as fi:
        return json.load(fi)

In [5]:
# Preprocessed train/test comments; get_comments_embeddings() iterates each
# comment as a sequence of words.
train_preprocessed = load_preprocessed_data(IN_PATH + 'train_preprocessed.json')
test_preprocessed = load_preprocessed_data(IN_PATH + 'test_preprocessed.json')

In [6]:
# Sanity check: show the container type of the loaded training data.
print(type(train_preprocessed))

<type 'list'>


## 2. Load model

In [7]:
import gensim

# Load the saved Word2Vec model. get_comments_embeddings() reads this
# module-level `model` to look up word vectors, so this cell must run first.
model = gensim.models.Word2Vec.load(WORD_EMBEDDINGS_PATH + MODEL_NAME)

## 3. Representation of comments as average of word embeddings

In [8]:
import numpy as np
from tqdm import tqdm

def get_comments_embeddings(comments, word_emb_dims=200):
    """Represent each comment as the mean of its in-vocabulary word vectors.

    Word vectors are looked up in the module-level Word2Vec ``model`` that is
    loaded earlier in the notebook.

    Args:
        comments: Sequence of comments, each one an iterable of words.
        word_emb_dims: Dimensionality of the word vectors; it must match the
            size of the vectors stored in ``model``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(comments), word_emb_dims)``. Row ``i``
        is the average of the vectors of all words of ``comments[i]`` that are
        known to the model (a repeated word counts once per occurrence). A
        comment without any known word keeps an all-zero row.
    """
    word_vectors = model.wv
    comments_embeddings = np.zeros((len(comments), word_emb_dims))
    comments_emb_nr = np.zeros(len(comments), dtype=int)

    for i in tqdm(range(len(comments))):
        for word in comments[i]:
            # Membership test on the keyed vectors instead of
            # ``vocab.has_key(word)``: ``dict.has_key`` exists only on
            # Python 2 and the ``vocab`` dict is gone in gensim 4.
            if word in word_vectors:
                comments_embeddings[i] += word_vectors[word]
                comments_emb_nr[i] += 1

    # A comment is represented by the average of its word embeddings; rows
    # with no known word are skipped to avoid dividing by zero.
    has_words = comments_emb_nr > 0
    comments_embeddings[has_words] /= comments_emb_nr[has_words, np.newaxis]

    return comments_embeddings

In [9]:
# Embed both splits with 200-dimensional vectors.
# NOTE(review): 200 presumably matches the 'size_200' in MODEL_NAME -- confirm.
train_comments = get_comments_embeddings(train_preprocessed, 200)
test_comments = get_comments_embeddings(test_preprocessed, 200)

100%|██████████| 159571/159571 [00:20<00:00, 7797.19it/s]
100%|██████████| 153164/153164 [00:18<00:00, 8306.25it/s]


In [10]:
# Feature matrices are the averaged word embeddings; the label matrix has one
# column per entry of TARGET_CLASSES, in that order.
train_x = train_comments
# ``.values`` replaces ``DataFrame.as_matrix()``, which was deprecated in
# pandas 0.23 and removed in pandas 1.0; both return the same ndarray.
train_y = train_df[TARGET_CLASSES].values
test_x = test_comments

## 4. Classification

In [11]:
from sklearn import preprocessing
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [12]:
def svm_classifier(train_x, train_y, test_x, method='LinearSVC'):
    """Train one SVM per target class and predict labels for ``test_x``.

    Features are standardized with a ``StandardScaler`` fitted on ``train_x``
    only; the same scaling is then applied to ``test_x``.

    Args:
        train_x: Training feature matrix, one row per sample.
        train_y: Training label matrix; column ``i`` holds the labels for
            ``TARGET_CLASSES[i]``.
        test_x: Feature matrix to predict for, one row per sample.
        method: ``'LinearSVC'`` (default) or ``'SVC'`` (kernel SVC with C=1.3).

    Returns:
        ``numpy.ndarray`` of shape ``(n_test_rows, len(TARGET_CLASSES))``
        holding the output of ``classifier.predict`` -- i.e. predicted class
        labels, not probabilities (unlike the NB and LR helpers).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    print('---svm_classifier---')
    # Fail fast with a clear message; previously an unknown method left the
    # classifier as None and crashed with an AttributeError after scaling.
    if method not in ('LinearSVC', 'SVC'):
        raise ValueError(
            "Unknown SVM method {!r}; expected 'LinearSVC' or 'SVC'".format(method))

    scaler = preprocessing.StandardScaler().fit(train_x)
    train_x = scaler.transform(train_x)
    test_x = scaler.transform(test_x)

    pred = np.zeros((test_x.shape[0], len(TARGET_CLASSES)))

    for i in tqdm(range(len(TARGET_CLASSES))):
        # A fresh, independent binary classifier for every target column.
        if method == 'LinearSVC':
            classifier = svm.LinearSVC()
        else:
            classifier = svm.SVC(C=1.3)

        classifier.fit(train_x, train_y[:, i])
        pred[:, i] = classifier.predict(test_x)

    return pred

In [13]:
def nb_classifier(train_x, train_y, test_x):
    """Fit a Gaussian naive Bayes model per target class and score ``test_x``.

    Args:
        train_x: Training feature matrix, one row per sample.
        train_y: Training label matrix; column ``k`` holds the labels for
            ``TARGET_CLASSES[k]``.
        test_x: Feature matrix to score (must expose ``.shape``).

    Returns:
        ``numpy.ndarray`` of shape ``(test_x.shape[0], len(TARGET_CLASSES))``
        where column ``k`` is column 1 of ``predict_proba`` for class ``k``.
    """
    print('---nb_classifier---')
    n_classes = len(TARGET_CLASSES)
    class_probs = np.zeros((test_x.shape[0], n_classes))

    for col in tqdm(range(n_classes)):
        # ``fit`` returns the fitted estimator, so it can be chained.
        fitted = GaussianNB().fit(train_x, train_y[:, col])
        class_probs[:, col] = fitted.predict_proba(test_x)[:, 1]

    return class_probs

In [14]:
def logistic_regression(train_x, train_y, test_x):
    """Fit a logistic regression per target class and score ``test_x``.

    Args:
        train_x: Training feature matrix, one row per sample.
        train_y: Training label matrix; column ``i`` holds the labels for
            ``TARGET_CLASSES[i]``.
        test_x: Feature matrix to score, one row per sample.

    Returns:
        ``numpy.ndarray`` of shape ``(len(test_x), len(TARGET_CLASSES))``
        where column ``i`` is column 1 of ``predict_proba`` for class ``i``.
    """
    print('---logistic_regression---')
    preds = np.zeros((len(test_x), len(TARGET_CLASSES)))

    for i in tqdm(range(len(TARGET_CLASSES))):
        # The dual formulation is only implemented by the liblinear solver.
        # Pin it explicitly: newer scikit-learn releases default to 'lbfgs',
        # which rejects dual=True with a ValueError.
        # NOTE(review): scikit-learn recommends dual=False when
        # n_samples > n_features; kept as-is here to preserve results.
        lr_model = LogisticRegression(C=4, dual=True, solver='liblinear')
        lr_model.fit(train_x, train_y[:, i])
        preds[:, i] = lr_model.predict_proba(test_x)[:, 1]

    return preds

In [15]:
def classification(train_x, train_y, test_x, classifier='NB'):
    """Dispatch to one of the per-class classifiers by name.

    Args:
        train_x: Training feature matrix.
        train_y: Training label matrix, one column per target class.
        test_x: Feature matrix to predict for.
        classifier: ``'SVM'`` (LinearSVC), ``'NB'`` (Gaussian naive Bayes,
            default) or ``'LR'`` (logistic regression).

    Returns:
        The prediction matrix returned by the selected classifier helper.

    Raises:
        ValueError: If ``classifier`` is not a supported name. Previously an
            unknown name silently produced an empty list, which led to an
            unmodified sample submission being written downstream.
    """
    if classifier == 'SVM':
        return svm_classifier(train_x, train_y, test_x, method='LinearSVC')
    if classifier == 'NB':
        return nb_classifier(train_x, train_y, test_x)
    if classifier == 'LR':
        return logistic_regression(train_x, train_y, test_x)

    raise ValueError(
        "Unknown classifier {!r}; expected 'SVM', 'NB' or 'LR'".format(classifier))

In [16]:
def write_results(pred, in_path, out_path):
    """Write predictions into a copy of the submission template CSV.

    Args:
        pred: 2-D array-like of shape ``(n_rows, len(TARGET_CLASSES))``;
            column ``k`` holds the values for ``TARGET_CLASSES[k]`` and row
            ``r`` corresponds to row ``r`` of the template. Objects exposing
            ``toarray()`` (e.g. sparse matrices) are converted to dense first.
        in_path: Path of the template CSV; it must contain one column per
            entry of ``TARGET_CLASSES``.
        out_path: Path of the CSV to write (saved without the index).

    Raises:
        ValueError: If the shape of ``pred`` does not match the template,
            instead of silently writing a partially filled submission.
    """
    print('---write_results---')
    res_df = pd.read_csv(in_path)

    # Accept both dense and sparse classifier output.
    if hasattr(pred, 'toarray'):
        pred = pred.toarray()
    pred = np.asarray(pred)

    expected_shape = (len(res_df), len(TARGET_CLASSES))
    if pred.shape != expected_shape:
        raise ValueError(
            'Predictions have shape {}, expected {}'.format(pred.shape, expected_shape))

    # Single vectorized assignment instead of one ``Series.set_value`` call
    # per cell; ``set_value`` was removed in pandas 1.0.
    res_df[TARGET_CLASSES] = pred

    res_df.to_csv(out_path, index=False)

In [17]:
def run_experiment(train_x, train_y, test_x, classifier, in_path, out_path):
    """Run one classifier end to end and persist its predictions.

    Trains and predicts via ``classification`` and writes the predictions with
    ``write_results``, using ``in_path`` as the template CSV and ``out_path``
    as the destination.

    Returns:
        The prediction matrix produced by the classifier.
    """
    predictions = classification(train_x, train_y, test_x, classifier=classifier)
    write_results(predictions, in_path, out_path)
    return predictions

In [18]:
def run_all():
    """Run the SVM, NB and LR experiments on the notebook-level data.

    Uses the module-level ``train_x``, ``train_y`` and ``test_x``. Experiment
    number ``k`` (0-based) is written to
    ``OUT_PATH + 'submission_<SUBMISSION_ID + k>.csv'``, based on the sample
    submission found under ``IN_PATH``.

    Returns:
        List with one prediction matrix per experiment, in run order.
    """
    experiment_names = ['SVM', 'NB', 'LR']
    template_path = IN_PATH + 'sample_submission.csv'
    all_preds = []

    for offset in tqdm(range(len(experiment_names))):
        submission_path = OUT_PATH + 'submission_' + str(SUBMISSION_ID + offset) + '.csv'
        experiment_preds = run_experiment(train_x, train_y, test_x,
                                          experiment_names[offset],
                                          template_path,
                                          submission_path)
        all_preds.append(experiment_preds)

    return all_preds

In [19]:
# Run the SVM, NB and LR experiments in turn; each one writes its own
# submission CSV, and the prediction matrices are collected in `preds`.
preds = run_all()

  0%|          | 0/3 [00:00<?, ?it/s]

---svm_classifier---



  0%|          | 0/6 [00:00<?, ?it/s][A
 17%|█▋        | 1/6 [01:36<08:03, 96.71s/it][A
 33%|███▎      | 2/6 [03:11<06:23, 95.90s/it][A
 50%|█████     | 3/6 [04:48<04:48, 96.13s/it][A
 67%|██████▋   | 4/6 [06:17<03:08, 94.42s/it][A
 83%|████████▎ | 5/6 [08:01<01:36, 96.27s/it][A
100%|██████████| 6/6 [09:39<00:00, 96.59s/it][A
[A

---write_results---


 33%|███▎      | 1/3 [09:44<19:28, 584.28s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

---nb_classifier---



 17%|█▋        | 1/6 [00:00<00:03,  1.29it/s][A
 33%|███▎      | 2/6 [00:01<00:03,  1.29it/s][A
 50%|█████     | 3/6 [00:02<00:02,  1.28it/s][A
 67%|██████▋   | 4/6 [00:03<00:01,  1.27it/s][A
 83%|████████▎ | 5/6 [00:03<00:00,  1.27it/s][A
100%|██████████| 6/6 [00:04<00:00,  1.27it/s][A
[A

---write_results---


 67%|██████▋   | 2/3 [09:53<04:56, 296.84s/it]
  0%|          | 0/6 [00:00<?, ?it/s][A

---logistic_regression---



 17%|█▋        | 1/6 [00:13<01:09, 13.96s/it][A
 33%|███▎      | 2/6 [00:27<00:55, 13.79s/it][A
 50%|█████     | 3/6 [00:40<00:40, 13.62s/it][A
 67%|██████▋   | 4/6 [00:52<00:26, 13.21s/it][A
 83%|████████▎ | 5/6 [01:05<00:13, 13.05s/it][A
100%|██████████| 6/6 [01:18<00:00, 13.00s/it][A
[A

---write_results---


100%|██████████| 3/3 [11:15<00:00, 225.31s/it]
