In [1]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils import data as td

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

import tqdm
import time
import itertools

In [2]:
class ToxicTextsDataset(td.Dataset):
    """Pre-batched, class-balanced multi-label dataset of tokenised texts.

    The CSV file is read once, a token vocabulary is built from it, and three
    lists of ready-made batches (train / test / validation) are materialised in
    the constructor. Indexing the object returns one whole batch - a dict with
    the keys 'input', 'cl_features' and 'labels' - taken from the list that is
    selected by `mode`.
    """

    # maps every supported `mode` to the attribute holding its list of batches
    _DATASET_ATTR_BY_MODE = {'train': 'train_dataset',
                             'test': 'test_dataset',
                             'valid': 'valid_dataset'}

    def __init__(self, data_path='train.csv',
                       n_train_batches=16000,
                       n_test_batches=4000,
                       n_valid_batches=1600,
                       separate_test_and_valid=True,
                       test_size=0.2,
                       valid_size=0.1,
                       batch_size=10,
                       vocab_size=2000,
                       mode='train',
                       random_seed=None,
                       verbose=0,
                       use_cuda = True):
        """
        INPUT:
            data_path - string, path of the csv file. Column 1 is read as the text and 
                        all columns from 2 onwards as the class labels
            n_train_batches - int, number of batches to be drawn from data for training
            n_test_batches -  int, number of batches to be drawn from data for testing
            n_valid_batches -  int, number of batches to be drawn from data for validation
            separate_test_and_valid - bool, whether to draw training, testing and validation 
                                      batches from separated parts of the data (True) or 
                                      all of them from the whole data (False, in which case 
                                      the three datasets may share samples)
            test_size - float from [0, 1], portion of the data reserved for the testing 
                        dataset. Not applicable if separate_test_and_valid=False
            valid_size - float from [0, 1], portion of the data reserved for the validation 
                         dataset. Not applicable if separate_test_and_valid=False
            batch_size - int, number of samples in one minibatch
            vocab_size - int, number of token ids. The most frequently encountered tokens 
                         get the ids 0 .. vocab_size - 2, every other token is encoded 
                         with the UNKNOWN id vocab_size - 1
            mode - string, one of ['train', 'test', 'valid']. Determines which dataset 
                   is used by ToxicTextsDataset[i], len() and shuffle()
            random_seed - int or None. If given, numpy's global random generator is seeded 
                          with it before any sampling happens
            verbose - int, 0 for no printed info, 1 for minimum info, 2 for maximum info
            use_cuda - bool, if True every batch tensor is moved to the GPU with .cuda()
        """
        super(ToxicTextsDataset, self).__init__()

        self.n_train_batches = n_train_batches
        self.n_test_batches = n_test_batches
        self.n_valid_batches = n_valid_batches
        self.separate_test_and_valid = separate_test_and_valid
        self.test_size = test_size
        self.valid_size = valid_size
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.mode = mode
        self.verbose = verbose
        self.use_cuda = use_cuda

        if random_seed is not None:
            np.random.seed(random_seed)

        if verbose: print('Downloading data from ' + data_path + '... ', end='')
        # read csv file
        df = pd.read_csv(data_path)
        if verbose: print('Completed')

        # separate text from class labels
        X = np.array(df.iloc[:, 1])
        y = np.array(df.iloc[:, 2:])

        if verbose: print('Generating vocabulary... ', end='')
        # generating vocabulary of tokens
        self.CreateTokenVocab(X, y)
        if verbose: print('Completed')

        # every dataset exists even when its portion of the data is empty
        self.train_dataset = []
        self.test_dataset = []
        self.valid_dataset = []

        if separate_test_and_valid:
            held_out = valid_size + test_size

            if held_out == 0:
                # nothing is reserved: train on everything, test/valid stay empty
                self.train_dataset = self._build_split('train', X, y, n_train_batches)
                return

            # split data into the training part and the held-out part
            X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=held_out)
            self.train_dataset = self._build_split('train', X_train, y_train,
                                                   n_train_batches)

            if test_size != 0 and valid_size != 0:
                X_test, X_valid, y_test, y_valid = train_test_split(
                    X_rest, y_rest, test_size=valid_size / held_out)
                self.test_dataset = self._build_split('test', X_test, y_test,
                                                      n_test_batches)
                self.valid_dataset = self._build_split('validation', X_valid, y_valid,
                                                       n_valid_batches)
            elif test_size == 0:
                # the whole held-out part is used for validation
                self.valid_dataset = self._build_split('validation', X_rest, y_rest,
                                                       n_valid_batches)
            else:
                # valid_size == 0: the whole held-out part is used for testing
                self.test_dataset = self._build_split('test', X_rest, y_rest,
                                                      n_test_batches)
        else:
            # all three datasets are drawn from the same, complete data
            self.train_dataset = self._build_split('train', X, y, n_train_batches)
            self.test_dataset = self._build_split('test', X, y, n_test_batches)
            self.valid_dataset = self._build_split('validation', X, y, n_valid_batches)

    def _build_split(self, name, X, y, n_batches):
        """Builds one balanced dataset and reports the progress if verbose is set."""
        if self.verbose: print('Creating ' + name + ' dataset... ', end='')
        dataset = self.CreateBalancedDataset(X, y, n_batches)
        if self.verbose: print('Completed')
        return dataset

    def _dataset_attr(self):
        """Returns the name of the attribute holding the dataset of the current mode."""
        try:
            return self._DATASET_ATTR_BY_MODE[self.mode]
        except KeyError:
            raise ValueError("mode must be one of 'train', 'test', 'valid', got %r"
                             % (self.mode,))

    def _to_variable(self, tensor):
        """Wraps a tensor into a non-trainable Variable, on the GPU if use_cuda is set."""
        variable = Variable(tensor, requires_grad=False)
        return variable.cuda() if self.use_cuda else variable

    @staticmethod
    def _split_evenly(n_batches, n_groups):
        """Splits n_batches into n_groups integer shares that sum up to n_batches."""
        base, extra = divmod(n_batches, n_groups)
        return [base + (1 if k < extra else 0) for k in range(n_groups)]

    def encode(self, text):
        """ function that splits text into tokens and returns a list of encodings for 
            each token 
                INPUT: text - python string
                OUTPUT: codes - list of integers (tokens missing from the vocabulary are 
                                encoded as vocab_size - 1, the UNKNOWN token), 
                        cl_features - list of floats (character level features)
        """
        unknown_id = self.vocab_size - 1
        codes = [self.word_to_id.get(token, unknown_id)
                 for token in self.Smart_Split(text)]
        return codes, self.ComputeCharacterLevelFeatures(text)

    def ComputeCharacterLevelFeatures(self, text):
        """This function computes character level features 
           INPUT: text - a python string
           OUTPUT: cl_features - a list of floats
               
               cl_features[0] - length of text
               cl_features[1] - mean of lengths of all tokens in text
               cl_features[2] - ratio of capital letters in text
               cl_features[3] - ratio of non-letter symbols in text

           An empty text yields all zeros, and a text without any token has a mean 
           token length of zero (instead of a division by zero / NaN).
        """
        text_len = float(len(text))
        if text_len == 0:
            return [0., 0., 0., 0.]

        token_lengths = [len(token) for token in self.Smart_Split(text)]
        mean_token_len = np.mean(token_lengths) if token_lengths else 0.

        return [
            text_len,
            mean_token_len,
            len(re.findall(r'[A-Z]', text)) / text_len,
            (1. - len(re.findall(r'[a-zA-Z]', text)) / text_len)
        ]

    def CreateBalancedDataset(self, X, y, n_batches):
        """This function returns a balanced dataset (a list of batched samples with 
           corresponding labels). The dataset is drawn with repetition from the initial 
           data and therefore can contain duplicates. Depending on n_batches, it will do 
           either undersampling, oversampling or a combination of both.

           Every group of samples produced by MakeMasks receives an equal share of the 
           batches (the remainder of the division goes to the first groups, so the shares 
           sum up to n_batches). Groups without any sample are left out.
        
          INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed text 
                     as elements
                 y - two dimensional np.array of shape (n_samples, n_labels) with 
                     classification labels (label != 0 is assumed to be "interesting" )
                 n_batches - integer, number of batches in dataset (so the number of samples 
                             in dataset is equal to n_batches * batch_size)
          OUTPUT:
                  dataset - shuffled list of dictionaries where dataset[i]['input'] is the 
                            i-th batch of inputs and dataset[i]['labels'] - the 
                            corresponding batch of labels"""
        # a mask that selects nothing cannot be sampled from, so it gets no share
        groups = [group for group in self.MakeMasks(y) if group['mask'].any()]

        if self.verbose >= 2: print('\n')

        if not groups:
            return []

        dataset = []
        for group, share in zip(groups, self._split_evenly(n_batches, len(groups))):
            if self.verbose >= 2:
                print('\tApplying mask: ' + group['name'] + '... ', end='')
            dataset += self.CreateDatasetFromXY(X[group['mask']], y[group['mask']], share)
            if self.verbose >= 2: print('Completed')

        return shuffle(dataset) if dataset else dataset

    def CreateDatasetFromXY(self, X, y, n_batches):
        """
        This function constructs and returns a dataset (a list of batched samples 
        with corresponding labels). All samples of one batch have the same number of 
        tokens, so no padding is needed.
        
          INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed 
                     text as elements
                 y - two dimensional np.array of shape (n_samples, n_labels) with 
                     classification labels
                 n_batches - integer, number of batches in dataset (so the number 
                             of samples in dataset is equal to n_batches * batch_size)
          OUTPUT:
                  dataset - list of dictionaries where dataset[i]['input'] is the i-th 
                            batch of token ids (LongTensor), dataset[i]['cl_features'] 
                            the batch of character level features (FloatTensor) and 
                            dataset[i]['labels'] the batch of labels (FloatTensor). 
                            Empty if X holds no text with at least one token
        
        """
        # we group our samples by the length of the text (in the number of tokens): 
        # texts of the same length end up in the same list of this dictionary
        len_table = dict()
        for text, labels in zip(X, y):
            codes, cl_features = self.encode(text)
            if not codes:
                # a text without tokens cannot form a sequence for the model
                continue
            len_table.setdefault(len(codes), []).append((codes, cl_features, labels))

        if not len_table:
            return []

        # we have a different number of samples for every length (a lot of samples 
        # of ~10-50 tokens and far fewer of 100+ tokens), so the length of a batch is 
        # drawn proportionally to the number of samples having that length
        lengths = np.array(list(len_table.keys()))
        counts = np.array([len(len_table[length]) for length in len_table])
        p = counts / np.sum(counts)

        dataset = []
        for _ in range(n_batches):
            bucket = len_table[int(np.random.choice(lengths, p=p))]
            # samples are drawn with repetition from the chosen bucket
            picked = [bucket[j]
                      for j in np.random.randint(0, len(bucket), self.batch_size)]

            # every column is converted separately: one array over the 
            # (codes, cl_features, labels) tuples would be ragged
            dataset.append({
                'input': self._to_variable(torch.LongTensor(
                    np.array([sample[0] for sample in picked], dtype=np.int64))),
                'cl_features': self._to_variable(torch.FloatTensor(
                    np.array([sample[1] for sample in picked], dtype=np.float32))),
                'labels': self._to_variable(torch.FloatTensor(
                    np.array([sample[2] for sample in picked], dtype=np.float32)))})

        return dataset

    def CreateTokenVocab(self, X, y):
        '''This function generates the word_to_id dictionary we use for encoding text
        
            INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed 
                       text as elements
                   y - two dimensional np.array of shape (n_samples, n_labels) with 
                       classification labels (label != 0 is assumed to be "interesting" - 
                       we prioritize tokens encountered in examples with at least one 
                       label = 1)
        
        '''
        # firstly we extract all tokens we see in positively labeled samples and in 
        # a random portion of "all-negative" samples of (at most) equal size
        n_positive_labels = np.sum(y, axis=1)
        positive_texts = X[n_positive_labels > 0]
        negative_texts = shuffle(X[n_positive_labels == 0])[:len(positive_texts)]
        # the texts are appended; `+=` on arrays would glue them together pairwise
        X_relevant = np.concatenate([positive_texts, negative_texts])

        token_freq = dict()
        for text in X_relevant:
            for token in self.Smart_Split(text):
                token_freq[token] = token_freq.get(token, 0) + 1

        # most frequent token first
        tokens = sorted(token_freq, key=token_freq.get)[::-1]

        # secondly, we assign ids to the most frequently encountered tokens. The last 
        # id (vocab_size - 1) is reserved for the UNKNOWN token
        n_ids = max(0, min(self.vocab_size - 1, len(tokens)))
        self.word_to_id = {token: i for i, token in enumerate(tokens[:n_ids])}

        if n_ids == 0:
            return

        # finally, we would like to find very similar tokens and assign the same id to 
        # them (those are mainly parsing inaccuracies, for example 'training' and 
        # 'training"')
        vec = TfidfVectorizer()
        vec_tokens = vec.fit_transform(tokens)
        same_tokens = ((vec_tokens * vec_tokens.T) > 0.99)
        rows, cols = same_tokens.nonzero()

        # only pairs of two tokens that both own an id are merged, therefore every 
        # assigned id stays below the UNKNOWN id
        mergeable = np.logical_and(rows < n_ids, cols < rows)
        for row, col in zip(rows[mergeable], cols[mergeable]):
            self.word_to_id[tokens[col]] = int(row)

    def Smart_Split(self, text):
        """Parsing function. Lowercases the text, separates the punctuation marks 
           , . ! ? ( ) : ; from the words (a '...' is kept as one token) and splits 
           on whitespace
            INPUT: text - python string with any text
            OUTPUT: list of strings, containing tokens
        """
        out = text.strip().lower().replace('\n', ' ')
        for mark in ',.!?)(:;':
            out = out.replace(mark, ' ' + mark + ' ')
        # three separated dots in a row are an ellipsis
        out = out.replace('.  .  .', '...')
        return out.split()

    def MakeMasks(self, y):
        """this function makes masks (bool np.arrays of length y). Each mask is 
        constructed so that X[mask] is a part of data grouped by some combination 
        of labels (for example - all data with all labels = 0, or all data with
        first class label = 1 and all other equal to 0, or all data with all 
        labels equal to 1)
            INPUT: y - np.array of shape [n_samples, n_classes], 6 classes are expected
            OUTPUT: masks - list of dictionaries, masks[i]['mask'] is a bool np.array 
                            of length y and masks[i]['name'] its description
        """
        n_positive = np.sum(y, axis=1)

        def only_label(i):
            """Mask of the samples with label i = 1 and all other labels = 0"""
            return np.logical_and(y[:, i] == 1, n_positive - y[:, i] == 0)

        # important: there is no data for label_1 = 1 and all others equal to 0, 
        # so that mask is skipped.
        # For 2, 3, 4, 5 and 6 positive labels we do not care which labels are 
        # equal to 1, just that there are exactly n of them
        masks = [{'mask':(n_positive == 0), 'name':'all-negative data'},
                 {'mask':only_label(0), 'name':'only fisrt class labeled positive'},
                 {'mask':only_label(2), 'name':'only third class labeled positive'},
                 {'mask':only_label(3), 'name':'only fourth class labeled positive'},
                 {'mask':only_label(4), 'name':'only fifth class labeled positive'},
                 {'mask':only_label(5), 'name':'only sixth class labeled positive'},
                 {'mask':(n_positive == 2), 'name':'exactly two positive labels'},
                 {'mask':(n_positive == 3), 'name':'exactly three positive labels'},
                 {'mask':(n_positive == 4), 'name':'exactly four positive labels'},
                 {'mask':(n_positive == 5), 'name':'exactly five positive labels'},
                 {'mask':(n_positive == 6), 'name':'all-positive data'}]

        if(self.verbose >= 2): print('\n\tMasks created (a reminder - no data for "only second class labeled positive")', end='')

        return masks

    def __getitem__(self, i):
        """returns the i-th batch of the dataset, corresponding to current mode"""
        return getattr(self, self._dataset_attr())[i]

    def __len__(self):
        """returns the number of batches in the dataset, corresponding to current mode"""
        return len(getattr(self, self._dataset_attr()))

    def shuffle(self):
        """shuffles dataset, corresponding to current mode"""
        attr = self._dataset_attr()
        setattr(self, attr, shuffle(getattr(self, attr)))
        

In [3]:
class ToxicTextsDatasetBinary(td.Dataset):
    """Pre-batched, class-balanced binary dataset of tokenised texts.

    Only one label column of the CSV file (selected by `label_index`) is used. 
    The file is read once, a token vocabulary is built from it, and three lists 
    of ready-made batches (train / test / validation) are materialised in the 
    constructor. Indexing the object returns one whole batch - a dict with the 
    keys 'input', 'cl_features' and 'labels' - taken from the list that is 
    selected by `mode`.
    """

    # maps every supported `mode` to the attribute holding its list of batches
    _DATASET_ATTR_BY_MODE = {'train': 'train_dataset',
                             'test': 'test_dataset',
                             'valid': 'valid_dataset'}

    def __init__(self, label_index,
                       data_path='train.csv',
                       n_train_batches=4000,
                       n_test_batches=4000,
                       n_valid_batches=1600,
                       separate_test_and_valid=True,
                       test_size=0.,
                       valid_size=0.3,
                       batch_size=6,
                       vocab_size=2000,
                       mode='train',
                       random_seed=None,
                       verbose=0,
                       use_cuda = True):
        """
        INPUT:
            label_index - int, index of the label to predict. The labels are read from 
                          column 2 + label_index of the csv file
            data_path - string, path of the csv file. Column 1 is read as the text
            n_train_batches - int, number of batches to be drawn from data for training
            n_test_batches -  int, number of batches to be drawn from data for testing
            n_valid_batches -  int, number of batches to be drawn from data for validation
            separate_test_and_valid - bool, whether to draw training, testing and validation 
                                      batches from separated parts of the data (True) or 
                                      all of them from the whole data (False, in which case 
                                      the three datasets may share samples)
            test_size - float from [0, 1], portion of the data reserved for the testing 
                        dataset. Not applicable if separate_test_and_valid=False
            valid_size - float from [0, 1], portion of the data reserved for the validation 
                         dataset. Not applicable if separate_test_and_valid=False
            batch_size - int, number of samples in one minibatch
            vocab_size - int, number of token ids. The most frequently encountered tokens 
                         get the ids 0 .. vocab_size - 2, every other token is encoded 
                         with the UNKNOWN id vocab_size - 1
            mode - string, one of ['train', 'test', 'valid']. Determines which dataset 
                   is used by ToxicTextsDatasetBinary[i], len() and shuffle()
            random_seed - int or None. If given, numpy's global random generator is seeded 
                          with it before any sampling happens
            verbose - int, 0 for no printed info, any other value for progress info
            use_cuda - bool, if True every batch tensor is moved to the GPU with .cuda()
        """
        super(ToxicTextsDatasetBinary, self).__init__()

        self.n_train_batches = n_train_batches
        self.n_test_batches = n_test_batches
        self.n_valid_batches = n_valid_batches
        self.separate_test_and_valid = separate_test_and_valid
        self.test_size = test_size
        self.valid_size = valid_size
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.mode = mode
        self.verbose = verbose
        self.use_cuda = use_cuda

        self.label_index = label_index

        if random_seed is not None:
            np.random.seed(random_seed)

        if verbose: print('Downloading data from ' + data_path + '... ', end='')
        # read csv file
        df = pd.read_csv(data_path)
        if verbose: print('Completed')

        # separate text from the single label column we are interested in
        X = np.array(df.iloc[:, 1])
        y = np.array(df.iloc[:, 2+label_index])

        if verbose: print('Generating vocabulary... ', end='')
        # generating vocabulary of tokens
        self.CreateTokenVocab(X, y)
        if verbose: print('Completed')

        # every dataset exists even when its portion of the data is empty
        self.train_dataset = []
        self.test_dataset = []
        self.valid_dataset = []

        if separate_test_and_valid:
            held_out = valid_size + test_size

            if held_out == 0:
                # nothing is reserved: train on everything, test/valid stay empty
                self.train_dataset = self._build_split('train', X, y, n_train_batches)
                return

            # split data into the training part and the held-out part
            X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=held_out)
            self.train_dataset = self._build_split('train', X_train, y_train,
                                                   n_train_batches)

            if test_size != 0 and valid_size != 0:
                X_test, X_valid, y_test, y_valid = train_test_split(
                    X_rest, y_rest, test_size=valid_size / held_out)
                self.test_dataset = self._build_split('test', X_test, y_test,
                                                      n_test_batches)
                self.valid_dataset = self._build_split('validation', X_valid, y_valid,
                                                       n_valid_batches)
            elif test_size == 0:
                # the whole held-out part is used for validation
                self.valid_dataset = self._build_split('validation', X_rest, y_rest,
                                                       n_valid_batches)
            else:
                # valid_size == 0: the whole held-out part is used for testing
                self.test_dataset = self._build_split('test', X_rest, y_rest,
                                                      n_test_batches)
        else:
            # all three datasets are drawn from the same, complete data
            self.train_dataset = self._build_split('train', X, y, n_train_batches)
            self.test_dataset = self._build_split('test', X, y, n_test_batches)
            self.valid_dataset = self._build_split('validation', X, y, n_valid_batches)

    def _build_split(self, name, X, y, n_batches):
        """Builds one balanced dataset and reports the progress if verbose is set."""
        if self.verbose: print('Creating ' + name + ' dataset... ', end='')
        dataset = self.CreateBalancedDataset(X, y, n_batches)
        if self.verbose: print('Completed')
        return dataset

    def _dataset_attr(self):
        """Returns the name of the attribute holding the dataset of the current mode."""
        try:
            return self._DATASET_ATTR_BY_MODE[self.mode]
        except KeyError:
            raise ValueError("mode must be one of 'train', 'test', 'valid', got %r"
                             % (self.mode,))

    def _to_variable(self, tensor):
        """Wraps a tensor into a non-trainable Variable, on the GPU if use_cuda is set."""
        variable = Variable(tensor, requires_grad=False)
        return variable.cuda() if self.use_cuda else variable

    @staticmethod
    def _split_evenly(n_batches, n_groups):
        """Splits n_batches into n_groups integer shares that sum up to n_batches."""
        base, extra = divmod(n_batches, n_groups)
        return [base + (1 if k < extra else 0) for k in range(n_groups)]

    def encode(self, text):
        """ function that splits text into tokens and returns a list of encodings for 
            each token 
                INPUT: text - python string
                OUTPUT: codes - list of integers (tokens missing from the vocabulary are 
                                encoded as vocab_size - 1, the UNKNOWN token), 
                        cl_features - list of floats (character level features)
        """
        unknown_id = self.vocab_size - 1
        codes = [self.word_to_id.get(token, unknown_id)
                 for token in self.Smart_Split(text)]
        return codes, self.ComputeCharacterLevelFeatures(text)

    def ComputeCharacterLevelFeatures(self, text):
        """This function computes character level features 
           INPUT: text - a python string
           OUTPUT: cl_features - a list of floats
               
               cl_features[0] - length of text
               cl_features[1] - mean of lengths of all tokens in text
               cl_features[2] - ratio of capital letters in text
               cl_features[3] - ratio of non-letter symbols in text

           An empty text yields all zeros, and a text without any token has a mean 
           token length of zero (instead of a division by zero / NaN).
        """
        text_len = float(len(text))
        if text_len == 0:
            return [0., 0., 0., 0.]

        token_lengths = [len(token) for token in self.Smart_Split(text)]
        mean_token_len = np.mean(token_lengths) if token_lengths else 0.

        return [
            text_len,
            mean_token_len,
            len(re.findall(r'[A-Z]', text)) / text_len,
            (1. - len(re.findall(r'[a-zA-Z]', text)) / text_len)
        ]

    def CreateBalancedDataset(self, X, y, n_batches):
        """This function returns a balanced dataset (a list of batched samples with 
           corresponding labels). The dataset is drawn with repetition from the initial 
           data and therefore can contain duplicates. Depending on n_batches, it will do 
           either undersampling, oversampling or a combination of both.

           The positive (y == 1) and the negative (y == 0) samples each receive half 
           of the batches (for an odd n_batches the positive samples get one batch 
           more). A class without any sample is left out.
        
          INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed text 
                     as elements
                 y - one dimensional np.array of shape (n_samples, ) with the binary 
                     classification label of every sample
                 n_batches - integer, number of batches in dataset (so the number of samples 
                             in dataset is equal to n_batches * batch_size)
          OUTPUT:
                  dataset - shuffled list of dictionaries where dataset[i]['input'] is the 
                            i-th batch of inputs and dataset[i]['labels'] - the 
                            corresponding batch of labels"""
        # a class without samples cannot be sampled from, so it gets no share
        groups = [mask for mask in ((y == 1), (y == 0)) if mask.any()]
        if not groups:
            return []

        dataset = []
        for mask, share in zip(groups, self._split_evenly(n_batches, len(groups))):
            dataset += self.CreateDatasetFromXY(X[mask], y[mask], share)

        return shuffle(dataset) if dataset else dataset

    def CreateDatasetFromXY(self, X, y, n_batches):
        """
        This function constructs and returns a dataset (a list of batched samples 
        with corresponding labels). All samples of one batch have the same number of 
        tokens, so no padding is needed.
        
          INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed 
                     text as elements
                 y - one dimensional np.array of shape (n_samples, ) with the binary 
                     classification label of every sample
                 n_batches - integer, number of batches in dataset (so the number 
                             of samples in dataset is equal to n_batches * batch_size)
          OUTPUT:
                  dataset - list of dictionaries where dataset[i]['input'] is the i-th 
                            batch of token ids (LongTensor), dataset[i]['cl_features'] 
                            the batch of character level features (FloatTensor) and 
                            dataset[i]['labels'] the batch of labels (LongTensor). 
                            Empty if X holds no text with at least one token
        
        """
        # we group our samples by the length of the text (in the number of tokens): 
        # texts of the same length end up in the same list of this dictionary
        len_table = dict()
        for text, label in zip(X, y):
            codes, cl_features = self.encode(text)
            if not codes:
                # a text without tokens cannot form a sequence for the model
                continue
            len_table.setdefault(len(codes), []).append((codes, cl_features, label))

        if not len_table:
            return []

        # we have a different number of samples for every length (a lot of samples 
        # of ~10-50 tokens and far fewer of 100+ tokens), so the length of a batch is 
        # drawn proportionally to the number of samples having that length
        lengths = np.array(list(len_table.keys()))
        counts = np.array([len(len_table[length]) for length in len_table])
        p = counts / np.sum(counts)

        dataset = []
        for _ in range(n_batches):
            bucket = len_table[int(np.random.choice(lengths, p=p))]
            # samples are drawn with repetition from the chosen bucket
            picked = [bucket[j]
                      for j in np.random.randint(0, len(bucket), self.batch_size)]

            # every column is converted separately: one array over the 
            # (codes, cl_features, label) tuples would be ragged
            dataset.append({
                'input': self._to_variable(torch.LongTensor(
                    np.array([sample[0] for sample in picked], dtype=np.int64))),
                'cl_features': self._to_variable(torch.FloatTensor(
                    np.array([sample[1] for sample in picked], dtype=np.float32))),
                'labels': self._to_variable(torch.LongTensor(
                    np.array([sample[2] for sample in picked], dtype=np.int64)))})

        return dataset

    def CreateTokenVocab(self, X, y):
        '''This function generates the word_to_id dictionary we use for encoding text
        
            INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed 
                       text as elements
                   y - one dimensional np.array of shape (n_samples, ) with the binary 
                       classification label of every sample (label = 1 is assumed to be 
                       "interesting" - we prioritize tokens encountered in positively 
                       labeled examples)
        
        '''
        # firstly we extract all tokens we see in positively labeled samples and in 
        # a random portion of negative samples of (at most) equal size
        positive_texts = X[y == 1]
        negative_texts = shuffle(X[y == 0])[:len(positive_texts)]
        # the texts are appended; `+=` on arrays would glue them together pairwise
        X_relevant = np.concatenate([positive_texts, negative_texts])

        token_freq = dict()
        for text in X_relevant:
            for token in self.Smart_Split(text):
                token_freq[token] = token_freq.get(token, 0) + 1

        # most frequent token first
        tokens = sorted(token_freq, key=token_freq.get)[::-1]

        # secondly, we assign ids to the most frequently encountered tokens. The last 
        # id (vocab_size - 1) is reserved for the UNKNOWN token
        n_ids = max(0, min(self.vocab_size - 1, len(tokens)))
        self.word_to_id = {token: i for i, token in enumerate(tokens[:n_ids])}

        if n_ids == 0:
            return

        # finally, we would like to find very similar tokens and assign the same id to 
        # them (those are mainly parsing inaccuracies, for example 'training' and 
        # 'training"')
        vec = TfidfVectorizer()
        vec_tokens = vec.fit_transform(tokens)
        same_tokens = ((vec_tokens * vec_tokens.T) > 0.99)
        rows, cols = same_tokens.nonzero()

        # only pairs of two tokens that both own an id are merged, therefore every 
        # assigned id stays below the UNKNOWN id
        mergeable = np.logical_and(rows < n_ids, cols < rows)
        for row, col in zip(rows[mergeable], cols[mergeable]):
            self.word_to_id[tokens[col]] = int(row)

    def Smart_Split(self, text):
        """Parsing function. Lowercases the text, separates the punctuation marks 
           , . ! ? ( ) : ; from the words (a '...' is kept as one token) and splits 
           on whitespace
            INPUT: text - python string with any text
            OUTPUT: list of strings, containing tokens
        """
        out = text.strip().lower().replace('\n', ' ')
        for mark in ',.!?)(:;':
            out = out.replace(mark, ' ' + mark + ' ')
        # three separated dots in a row are an ellipsis
        out = out.replace('.  .  .', '...')
        return out.split()

    def __getitem__(self, i):
        """returns the i-th batch of the dataset, corresponding to current mode"""
        return getattr(self, self._dataset_attr())[i]

    def __len__(self):
        """returns the number of batches in the dataset, corresponding to current mode"""
        return len(getattr(self, self._dataset_attr()))

    def shuffle(self):
        """shuffles dataset, corresponding to current mode"""
        attr = self._dataset_attr()
        setattr(self, attr, shuffle(getattr(self, attr)))
        

In [4]:
class LSTMClassifier(nn.Module):
    def __init__(self, 
                 vocab_size=20000, 
                 embedding_dim = 100, 
                 hidden_dim=64, 
                 batch_size=10, 
                 conv_channels=32, 
                 use_cuda=True,
                 num_of_cl_features=4):
        """
            A model from paper "A Convolutional Attention Model for Text Classification" 
            by Jiachen Du, Lin Gui, Ruifeng Xu, Yulan He 
            http://tcci.ccf.org.cn/conference/2017/papers/1057.pdf
            With modified outer layer (softmax -> sigmoid) for multilabel classification
            and added character level features

            INPUT:
                vocab_size - int, number of rows in the embedding table
                embedding_dim - int, size of every word embedding
                hidden_dim - int, size of the concatenated bidirectional LSTM output
                             (each direction gets hidden_dim // 2 units)
                batch_size - int, batch size of the pre-allocated initial LSTM state
                conv_channels - int, number of output channels of the attention convolution
                use_cuda - bool, whether layers and initial states are moved to the GPU
                num_of_cl_features - int, number of character level features concatenated
                                     to the sentence features before the output layer
        """
        super(LSTMClassifier, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.conv_channels = conv_channels
        self.use_cuda = use_cuda
        self.num_of_cl_features = num_of_cl_features

        self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        # // 2 is because we would like to concat hidden states,
        # calculated from both sides of LSTM and acquire exactly hidden_dim
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)

        self.conv = nn.Conv1d(in_channels=embedding_dim,
                              out_channels=conv_channels,
                              kernel_size=5,
                              padding=2)

        # NOTE(review): this layer is never used in forward(); it is kept so the
        # set of registered parameters stays the same as before.
        self.linear = nn.Linear(conv_channels, 1)
        self.linear_final = nn.Linear(hidden_dim + num_of_cl_features, 6) # we have 6 classes to predict

        if(self.use_cuda):
            # moves every layer registered above in one go
            self.cuda()

        self.init_hidden()

    def init_hidden(self):
        """Reset self.hidden to a pair (h_0, c_0) of zero states sized for self.batch_size."""
        def zero_state():
            # first dimension is 2: one layer, two directions
            state = Variable(torch.zeros(2, self.batch_size, self.hidden_dim // 2))
            return state.cuda() if self.use_cuda else state

        self.hidden = (zero_state(), zero_state())

    def forward(self, input_seq, cl_features=None):
        """Compute per-label probabilities for a batch of token id sequences.

            INPUT:
                input_seq - token ids fed to the embedding layer, batch first
                cl_features - optional character level features; its second dimension
                              has to equal num_of_cl_features
            OUTPUT:
                sigmoid activations of the 6 output units, one row per sample
            RAISES:
                ValueError - if cl_features has an unexpected number of columns or is
                             missing while the model was built with num_of_cl_features > 0
        """
        embed = self.embeddings(input_seq)

        # The prepared state only fits batches of self.batch_size. For any other batch
        # size pass None, which makes nn.LSTM start from zeros as well.
        hidden = self.hidden if self.hidden[0].size(1) == input_seq.size(0) else None
        output, _ = self.lstm(embed, hidden)

        # Conv1d wants (batch, channels, length), hence the permute
        conv_out = self.conv(embed.permute(0, 2, 1))

        # one attention weight per time step: average over the convolution channels
        attention_tensor = torch.mean(conv_out, dim=1)

        # weight every LSTM output by its attention value and sum over time
        features = torch.sum(output * attention_tensor.unsqueeze(2), dim=1)

        if(cl_features is not None and self.num_of_cl_features == cl_features.size(1)):
            features = torch.cat((features, cl_features), dim=1)
        elif(cl_features is not None and self.num_of_cl_features != cl_features.size(1)):
            raise ValueError("""Recieved unexpected number of character level features. 
                     Model expected to recieve {} features, but received {}. 
                     Check model constructor or sample passed in forward""".format(self.num_of_cl_features, cl_features.size(1)))
        elif(cl_features is None and self.num_of_cl_features > 0):
            raise ValueError("""Model expected to recieve {} features, but received None. 
                     Check model constructor or sample passed in forward""".format(self.num_of_cl_features))

        predictions = torch.sigmoid(self.linear_final(features))

        return predictions

In [None]:
# Search over vocabulary sizes: for every size the model is trained `cross_validation`
# times from scratch (fresh dataset draw, fresh model) and per-epoch statistics are
# collected in `train_stats`.
lr = 1e-3
weight_decay = 1e-5
cross_validation = 3  # number of repeated runs per vocabulary size

vocab_sizes = [3000, 5000, 7000]
embedding_dim = 200
hidden_dim = 100
conv_channels = 32

epochs = 15
batch_size = 6 # big batch sizes are not recommended, 
               # since a lot of batches have 1 or 2 samples, repeated batch_size times.
               # for now a batch_size of 5 to 15 seems reasonable
use_cuda = True

# One dict per vocabulary size. Every '*_losses' / 'val_f1_scores' entry holds one list
# per run; each of those lists has one value per epoch plus a trailing 0 placeholder.
train_stats = []

for vocab_size in tqdm.tqdm(vocab_sizes):
    train_stats.append({'vocab_sizes':vocab_size, 'train_losses':[], 'valid_losses':[], 'val_f1_scores':[]})
    
    for _ in range(cross_validation):

        dataset = ToxicTextsDataset(n_train_batches=3000, 
                                    n_test_batches=50, 
                                    n_valid_batches=1000,
                                    valid_size=0.3,
                                    test_size=0.,
                                    batch_size=batch_size, 
                                    vocab_size=vocab_size, 
                                    verbose=0,
                                    use_cuda = use_cuda)

        Multiple_gpus = False

        model = LSTMClassifier(vocab_size=vocab_size, 
                               embedding_dim = embedding_dim, 
                               hidden_dim=hidden_dim, 
                               conv_channels=conv_channels,
                               batch_size=batch_size, 
                               use_cuda=use_cuda)

        # todo:

        # if (Multiple_gpus and torch.cuda.device_count() > 1):
        #     print("Detected {} gpu's. Using {} of them.".format(torch.cuda.device_count(), torch.cuda.device_count()))
        #     model.num_gpus = torch.cuda.device_count()
        #     model = nn.DataParallel(model, dim=0)
        # else:
        #     Multiple_gpus = False

        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

        # The model already ends with a sigmoid, so the loss has to work on probabilities.
        # MultiLabelSoftMarginLoss expects raw scores and would apply a second sigmoid,
        # squashing every prediction into [0.5, 0.73] before the cross entropy.
        loss_function = nn.BCELoss()
        if(use_cuda): loss_function = loss_function.cuda()

        train_stats[-1]['train_losses'].append([0])
        train_stats[-1]['valid_losses'].append([0])
        train_stats[-1]['val_f1_scores'].append([0])

#         print('=========================================')
#         print("Start of the training.")
        start = time.time()

        for i in range(epochs):

            # Detached CPU copies of every validation batch; concatenated once per epoch
            batch_predictions = []
            batch_true_labels = []

            for mode in ['train', 'valid']:
                dataset.mode = mode
                dataset.shuffle()
                for sample in dataset:
                    if(mode == 'train'):
                        optimizer.zero_grad()

                    if(Multiple_gpus):
                        model.module.init_hidden()
                    else:
                        model.init_hidden()

                    pred = model.forward(sample['input'], sample['cl_features'])
                    loss = loss_function(pred, sample['labels'])

                    if(mode == 'train'):
                        loss.backward()
                        optimizer.step()
                        train_stats[-1]['train_losses'][-1][-1] += loss.data[0]
                    else:
                        train_stats[-1]['valid_losses'][-1][-1] += loss.data[0]

                        # .data drops the autograd graph, so validation batches
                        # do not pile up in memory during the epoch
                        batch_predictions.append(pred.data.cpu())
                        batch_true_labels.append(sample['labels'].data.cpu())

            if(batch_predictions):
                all_predictions = torch.cat(batch_predictions).numpy()
                all_true_labels = torch.cat(batch_true_labels).numpy()

                # a label counts as predicted when its probability exceeds 0.5
                all_predictions = (all_predictions - 0.5 > 0).astype(int)

                train_stats[-1]['val_f1_scores'][-1][-1] = f1_score(all_true_labels, all_predictions, average='weighted')
            
            
#             print('Epoch {:03d}; train loss = {:4.2f}; validation loss = {:2.2f}; validation F1 score = {:0.2f}; ETA = {:3.0f} s'.format(i, 
#                                                                              train_stats[-1]['train_losses'][-1][-1], 
#                                                                              train_stats[-1]['valid_losses'][-1][-1], 
#                                                                              train_stats[-1]['val_f1_scores'][-1][-1],
#                                                                             (epochs - i)*(time.time() - start)/(i+1)))
            # open the accumulator slot of the next epoch (leaves a trailing 0 after the last one)
            train_stats[-1]['train_losses'][-1].append(0)
            train_stats[-1]['valid_losses'][-1].append(0)
            train_stats[-1]['val_f1_scores'][-1].append(0)

In [11]:
def _summary_row(size, stats):
    """Return [vocab size, mean, std] of the validation F1 over the last five finished epochs."""
    # every per-run list ends with a 0 placeholder, so the last real epoch sits at index -2
    f1_tail = np.array(stats['val_f1_scores'])[:, -6:-1]
    return [size, np.mean(f1_tail), np.std(f1_tail)]

data = np.array([_summary_row(size, stats) for size, stats in zip(vocab_sizes, train_stats)])
df = pd.DataFrame(data, columns=['vocab_sizes', 'mean f1 score', 'std'])
df.to_csv('output.csv')

In [38]:
model_tmp = ClassifierBinary(0)

In [46]:
out = model_tmp.forward(tmp[1]['input'], tmp[1]['cl_features'])

In [47]:
out

Variable containing:
 0.0936  0.9064
 0.0510  0.9490
 0.0435  0.9565
 0.0318  0.9682
 0.1032  0.8968
 0.0978  0.9022
[torch.cuda.FloatTensor of size 6x2 (GPU 0)]

In [5]:
class ClassifierBinary(nn.Module):
    def __init__(self, 
                 label_index,
                 vocab_size=2000, 
                 embedding_dim = 100, 
                 hidden_dim=200, 
                 batch_size=6, 
                 conv_channels=32, 
                 use_cuda=True,
                 num_of_cl_features=4):
        """
            A model from paper "A Convolutional Attention Model for Text Classification" 
            by Jiachen Du, Lin Gui, Ruifeng Xu, Yulan He 
            http://tcci.ccf.org.cn/conference/2017/papers/1057.pdf
            With added character level features

            INPUT:
                label_index - passed to ToxicTextsDatasetBinary in train_ to select
                              the label this binary classifier is trained on
                vocab_size - int, number of rows in the embedding table
                embedding_dim - int, size of every word embedding
                hidden_dim - int, size of the concatenated bidirectional LSTM output
                             (each direction gets hidden_dim // 2 units)
                batch_size - int, batch size of the pre-allocated initial LSTM state
                conv_channels - int, number of output channels of the attention convolution
                use_cuda - bool, whether layers, states and inputs are moved to the GPU
                num_of_cl_features - int, number of character level features concatenated
                                     to the sentence features before the output layer
        """
        super(ClassifierBinary, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.conv_channels = conv_channels
        self.use_cuda = use_cuda
        self.num_of_cl_features = num_of_cl_features

        self.label_index = label_index

        self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        # // 2 is because we would like to concat hidden states,
        # calculated from both sides of LSTM and acquire exactly hidden_dim
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2,
                            num_layers=1, bidirectional=True, batch_first=True)

        self.conv = nn.Conv1d(in_channels=embedding_dim,
                              out_channels=conv_channels,
                              kernel_size=5,
                              padding=2)

        # NOTE(review): this layer is never used in forward(); it is kept so the
        # set of registered parameters stays the same as before.
        self.linear = nn.Linear(conv_channels, 1)
        self.linear_final = nn.Linear(hidden_dim + num_of_cl_features, 2)

        if(self.use_cuda):
            # moves every layer registered above in one go
            self.cuda()

        self.init_hidden()

    def init_hidden(self):
        """Reset self.hidden to a pair (h_0, c_0) of zero states sized for self.batch_size."""
        def zero_state():
            # first dimension is 2: one layer, two directions
            state = Variable(torch.zeros(2, self.batch_size, self.hidden_dim // 2))
            return state.cuda() if self.use_cuda else state

        self.hidden = (zero_state(), zero_state())

    def _logits(self, input_seq, cl_features=None):
        """Return the raw (pre-softmax) scores of the two classes; see forward for arguments."""
        embed = self.embeddings(input_seq)

        # The prepared state only fits batches of self.batch_size. For any other batch
        # size pass None, which makes nn.LSTM start from zeros as well.
        hidden = self.hidden if self.hidden[0].size(1) == input_seq.size(0) else None
        output, _ = self.lstm(embed, hidden)

        # Conv1d wants (batch, channels, length), hence the permute
        conv_out = self.conv(embed.permute(0, 2, 1))

        # one attention weight per time step: average over the convolution channels
        attention_tensor = torch.mean(conv_out, dim=1)

        # weight every LSTM output by its attention value and sum over time
        features = torch.sum(output * attention_tensor.unsqueeze(2), dim=1)

        if(cl_features is not None and self.num_of_cl_features == cl_features.size(1)):
            features = torch.cat((features, cl_features), dim=1)
        elif(cl_features is not None and self.num_of_cl_features != cl_features.size(1)):
            raise ValueError("""Recieved unexpected number of character level features. 
                     Model expected to recieve {} features, but received {}. 
                     Check model constructor or sample passed in forward""".format(self.num_of_cl_features, cl_features.size(1)))
        elif(cl_features is None and self.num_of_cl_features > 0):
            raise ValueError("""Model expected to recieve {} features, but received None. 
                     Check model constructor or sample passed in forward""".format(self.num_of_cl_features))

        return self.linear_final(features)

    def forward(self, input_seq, cl_features=None):
        """Compute class probabilities for a batch of token id sequences.

            INPUT:
                input_seq - token ids fed to the embedding layer, batch first
                cl_features - optional character level features; its second dimension
                              has to equal num_of_cl_features
            OUTPUT:
                softmax over the two output units, one row per sample
            RAISES:
                ValueError - if cl_features has an unexpected number of columns or is
                             missing while the model was built with num_of_cl_features > 0
        """
        predictions = nn.functional.softmax(self._logits(input_seq, cl_features), dim=1)

        return predictions

    def train_ (self, 
               n_train_batches=2000,
               n_valid_batches=500,
               lr = 1e-3,
               weight_decay = 1e-5,
               epochs = 15):
        """Build the dataset for self.label_index and fit the model on it.

            INPUT:
                n_train_batches, n_valid_batches - forwarded to ToxicTextsDatasetBinary
                lr, weight_decay - Adam hyperparameters
                epochs - int, number of passes over the dataset
            OUTPUT:
                None. The dataset stays available as self.dataset (predict needs it).

            TODO: validation loss / F1 tracking is not implemented, so no statistics
            are returned.
        """
        # NOTE(review): vocab_size, batch_size and use_cuda are not forwarded here, so the
        # dataset runs on its own defaults - confirm they agree with this model.
        self.dataset = ToxicTextsDatasetBinary(self.label_index, 
                                          n_train_batches=n_train_batches, 
                                          n_valid_batches=n_valid_batches)

        optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

        loss_function = nn.CrossEntropyLoss()
        if(self.use_cuda): loss_function = loss_function.cuda()

        for i in range(epochs):
            self.dataset.shuffle()
            for sample in self.dataset:
                optimizer.zero_grad()

                self.init_hidden()

                # CrossEntropyLoss applies log-softmax itself, so it has to be fed the
                # raw scores; feeding forward()'s probabilities applies softmax twice.
                logits = self._logits(sample['input'], sample['cl_features'])

                loss = loss_function(logits, sample['labels'])

                loss.backward()
                optimizer.step()

    def predict(self, X):
        """Return the probability of class 1 for X.

            INPUT:
                X - raw input handed to self.dataset.encode; train_ has to be called
                    first because it creates self.dataset
            OUTPUT:
                tensor with the second softmax column, one value per encoded sample
        """
        codes, cl_features = self.dataset.encode(X)

        Input = Variable(torch.LongTensor(np.array(codes)), 
                    requires_grad=False)

        cl_features = Variable(torch.FloatTensor(np.array(cl_features)), 
                    requires_grad=False)

        # NOTE(review): encode() is defined outside this class. If it returns a single
        # un-batched sample, add the batch axis the model needs - confirm against encode().
        if(Input.dim() == 1):
            Input = Input.unsqueeze(0)
        if(cl_features.dim() == 1):
            cl_features = cl_features.unsqueeze(0)

        if(self.use_cuda):
            Input = Input.cuda()
            cl_features = cl_features.cuda()

        pred = self.forward(Input, cl_features)

        return pred.data[:, 1]
                               
class EnsembleBinary(nn.Module):
    """Six ClassifierBinary models, one per label index 0..5, stored as classifier1..classifier6."""

    def __init__(self, 
                 vocab_size=2000, 
                 embedding_dim = 100, 
                 hidden_dim=200, 
                 batch_size=6, 
                 conv_channels=32, 
                 use_cuda=True,
                 num_of_cl_features=4):
        """Create the six binary classifiers; every argument is handed to each of them unchanged."""
        super(EnsembleBinary, self).__init__()

        shared_args = (vocab_size, embedding_dim, hidden_dim, batch_size,
                       conv_channels, use_cuda, num_of_cl_features)
        for label_index in range(6):
            # attribute numbering is 1-based, label indices are 0-based
            setattr(self, 'classifier{}'.format(label_index + 1),
                    ClassifierBinary(label_index, *shared_args))

    def train_(self, 
           n_train_batches=2000,
           n_valid_batches=500,
           lr = 1e-3,
           weight_decay = 1e-5,
           epochs = 15):
        """Train classifier1..classifier6 one after another.

            OUTPUT: tuple with the six return values of ClassifierBinary.train_

            NOTE(review): `epochs` is accepted but not used; the number of epochs
            per classifier is fixed by the schedule below.
        """
        epochs_per_classifier = (3, 3, 3, 3, 4, 5)

        return tuple(
            getattr(self, 'classifier{}'.format(number)).train_(n_train_batches, n_valid_batches,
                                                               lr, weight_decay, n_epochs)
            for number, n_epochs in enumerate(epochs_per_classifier, start=1)
        )

    def predict(self, X):
        """Stack the six classifiers' predictions for X and swap the first two axes,
        so that classifiers are indexed along axis 1 of the result."""
        per_classifier = tuple(getattr(self, 'classifier{}'.format(number)).predict(X)
                               for number in range(1, 7))

        return torch.stack(per_classifier).transpose(0, 1)
    
    

In [8]:
class Ensemble():
    """Averages the predictions of several independently trained EnsembleBinary models."""

    def __init__(self, n_classifiers_per_label=1):
        """Create and train n_classifiers_per_label EnsembleBinary models.

            Training happens right here in the constructor, with the default
            arguments of EnsembleBinary and EnsembleBinary.train_.
        """
        self.classifiers = []

        for i in range(n_classifiers_per_label):
            self.classifiers.append(EnsembleBinary())
            self.classifiers[-1].train_()
            print('trained {}-th ensemble of binary classifiers!'.format(i))

            # The training split is not needed any more; only the dataset object
            # itself has to stay, because ClassifierBinary.predict calls its encode().
            for number in range(1, 7):
                binary_classifier = getattr(self.classifiers[-1], 'classifier{}'.format(number))
                del binary_classifier.dataset.train_dataset

    def predict(self, X):
        """Return the element-wise mean of all members' predictions for X as a numpy array."""
        predictions = []

        for classifier in self.classifiers:
            # .cpu() first: .numpy() cannot be called on a CUDA tensor, and CUDA is
            # what the members produce with their default use_cuda=True
            predictions.append(classifier.predict(X).cpu().numpy())

        return np.mean(predictions, axis=0)

In [153]:
t = ToxicTextsDataset(n_train_batches=0, n_test_batches=0, n_valid_batches=0)

In [155]:
del t.train_dataset

In [10]:
E.predict('some text')

NameError: name 'dataset' is not defined

In [9]:
E = Ensemble()

trained 0-th ensemble of binary classifiers!


In [138]:
a = [torch.zeros(3), torch.ones(3), torch.ones(3)]

In [125]:
t = torch.zeros(3, 2)
for i in a:
    t += i

In [140]:
torch.mean(a)

TypeError: torch.mean received an invalid combination of arguments - got (list), but expected one of:
 * (torch.FloatTensor source)
      didn't match because some of the arguments have invalid types: ([31;1mlist[0m)
 * (torch.FloatTensor source, int dim)
 * (torch.FloatTensor source, int dim, bool keepdim)


In [127]:
t = t / 3

In [130]:
t = (t < 0.5)

In [133]:
t.numpy()

array([0, 0, 0], dtype=uint8)

In [89]:
EB = EnsembleBinary()

TypeError: __init__() got an unexpected keyword argument 'n_valid_batches'

In [90]:
EB.train_(n_valid_batches=0)

Epoch 000; train loss = 1235.71; validation loss = 0.00; validation F1 score = 0.00; ETA = 259 s
Epoch 001; train loss = 1141.01; validation loss = 0.00; validation F1 score = 0.00; ETA = 232 s


KeyboardInterrupt: 

In [59]:
data = ToxicTextsDatasetBinary(1)

In [62]:
data[0]['labels'].data.cpu()


 1
 1
 1
 1
 1
 1
[torch.LongTensor of size 6]

In [64]:
data[0]

{'cl_features': Variable containing:
  37.0000   2.9000   0.0270   0.2432
  51.0000   4.3000   0.0000   0.1765
  47.0000   4.0000   0.0213   0.1915
  47.0000   4.1000   0.0213   0.2340
  44.0000   3.7000   0.0682   0.2273
  43.0000   3.3000   0.0465   0.2558
 [torch.cuda.FloatTensor of size 6x4 (GPU 0)], 'input': Variable containing:
  1786   457   177    33    21   441    11    61   101     0
   653  1872   192    44    76    29   125    12   555     2
  1034  1999   125    74   753  1034  1999     2  1999     2
   568    19     3   211  1999   300     3    24   689    63
   304    99     7    24    47     2    29    71    31     2
   227   125  1872   277   142    76    38     7    72     2
 [torch.cuda.LongTensor of size 6x10 (GPU 0)], 'labels': Variable containing:
  1
  1
  1
  1
  1
  1
 [torch.cuda.LongTensor of size 6 (GPU 0)]}

In [134]:
pred = EB.classifier1.forward(data[0]['input'], data[0]['cl_features'])

In [95]:
 _, pred = torch.max(pred.data, 1)

In [96]:
 _, pred1 = torch.max(pred1.data, 1)

In [93]:
pred1 = EB.classifier2.forward(data[0]['input'], data[0]['cl_features'])

In [137]:
torch.max(pred.data, 1)

(
  1.0000
  1.0000
  0.9988
  1.0000
  0.9613
  0.9999
 [torch.cuda.FloatTensor of size 6 (GPU 0)], 
  0
  0
  1
  1
  1
  0
 [torch.cuda.LongTensor of size 6 (GPU 0)])

In [136]:
pred

Variable containing:
 1.0000  0.0000
 1.0000  0.0000
 0.0012  0.9988
 0.0000  1.0000
 0.0387  0.9613
 0.9999  0.0001
[torch.cuda.FloatTensor of size 6x2 (GPU 0)]

In [108]:
torch.stack((pred.cpu(), pred1.cpu())).transpose(0, 1)


 0  1
 0  0
 1  1
 1  0
 1  0
 0  1
[torch.LongTensor of size 6x2]

In [70]:
z = torch.zeros(1, 1)

In [79]:
torch.FloatTensor(pred.cpu().numpy())


 0
 0
 0
 1
 0
 0
[torch.FloatTensor of size 6]

In [85]:
torch.FloatTensor(torch.LongTensor([1]).numpy())


 1
[torch.FloatTensor of size 1]

In [80]:
torch.cat((z, torch.FloatTensor(pred.cpu().numpy())))


    0
    0
    0
    0
    1
    0
    0
[torch.FloatTensor of size 7x1]

In [57]:
a = torch.LongTensor(torch.ones(2, 3))
b = torch.LongTensor(torch.zeros(4, 3))
torch.cat((a, b))

TypeError: torch.LongTensor constructor received an invalid combination of arguments - got (torch.FloatTensor), but expected one of:
 * no arguments
 * (int ...)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (torch.LongTensor viewed_tensor)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (torch.Size size)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (torch.LongStorage data)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)
 * (Sequence data)
      didn't match because some of the arguments have invalid types: ([31;1mtorch.FloatTensor[0m)


In [6]:
# plt.figure(figsize=(16, 10))
# for i in range(9):
#     x = np.arange(10)
#     y = np.mean(np.array(train_stats[i]['val_f1_scores'])[:, :-1], axis=0)
#     std = np.std(np.array(train_stats[i]['val_f1_scores'])[:, :-1], axis=0)
        
#     plt.errorbar(x, y, yerr=std, label='lr = {}, wd = {}'.format(train_stats[i]['lr'], train_stats[i]['weight_decay']))

# plt.legend()

In [7]:
# from mpl_toolkits.mplot3d import Axes3D

# fig = plt.figure(figsize=(9, 9))
# ax = fig.gca(projection='3d')

# ax.view_init(30, 90)

# xx, yy = np.log(np.meshgrid(learning_rates, weight_decays))

# tmp_lr = {0.01:0, 0.001:1, 0.0001:2}
# tmp_wd = {0.001:0, 0.0001:1, 0.00001:2}

# z = np.zeros((3, 3))
# z_std = np.zeros((3, 3))
# for s in train_stats:
#     z[tmp_lr[s['lr']], tmp_wd[s['weight_decay']]] = np.mean(np.array(s['val_f1_scores'])[:, -6:-1])
#     z_std[tmp_lr[s['lr']], tmp_wd[s['weight_decay']]] = np.std(np.array(s['val_f1_scores'])[:, -6:-1])

# ax.plot_surface(xx, yy, z, cmap=plt.cm.coolwarm)

In [8]:
# data = np.vstack((np.exp(xx).flatten(), np.exp(yy).flatten(), z.flatten(), z_std.flatten())).T

In [9]:
# df = pd.DataFrame(data, columns=['learning rate', 'weight decay', 'mean f1 score', 'std'])

In [10]:
# df.to_csv('output.csv')