In [260]:
import pandas as pd
import numpy as np
import re

import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils import data as td

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, roc_auc_score

import tqdm
import time
import itertools

In [2]:
class ToxicTextsDatasetBinary(td.Dataset):
    """Balanced, pre-batched dataset for one binary label of the toxic-comments CSV.

    The constructor reads the CSV, builds a token vocabulary and materialises
    lists of ready-to-use mini-batches (``train_dataset``, ``test_dataset``,
    ``valid_dataset``). Every batch is a dict with the keys ``'input'``
    (token ids), ``'cl_features'`` (character level features) and ``'labels'``.
    Indexing, ``len()`` and ``shuffle()`` act on the list selected by ``mode``.
    """

    # maps the value of ``self.mode`` to the attribute holding that split
    _MODE_TO_ATTR = {'train': 'train_dataset',
                     'test': 'test_dataset',
                     'valid': 'valid_dataset'}

    def __init__(self, label_index,
                       data_path='train.csv', 
                       n_train_batches=4000, 
                       n_test_batches=4000,
                       n_valid_batches=1600,
                       separate_test_and_valid=True,
                       test_size=0.,
                       valid_size=0.3,
                       batch_size=6, 
                       vocab_size=2000,
                       mode='train',
                       random_seed=None,
                       verbose=0,
                       use_cuda = True):
        """
        INPUT:
            label_index - int, index of the target label; the label is read from 
                          column ``2 + label_index`` of the CSV file
            data_path - string, path of the CSV file (text is read from column 1)
            n_train_batches - int, number of batches to be drawn from data for training
            n_test_batches -  int, number of batches to be drawn from data for testing
            n_valid_batches -  int, number of batches to be drawn from data for validation
            separate_test_and_valid - bool, whether to draw training, testing and validation 
                                      batches from disjoint parts of the data (True) or all 
                                      from the whole data (False, the splits may then overlap)
            test_size - float from [0, 1], portion of the data reserved for the test 
                        split. Not applicable if separate_test_and_valid is False
            valid_size - float from [0, 1], portion of the data reserved for the validation 
                         split. Not applicable if separate_test_and_valid is False
            batch_size - int, number of samples in one minibatch
            vocab_size - int, number of token ids. The ``vocab_size - 1`` most frequent 
                         tokens get their own id, every other token is encoded as the 
                         UNKNOWN id ``vocab_size - 1``
            mode - string, one of ['train', 'test', 'valid']. Selects which split is 
                   served by ``dataset[i]``, ``len(dataset)`` and ``dataset.shuffle()``
            random_seed - int or None, seed passed to ``np.random.seed`` when not None
            verbose - int, any non-zero value prints progress messages
            use_cuda - bool, move every batch tensor to the GPU
        """
        super(ToxicTextsDatasetBinary, self).__init__()
        
        self.n_train_batches = n_train_batches
        self.n_test_batches = n_test_batches
        self.n_valid_batches = n_valid_batches
        self.separate_test_and_valid = separate_test_and_valid
        self.test_size = test_size
        self.valid_size = valid_size
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.mode = mode
        self.verbose = verbose
        self.use_cuda = use_cuda
        
        self.label_index = label_index
        
        if random_seed is not None:
            np.random.seed(random_seed)
        
        self._log('Downloading data from ' + data_path + '... ', end='')
        df = pd.read_csv(data_path)
        self._log('Completed')
        
        # separate text from the selected class label
        X = np.array(df.iloc[:, 1])
        y = np.array(df.iloc[:, 2 + label_index])
        
        self._log('Generating vocabulary... ', end='')
        self.CreateTokenVocab(X, y)
        self._log('Completed')
        
        if separate_test_and_valid:
            # hold out the part of the data used for testing and validation
            X_train, X_rest, y_train, y_rest = train_test_split(
                X, y, test_size=valid_size + test_size)
            self.train_dataset = self._build_split('train', X_train, y_train, 
                                                   n_train_batches)
            
            if test_size != 0 and valid_size != 0:
                X_test, X_valid, y_test, y_valid = train_test_split(
                    X_rest, y_rest, test_size=valid_size / (test_size + valid_size))
                self.test_dataset = self._build_split('test', X_test, y_test, 
                                                      n_test_batches)
                self.valid_dataset = self._build_split('validation', X_valid, y_valid, 
                                                       n_valid_batches)
            elif test_size == 0:
                # the whole held-out part is used for validation
                self.valid_dataset = self._build_split('validation', X_rest, y_rest, 
                                                       n_valid_batches)
                self.test_dataset = []
            else:
                # valid_size == 0: the whole held-out part is used for testing
                self.test_dataset = self._build_split('test', X_rest, y_rest, 
                                                      n_test_batches)
                self.valid_dataset = []
        else:
            # all three splits are drawn from the same data and may overlap
            self.train_dataset = self._build_split('train', X, y, n_train_batches)
            self.test_dataset = self._build_split('test', X, y, n_test_batches)
            self.valid_dataset = self._build_split('validation', X, y, n_valid_batches)

    def _log(self, message, **print_kwargs):
        """Prints a progress message when verbose output is enabled."""
        if self.verbose:
            print(message, **print_kwargs)

    def _build_split(self, title, X, y, n_batches):
        """Creates one balanced split and reports progress around it."""
        self._log('Creating ' + title + ' dataset... ', end='')
        split = self.CreateBalancedDataset(X, y, n_batches)
        self._log('Completed')
        return split

    def _active_attr(self):
        """Returns the attribute name of the split selected by ``self.mode``.

        Raises ValueError for an unknown mode: silently returning None would 
        make ``for batch in dataset`` loop forever, because iteration through 
        ``__getitem__`` only stops on IndexError.
        """
        try:
            return self._MODE_TO_ATTR[self.mode]
        except KeyError:
            raise ValueError("mode must be one of 'train', 'test', 'valid', "
                             "got {!r}".format(self.mode))

    def encode(self, text):
        """ Splits text into tokens and returns the id of each token together 
            with the character level features of the text
                INPUT: text - python string
                OUTPUT: codes - list of integers, 
                        cl_features - list of floats (character level features)
        """
        unknown_id = self.vocab_size - 1
        codes = [self.word_to_id.get(token, unknown_id) 
                 for token in self.Smart_Split(text)]
        return codes, self.ComputeCharacterLevelFeatures(text)
    
    def ComputeCharacterLevelFeatures(self, text):
        """Computes character level features of a text
           INPUT: text - a python string
           OUTPUT: cl_features - a list of floats
               
               cl_features[0] - length of text
               cl_features[1] - mean length of the tokens in text (0 if there are none)
               cl_features[2] - ratio of capital letters in text
               cl_features[3] - ratio of non-letter symbols in text
               
           An empty text yields four zeros instead of a division by zero.
        """
        text_len = float(len(text))
        if text_len == 0:
            return [0.0, 0.0, 0.0, 0.0]
        
        tokens = self.Smart_Split(text)
        # np.mean of an empty list is NaN, which would poison the model input
        mean_token_len = float(np.mean([len(token) for token in tokens])) if tokens else 0.0
        
        return [
            text_len,
            mean_token_len,
            len(re.findall(r'[A-Z]', text)) / text_len,
            (1. - len(re.findall(r'[a-zA-Z]', text)) / text_len)
        ]
    
    def CreateBalancedDataset(self, X, y, n_batches):
        """Returns a balanced dataset: half of the batches contain only samples 
           with label 1, the other half only samples with label 0. Samples are 
           drawn with repetition, so depending on n_batches this under- or 
           oversamples the data and the result can contain duplicates.
        
          INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed text 
                     as elements
                 y - one dimensional np.array of shape (n_samples, ) with the binary 
                     label (0 or 1) of every sample
                 n_batches - integer, requested number of batches; ``n_batches // 2`` 
                             batches are drawn per class
          OUTPUT:
                  dataset - shuffled list of batches as produced by CreateDatasetFromXY"""
        n_subbatches = n_batches // 2
        
        dataset = []
        for label_value in (1, 0):
            mask = (y == label_value)
            dataset += self.CreateDatasetFromXY(X[mask], y[mask], n_subbatches)
        
        # nothing to shuffle when no batch was requested or could be built
        return shuffle(dataset) if dataset else dataset
    
    def CreateDatasetFromXY(self, X, y, n_batches):
        """
        Constructs a dataset (a list of batched samples with corresponding labels). 
        Every batch only contains texts with the same number of tokens, so no 
        padding is required.
        
          INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed 
                     text as elements
                 y - one dimensional np.array of shape (n_samples, ) with 
                     classification labels
                 n_batches - integer, number of batches in the dataset
          OUTPUT:
                  dataset - list of dictionaries where dataset[i]['input'] is the i-th 
                            batch of token ids, dataset[i]['cl_features'] the character 
                            level features and dataset[i]['labels'] the labels. 
                            Empty if n_batches <= 0 or no usable sample exists.
        """
        if n_batches <= 0 or len(X) == 0:
            return []
        
        # group the encoded samples by their length in tokens
        samples_by_len = dict()
        for text, label in zip(X, y):
            codes, cl_features = self.encode(text)
            if not codes:
                # a text without tokens would give a zero-length sequence
                continue
            samples_by_len.setdefault(len(codes), []).append((codes, cl_features, label))
        
        if not samples_by_len:
            return []
        
        # short texts are far more common than long ones, so a length is drawn 
        # proportionally to the number of samples that have it
        lengths = np.array(list(samples_by_len.keys()))
        counts = np.array([len(samples_by_len[length]) for length in lengths], dtype=float)
        p = counts / counts.sum()
        
        dataset = []
        for _ in range(n_batches):
            bucket = samples_by_len[int(np.random.choice(lengths, p=p))]
            sample_indices = np.random.randint(0, len(bucket), self.batch_size)
            
            # the columns are collected one by one: an np.array over the 
            # (codes, features, label) tuples is ragged and rejected by current NumPy
            batch = {
                'input': Variable(torch.from_numpy(np.array(
                    [bucket[j][0] for j in sample_indices], dtype=np.int64)), 
                    requires_grad=False),
                'cl_features': Variable(torch.from_numpy(np.array(
                    [bucket[j][1] for j in sample_indices], dtype=np.float32)), 
                    requires_grad=False),
                'labels': Variable(torch.from_numpy(np.array(
                    [bucket[j][2] for j in sample_indices], dtype=np.int64)), 
                    requires_grad=False)}
            if self.use_cuda:
                batch = {key: value.cuda() for key, value in batch.items()}
                
            dataset.append(batch)        
        
        return dataset
    
    def CreateTokenVocab(self, X, y):
        '''Generates the word_to_id dictionary used for encoding text
        
            INPUT: X - one dimensional np.array of shape (n_samples, ) with unparsed 
                       text as elements
                   y - one dimensional np.array of shape (n_samples, ) with the binary 
                       label of every sample. Token frequencies are counted on all 
                       samples with label 1 plus an equally sized random portion of 
                       the samples with label 0
        '''
        positives = X[y == 1]
        negatives = X[y == 0]
        if len(negatives) > 0:
            negatives = shuffle(negatives)[:len(positives)]
        # the two sets have to be appended: ``+=`` on arrays of strings 
        # would glue the texts together element by element instead
        X_relevant = np.concatenate((positives, negatives))
         
        token_freq = dict()
        for text in X_relevant:
            for token in self.Smart_Split(text):
                token_freq[token] = token_freq.get(token, 0) + 1

        tokens = sorted(token_freq, key=token_freq.get)[::-1]

        # the most frequent tokens get the ids 0 .. n_known - 1; the id 
        # vocab_size - 1 stays reserved for the UNKNOWN token
        n_known = min(len(tokens), self.vocab_size - 1)
        self.word_to_id = dict()
        for token_id in range(n_known):
            self.word_to_id[tokens[token_id]] = token_id

        if not tokens:
            return

        # finally, tokens that are identical after TF-IDF tokenisation (mostly 
        # parsing leftovers such as 'training' and 'training"') share one id
        vec_tokens = TfidfVectorizer().fit_transform(tokens)
        same_tokens = (vec_tokens.dot(vec_tokens.T) > 0.99)
        rows, cols = same_tokens.nonzero()

        # only pairs below the diagonal whose row is a regular token id are used, 
        # so neither the UNKNOWN id nor an id outside the vocabulary is assigned
        usable = (rows > cols) & (rows < n_known)
        for row, col in zip(rows[usable], cols[usable]):
            self.word_to_id[tokens[col]] = int(row)
    
    def Smart_Split(self, text):
        """Parsing function: lower-cases the text, separates punctuation marks 
           from words and splits on whitespace
            INPUT: text - python string with any text
            OUTPUT: list of strings, containing tokens
        """
        out = text.strip().lower().replace('\n', ' ')
        for mark in ',.!?)(:;':
            out = out.replace(mark, ' ' + mark + ' ')
        # three dots in a row are restored as a single ellipsis token
        out = out.replace('.  .  .', '...')
        return out.split()

    def __getitem__(self, i):
        return getattr(self, self._active_attr())[i]
    
    def __len__(self):
        return len(getattr(self, self._active_attr()))

    def shuffle(self):
        """shuffles dataset, corresponding to current mode"""
        attr = self._active_attr()
        current = getattr(self, attr)
        if len(current) > 0:
            setattr(self, attr, shuffle(current))
        

In [255]:
class ClassifierBinary(nn.Module):
    """
        A model from paper "A Convolutional Attention Model for Text Classification" 
        by Jiachen Du, Lin Gui, Ruifeng Xu, Yulan He 
        http://tcci.ccf.org.cn/conference/2017/papers/1057.pdf
        With added character level features

        A bidirectional LSTM encodes the embedded tokens, a 1-D convolution over 
        the embeddings (averaged over its channels) weights the LSTM states, and 
        the weighted sum, concatenated with the character level features, is 
        mapped to the probabilities of the two classes.
    """
    def __init__(self, 
                 label_index,
                 vocab_size=2000, 
                 embedding_dim = 100, 
                 hidden_dim=200, 
                 batch_size=6, 
                 conv_channels=32, 
                 use_cuda=True,
                 num_of_cl_features=4):
        """
        INPUT:
            label_index - int, index of the label this classifier predicts 
                          (forwarded to ToxicTextsDatasetBinary in train_)
            vocab_size - int, number of rows of the embedding table
            embedding_dim - int, size of a token embedding
            hidden_dim - int, size of the concatenated bidirectional LSTM state
            batch_size - int, batch size used for the stored initial LSTM state
            conv_channels - int, number of channels of the attention convolution
            use_cuda - bool, keep the model and its state on the GPU
            num_of_cl_features - int, number of extra features appended to the 
                                 text features before the final layer
        """
        super(ClassifierBinary, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.conv_channels = conv_channels
        self.use_cuda = use_cuda
        self.num_of_cl_features = num_of_cl_features
        
        self.label_index = label_index
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim=embedding_dim)
        # // 2 because the hidden states of both directions are concatenated 
        # and should add up to exactly hidden_dim
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim // 2, 
                            num_layers=1, bidirectional=True, batch_first=True)
        self.conv = nn.Conv1d(in_channels=embedding_dim, 
                              out_channels=conv_channels, 
                              kernel_size=5, 
                              padding=2)
        # not used by forward(); kept so the set of parameters stays unchanged
        self.linear = nn.Linear(conv_channels, 1)
        self.linear_final = nn.Linear(hidden_dim + num_of_cl_features, 2)
        
        if self.use_cuda:
            self.cuda()
            
        self.init_hidden()
        
    def init_hidden(self):
        """Resets the stored initial LSTM state (hidden, cell) to zeros."""
        shape = (2, self.batch_size, self.hidden_dim // 2)
        hidden_state = Variable(torch.zeros(*shape))
        cell_state = Variable(torch.zeros(*shape))
        if self.use_cuda:
            hidden_state, cell_state = hidden_state.cuda(), cell_state.cuda()
        self.hidden = (hidden_state, cell_state)
    
    def forward(self, input_seq, cl_features=None, batched=True):
        """Computes class probabilities for a batch of encoded texts.

        INPUT:
            input_seq - LongTensor of token ids, one row per text
            cl_features - FloatTensor with num_of_cl_features columns, one row 
                          per text, or None if the model expects no extra features
            batched - kept for backward compatibility; inputs of any batch size 
                      are handled the same way
        OUTPUT:
            tensor with one row per text and two columns: the probabilities 
            of class 0 and class 1
        RAISES:
            ValueError if cl_features does not match num_of_cl_features
        """
        embed = self.embeddings(input_seq)
        
        # the stored zero state only fits inputs of exactly batch_size rows; 
        # for any other size the LSTM creates its own zero state (same values)
        if self.hidden[0].size(1) == embed.size(0):
            initial_state = self.hidden
        else:
            initial_state = None
        lstm_out, _ = self.lstm(embed, initial_state)
        
        conv_out = self.conv(embed.permute(0, 2, 1))
        
        # one attention weight per token: mean over the convolution channels
        attention_tensor = torch.mean(conv_out, dim=1)
        
        # every sample is weighted with its own LSTM states
        features = torch.sum(lstm_out * attention_tensor.unsqueeze(2), dim=1)
        
        if cl_features is not None:
            received = cl_features.data.shape[1]
            if received != self.num_of_cl_features:
                raise ValueError(
                    'Received unexpected number of character level features. '
                    'Model expected to receive {} features, but received {}. '
                    'Check model constructor or sample passed in forward'.format(
                        self.num_of_cl_features, received))
            features = torch.cat((features, cl_features), dim=1)
        elif self.num_of_cl_features > 0:
            raise ValueError(
                'Model expected to receive {} features, but received None. '
                'Check model constructor or sample passed in forward'.format(
                    self.num_of_cl_features))
            
        predictions = nn.functional.softmax(self.linear_final(features), dim=1)
        
        return predictions
    
    def _append_chain_predictions(self, X, features, fitted_clf, batched=True):
        """Appends the class-1 probability of every classifier in fitted_clf 
        as a new feature column, feeding each classifier the columns built so far."""
        if fitted_clf is None:
            return features
        for clf in fitted_clf:
            prediction = clf.forward(X, features, batched=batched).data[:, 1]
            features = torch.cat((features, Variable(prediction.unsqueeze(1))), dim=1)
        return features
    
    def train_ (self, 
               n_train_batches=2000,
               n_valid_batches=500,
               lr = 1e-3,
               weight_decay = 1e-5,
               epochs = 15,
               fitted_clf = None):
        """Creates the dataset for this classifier's label and fits the model.

        INPUT:
            n_train_batches, n_valid_batches - int, sizes of the created dataset
            lr, weight_decay - float, parameters of the Adam optimizer
            epochs - int, number of passes over the training batches
            fitted_clf - list of already fitted classifiers whose predictions 
                         are appended to the features, or None
        OUTPUT:
            dict with the lists 'train_losses', 'valid_losses' and 
            'val_f1_scores' (the latter holds the validation ROC AUC; NaN when 
            it is undefined). Each list has epochs + 1 entries, the last one 
            being an unused 0 placeholder.
        """
        self.dataset = ToxicTextsDatasetBinary(self.label_index, 
                                          n_train_batches=n_train_batches, 
                                          n_valid_batches=n_valid_batches,
                                              use_cuda = self.use_cuda)
                   
        optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

        # forward() already returns probabilities, so the cross entropy is the 
        # negative log likelihood of their logarithm; CrossEntropyLoss would 
        # apply a second softmax on top of them
        loss_function = nn.NLLLoss()
        if self.use_cuda:
            loss_function = loss_function.cuda()
            
        train_stats = {'train_losses': [0], 'valid_losses': [0], 'val_f1_scores': [0]}

        start = time.time()
        
        for epoch in range(epochs):
            predicted_labels = []
            true_labels = []

            for mode in ['train', 'valid']:
                self.dataset.mode = mode
                self.dataset.shuffle()
                for sample in self.dataset:
                    if mode == 'train':
                        optimizer.zero_grad()

                    self.init_hidden()
                    X = sample['input']
                    features = self._append_chain_predictions(X, sample['cl_features'], 
                                                              fitted_clf)

                    pred = self.forward(X, features)
                    loss = loss_function(torch.log(pred.clamp(min=1e-12)), sample['labels'])
                    # plain float of the loss, independent of whether the loss 
                    # is stored as a 0-dim or a 1-element tensor
                    loss_value = float(loss.data.view(-1)[0])
                    
                    if mode == 'train':
                        loss.backward()
                        optimizer.step()
                        train_stats['train_losses'][-1] += loss_value
                    else:
                        train_stats['valid_losses'][-1] += loss_value

                        _, pred_class = torch.max(pred.data, 1)
                        predicted_labels.append(pred_class.cpu().numpy().reshape(-1))
                        true_labels.append(sample['labels'].data.cpu().numpy().reshape(-1))

            # ROC AUC needs both classes among the true labels
            score = float('nan')
            if true_labels:
                y_true = np.concatenate(true_labels)
                if len(np.unique(y_true)) > 1:
                    score = roc_auc_score(y_true, np.concatenate(predicted_labels))
            train_stats['val_f1_scores'][-1] = score
            
            remaining_epochs = epochs - epoch - 1
            print('Epoch {:03d}; train loss = {:4.2f}; validation loss = {:2.2f}; validation roc_auc_score = {:0.2f}; ETA = {:3.0f} s'.format(
                epoch, 
                train_stats['train_losses'][-1], 
                train_stats['valid_losses'][-1], 
                train_stats['val_f1_scores'][-1],
                remaining_epochs * (time.time() - start) / (epoch + 1)))
            train_stats['train_losses'].append(0)
            train_stats['valid_losses'].append(0)
            train_stats['val_f1_scores'].append(0)
        
        return train_stats
    
    def predict(self, X, features=None, encoded=False, fitted_clf = None):
        """Returns the probability of class 1.

        INPUT:
            X - a python string (encoded=False) or a tensor of token ids with 
                one row per text (encoded=True)
            features - tensor of character level features, only used when 
                       encoded=True
            encoded - bool, whether X and features are already tensors. 
                      encoded=False requires a previous call of train_, which 
                      creates the dataset used for encoding
            fitted_clf - list of fitted classifiers whose predictions are 
                         appended to the features, or None
        OUTPUT:
            one dimensional tensor with one probability per text
        RAISES:
            ValueError if a raw text contains no tokens
        """
        if encoded:
            cl_features = self._append_chain_predictions(X, features, fitted_clf, 
                                                         batched=True)
            pred = self.forward(X, cl_features, batched=True)
        else:
            codes, cl_features = self.dataset.encode(X)
            if len(codes) == 0:
                raise ValueError('Cannot predict on a text without tokens')

            Input = Variable(torch.from_numpy(np.array(codes, dtype=np.int64)), 
                             requires_grad=False).view(1, len(codes))
            features = Variable(torch.from_numpy(np.array(cl_features, dtype=np.float32)), 
                                requires_grad=False).view(1, len(cl_features))
            if self.use_cuda:
                Input, features = Input.cuda(), features.cuda()
            
            cl_features = self._append_chain_predictions(Input, features, fitted_clf, 
                                                         batched=False)
            pred = self.forward(Input, cl_features, batched=False)
        
        return pred.data[:, 1]
    

In [257]:
class ChainBinary(nn.Module):
    """Classifier chain: one ClassifierBinary per label, trained in the order 
    given by ``order_``. Every classifier after the first one receives the 
    class-1 probabilities of all previously trained classifiers as additional 
    features.
    """
    def __init__(self, 
                 order = None,
                 vocab_size=2000, 
                 embedding_dim = 100, 
                 hidden_dim=200, 
                 batch_size=6, 
                 conv_channels=32, 
                 use_cuda=True,
                 num_of_cl_features=4,
                 epochs = [3,3,3,3,4,5]):
        """
        INPUT:
            order - sequence with a permutation of the label indices giving the 
                    training order, or None for a random permutation of 6 labels
            vocab_size, embedding_dim, hidden_dim, batch_size, conv_channels, 
            use_cuda - forwarded to every ClassifierBinary
            num_of_cl_features - int, number of features of the first classifier; 
                                 the classifier at position i expects i more
            epochs - sequence of ints, epochs[label] is the number of training 
                     epochs of the classifier for that label
        """
        super(ChainBinary, self).__init__()
        
        if order is None:
            self.order_ = np.random.permutation(range(6))
        else:
            # an array is required for the element-wise comparison in predict
            self.order_ = np.asarray(order)
            
        self.use_cuda = use_cuda
        self.epochs_ = list(epochs)
        
        # constructor arguments of the classifiers that are created in train_
        self.vocab_size_ = vocab_size
        self.embedding_dim_ = embedding_dim
        self.hidden_dim_ = hidden_dim
        self.batch_size_ = batch_size
        self.conv_channels_ = conv_channels
        self.num_of_cl_features_ = num_of_cl_features
        
        self.all_clfs = [] # All currently fitted binary classifiers
        self.all_clfs.append(self._make_classifier(0))
        
    def _make_classifier(self, position):
        """Creates the (untrained) classifier for the given chain position."""
        return ClassifierBinary(self.order_[position], self.vocab_size_, 
                                self.embedding_dim_, self.hidden_dim_, 
                                self.batch_size_, self.conv_channels_, self.use_cuda, 
                                self.num_of_cl_features_ + position)
         
    def train_(self, 
           n_train_batches=10,
           n_valid_batches=0,
           lr = 1e-3,
           weight_decay = 1e-5):
        """Trains the classifiers of the chain one after another.

        INPUT: forwarded to ClassifierBinary.train_ of every classifier
        OUTPUT: list with the training statistics of every classifier, 
                in chain order
        """
        # drop classifiers of a previous run so the chain does not grow
        del self.all_clfs[1:]
    
        train_stat = []
        first_label = self.order_[0]
        print('Training binary classifier for target {}...'.format(first_label))
        train_stat.append(self.all_clfs[0].train_(n_train_batches, n_valid_batches, lr, 
                                                  weight_decay, self.epochs_[first_label]))
       
        for position in range(1, len(self.order_)):
            label = self.order_[position]
            print('Training binary classifier for target  {}...'.format(label))
            self.all_clfs.append(self._make_classifier(position))
            # the new classifier sees the predictions of all earlier ones
            train_stat.append(self.all_clfs[-1].train_(n_train_batches, n_valid_batches, 
                                                       lr, weight_decay, 
                                                       self.epochs_[label], 
                                                       self.all_clfs[:-1]))
            
        print('Done!')    
        return train_stat
    
    def predict(self, X):
        """Predicts all labels for X.

        OUTPUT: tensor whose columns are ordered by label index (not by 
                training order); column i holds the output of the classifier 
                trained for label i
        """
        order = np.asarray(self.order_)
        
        per_label = []
        for label in range(len(order)):
            # position of the classifier responsible for this label
            position = int(np.where(order == label)[0][0])
            per_label.append(self.all_clfs[position].predict(
                X, fitted_clf = self.all_clfs[:position]))
        
        return torch.stack(per_label).transpose(0, 1)

In [258]:
class EnsembleChains():
    """Ensemble of ChainBinary models whose predictions are averaged.

    All chains are created and trained inside the constructor.
    """
    def __init__(self, n_chains=10, use_cuda = True, epochs = [3,3,3,3,4,5]):
        """
        INPUT:
            n_chains - int, number of chains to train
            use_cuda - bool, forwarded to every ChainBinary
            epochs - list of ints, forwarded to every ChainBinary
        """
        self.chains = []
        self.use_cuda = use_cuda
        self.epochs = epochs
        
        for _ in tqdm.tqdm_notebook(range(n_chains)):
            new_chain = ChainBinary(epochs = self.epochs, use_cuda = self.use_cuda)
            self.chains.append(new_chain)
            new_chain.train_()

            # the training batches are not needed any more once a chain is fitted
            for clf_position in range(6):
                del new_chain.all_clfs[clf_position].dataset.train_dataset
            
    def predict(self, X):
        """Returns the element-wise mean over all chains of the first row 
        of each chain's prediction for X."""
        per_chain = [chain.predict(X).cpu().numpy()[0] for chain in self.chains]
        return np.mean(per_chain, axis=0)

In [1]:
# Build and train the ensemble: 20 chains, 5 epochs for each of the 6 labels.
EC = EnsembleChains(n_chains=20, use_cuda=True, epochs=[5] * 6)

In [None]:
# Predict all six labels for every text of the test set and store them as CSV.
df = pd.read_csv('test.csv')
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

predicted_rows = []
for line in tqdm.tqdm_notebook(df.iloc[:, 1], total=len(df)):
    if isinstance(line, str) and line.strip():
        # EC.predict returns one value per label; the whole vector is one row
        predicted_rows.append(EC.predict(line))
    else:
        # texts without any token (e.g. a lone '\u2003') cannot be encoded
        predicted_rows.append([0.] * len(label_columns))

# build the frame once instead of growing it row by row
df_new = pd.DataFrame(predicted_rows, 
                      index=pd.Index(df.iloc[:, 0].astype(str), name='id'), 
                      columns=label_columns)

# the ids are written as the first column, named 'id'
df_new.to_csv('test_predictions.csv', index_label='id')

In [262]:
# Sanity check on a single raw string: the result is the mean over all chains,
# one value per label (presumably in the order of the label columns of
# train.csv - verify against the data file).
EC.predict('lol kek')

array([ 0.50045449,  0.68106574,  0.20780464,  0.62777185,  0.39182198,
        0.31945559], dtype=float32)