In [15]:
import time
import torch 
import numpy as np
from torch.utils.data import DataLoader, RandomSampler
from transformers import DistilBertTokenizer,DistilBertModel,AdamW, BertForSequenceClassification
from torch.utils.data import TensorDataset
import pandas as pd

In [16]:
print(torch.cuda.is_available())

True


#Data Imports, Cleaning, etc

In [17]:
train_df = pd.read_csv("data/jigsaw-toxic-comment-train.csv")

In [18]:
LABEL_COLUMNS = ["toxic"]
train_toxic = train_df[train_df[LABEL_COLUMNS].sum(axis=1) > 0]
train_clean = train_df[train_df[LABEL_COLUMNS].sum(axis=1) == 0]
train_toxic = train_toxic.drop(columns = ['severe_toxic','obscene', 'threat', 'insult', 'identity_hate'])
train_clean = train_clean.drop(columns = ['severe_toxic','obscene', 'threat', 'insult', 'identity_hate'])
concat_df = pd.concat([train_toxic, train_clean])
train_data = concat_df.sample(frac=1).reset_index(drop=True)
train_data.head()

Unnamed: 0,id,comment_text,toxic
0,94e9521f28329dd1,:Awesome! Thank you so much!,0
1,19f3078ecbccebeb,Ok but can u at leas block User: Gibraltarian ...,0
2,2dc194f8771b71dc,::Seems prudent. Thanks!,0
3,5daf3e5ba6db326e,"""Also: you seem completely unclear on the conc...",0
4,99f2381ed941124b,There are several definitions of Kinneret - wh...,0


In [19]:
test_labels_df = pd.read_csv('data/test_labels.csv')
test_df = pd.read_csv('data/test.csv')
test_df=test_df.drop(columns='id')
test_concat_df = pd.concat([test_labels_df,test_df], axis=1,join="inner")
test_concat_df = test_concat_df.drop(columns = ['lang'])
test_concat_df['comment_text'] = test_concat_df['content']
test_data = test_concat_df[['id','comment_text','toxic']].copy()
test_data.head()

Unnamed: 0,id,comment_text,toxic
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,0
1,1,"Вполне возможно, но я пока не вижу необходимо...",0
2,2,"Quindi tu sei uno di quelli conservativi , ...",1
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,0
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,0


In [20]:
val_df = pd.read_csv('data/validation.csv')
val_data = val_df.drop(columns = ['lang'])
val_data.head()

Unnamed: 0,id,comment_text,toxic
0,0,Este usuario ni siquiera llega al rango de ...,0
1,1,Il testo di questa voce pare esser scopiazzato...,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,0


In [83]:
val_data = val_data[:2000]
val_data.shape

(2000, 3)

In [84]:
train_data = train_data[:8000]
train_data.shape

(8000, 3)

In [85]:
test_data = test_data[:3000]
test_data.shape

(3000, 3)

In [86]:
test_data.head()

Unnamed: 0,id,comment_text,toxic
0,0,Doctor Who adlı viki başlığına 12. doctor olar...,0
1,1,"Вполне возможно, но я пока не вижу необходимо...",0
2,2,"Quindi tu sei uno di quelli conservativi , ...",1
3,3,Malesef gerçekleştirilmedi ancak şöyle bir şey...,0
4,4,:Resim:Seldabagcan.jpg resminde kaynak sorunu ...,0


In [87]:
val_data.head()

Unnamed: 0,id,comment_text,toxic
0,0,Este usuario ni siquiera llega al rango de ...,0
1,1,Il testo di questa voce pare esser scopiazzato...,0
2,2,Vale. Sólo expongo mi pasado. Todo tiempo pasa...,1
3,3,Bu maddenin alt başlığı olarak uluslararası i...,0
4,4,Belçika nın şehirlerinin yanında ilçe ve belde...,0


In [88]:
train_data.head()

Unnamed: 0,id,comment_text,toxic
0,148052a1f780a1e7,::That doesn't make even the slightest bit of ...,0
1,d1ebc8b7169bb282,"You know, if you don't tell me what I did wron...",0
2,ef63757759cfa1aa,Belief? \n\nIs RadioKirk the onlyone who beliv...,0
3,2c38f650668e090a,"""\n\n I suggest that you search the article fo...",0
4,e15f2f0010f19e5d,oh lol eleanor is a pure and utter gimp she li...,0


In [89]:
test_data.dtypes

id               int64
comment_text    object
toxic            int64
dtype: object

In [90]:
print(test_data.shape,
train_data.shape,
val_data.shape)

(3000, 3) (8000, 3) (2000, 3)


In [91]:
df1 = train_data
comments = df1.comment_text.values
labels = df1.toxic.values

print(comments), print(labels)

["::That doesn't make even the slightest bit of sense. That you think it looks shitty is not a justification for ignoring rules, so of course I wouldn't prefer it. You are flat wrong on this. It is more important the the text of quotations not be misleading than that Hell in a Bucket like the way something looks. It's weird enough that you refuse to understand the rather obvious reason why the brackets were there, but that you edit warred over it is incomprehensible. It's understandable when someone edit wars to try to enforce a guideline, but to consciously do so for no more reason than you don't like the guideline is hard to understand. -"
 "You know, if you don't tell me what I did wrong, I don't see what I can learn from it.76.10.75.168"
 'Belief? \n\nIs RadioKirk the onlyone who belives me???' ...
 '"\n\nLifespan\nPossibly this entry at the SSDI - born August 6, 1899, died January 1983.  Paul "'
 "Why bother with signing my name when you temporarily blocked me from articles on whi

(None, None)

In [92]:
df2 = val_data
comments2 = df2.comment_text.values
labels2 = df2.toxic.values

print(comments2), print(labels2)

['Este usuario ni siquiera llega al rango de    hereje   . Por lo tanto debería ser quemado en la barbacoa para purificar su alma y nuestro aparato digestivo mediante su ingestión.    Skipe linkin 22px   Honor, valor, leltad.      17:48 13 mar 2008 (UTC)'
 'Il testo di questa voce pare esser scopiazzato direttamente da qui. Immagino possano esserci problemi di copyright, nel fare cio .'
 'Vale. Sólo expongo mi pasado. Todo tiempo pasado fue mejor, ni mucho menos, yo no quisiera retroceder 31 años a nivel particular. Las volveria a pasar putas.Fernando '
 ...
 'Supongo que eso de que eres un adicto al porno, es un vandalismo hecho por otro. '
 'Que caciqueros, mesetarios, incultos y prepotentes...a mi me parecéis todos una banda de bastardos...Lo importante es joder, da igual la razón...porque tenéis los tanques, que si no. Ya lo dejaís bien clarito en muchas ocasiones que aunque imbéciles ( y lo de imbéciles queda reflejado dia tras dia en su mierda de Pais, dónde aun tienen un preside

(None, None)

In [93]:
df3 = test_data
test_comments = df3.comment_text.values
test_labels = df3.toxic.values

print(test_comments), print(test_labels)

['Doctor Who adlı viki başlığına 12. doctor olarak bir viki yazarı kendi adını eklemiştir. Şahsen düzelttim. Onaylarsanız sevinirim. Occipital '
 ' Вполне возможно, но я пока не вижу необходимости выделять материал в отдельную статью. Если про правосудие в СССР будет написано хотя бы килобайт 20-30 — тогда да, следует разделить. Пока же мы в итоге получим одну куцую статью Правосудие и другую не менее куцую статью Правосудие в СССР. Мне кажется, что этот вопрос вполне разумно решать на основе правил ВП:Размер статей? которые не предписывают разделения, пока размер статьи не достигнет хотя бы 50 тыс. знаков. '
 'Quindi tu sei uno di quelli   conservativi  , che preferiscono non cancellare. Ok. Avresti lasciato anche   sfaccimma  ? Si? Ok. Contento te... io non approvo per nulla, ma non conto nemmeno nulla... Allora lo sai che faccio? Me ne frego! (Aborro il fascismo, ma quando ce vo , ce vo !) Elborgo (sms) '
 ...
 'Polluer? ça fait des semaines que j ai bossé sur l article, le débat es

(None, None)

In [94]:
#MAX_LEN = 512
#TRAIN_BATCH_SIZE = 1
#VALID_BATCH_SIZE = 2
#EPOCHS = 1
#LEARNING_RATE = 1e-05
bert_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [95]:
print('Original: ',  comments[0])
print('Tokenized: ', tokenizer.tokenize(comments[0]))
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(comments[0])))

Original:  ::That doesn't make even the slightest bit of sense. That you think it looks shitty is not a justification for ignoring rules, so of course I wouldn't prefer it. You are flat wrong on this. It is more important the the text of quotations not be misleading than that Hell in a Bucket like the way something looks. It's weird enough that you refuse to understand the rather obvious reason why the brackets were there, but that you edit warred over it is incomprehensible. It's understandable when someone edit wars to try to enforce a guideline, but to consciously do so for no more reason than you don't like the guideline is hard to understand. -
Tokenized:  [':', ':', 'That', 'doesn', "'", 't', 'make', 'even', 'the', 'sl', '##ight', '##est', 'bit', 'of', 'sense', '.', 'That', 'you', 'think', 'it', 'looks', 'shi', '##tty', 'is', 'not', 'a', 'just', '##ification', 'for', 'ig', '##nor', '##ing', 'rules', ',', 'so', 'of', 'course', 'I', 'would', '##n', "'", 't', 'pre', '##fer', 'it', '

In [96]:
"""
input_ids = []
attention_masks = []

class ToxicCommentsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len): #: DistilBertTokenizer
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.len=len(data)

    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, comments):#, item):
        #comment_text = str(self.data.comment_text[item])
        print(comments)
        encoding = self.tokenizer.encode_plus(
        comments, 
        None,
        add_special_tokens=True,
        max_length=self.max_len, 
        return_token_type_ids=True,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
        #ids = encoding['input_ids']
        #mask = encoding['attention_mask']

        input_ids.append(encoding['input_ids']) 
        attention_masks.append(encoding['attention_mask'])
    """

'\ninput_ids = []\nattention_masks = []\n\nclass ToxicCommentsDataset(Dataset):\n    def __init__(self, data, tokenizer, max_len): #: DistilBertTokenizer\n        self.data = data\n        self.tokenizer = tokenizer\n        self.max_len = max_len\n        self.len=len(data)\n\n    def __len__(self):\n        return len(self.data)\n        \n    def __getitem__(self, comments):#, item):\n        #comment_text = str(self.data.comment_text[item])\n        print(comments)\n        encoding = self.tokenizer.encode_plus(\n        comments, \n        None,\n        add_special_tokens=True,\n        max_length=self.max_len, \n        return_token_type_ids=True,\n        padding="max_length",\n        truncation=True,\n        return_attention_mask=True,\n        return_tensors=\'pt\',\n    )\n        #ids = encoding[\'input_ids\']\n        #mask = encoding[\'attention_mask\']\n\n        input_ids.append(encoding[\'input_ids\']) \n        attention_masks.append(encoding[\'attention_mask\'])\n 

In [97]:
# train_size = 0.8
# train_dataset = train_data.sample(frac=train_size, random_state=123)
# test_dataset = test_data.drop(test_data.index).reset_index(drop=True)
# train_dataset = train_dataset.reset_index(drop=True)
"""
training_set = ToxicCommentsDataset(train_data,tokenizer,MAX_LEN)
validation_set = ToxicCommentsDataset(val_data,tokenizer,MAX_LEN)
testing_set = ToxicCommentsDataset(test_data,tokenizer,MAX_LEN)


print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))
print("VAL Dataset: {}".format(val_data.shape))
"""

'\ntraining_set = ToxicCommentsDataset(train_data,tokenizer,MAX_LEN)\nvalidation_set = ToxicCommentsDataset(val_data,tokenizer,MAX_LEN)\ntesting_set = ToxicCommentsDataset(test_data,tokenizer,MAX_LEN)\n\n\nprint("TRAIN Dataset: {}".format(train_data.shape))\nprint("TEST Dataset: {}".format(test_data.shape))\nprint("VAL Dataset: {}".format(val_data.shape))\n'

In [98]:
def maketensors(dataset,input_ids_list,attention_masks_list):
    
    for data in dataset:
        encoded_dict = tokenizer.encode_plus(
                        comment,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
        input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
        attention_masks.append(encoded_dict['attention_mask'])

In [99]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for comment in comments:
    encoded_dict = tokenizer.encode_plus(
                        comment,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', comments[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  ::That doesn't make even the slightest bit of sense. That you think it looks shitty is not a justification for ignoring rules, so of course I wouldn't prefer it. You are flat wrong on this. It is more important the the text of quotations not be misleading than that Hell in a Bucket like the way something looks. It's weird enough that you refuse to understand the rather obvious reason why the brackets were there, but that you edit warred over it is incomprehensible. It's understandable when someone edit wars to try to enforce a guideline, but to consciously do so for no more reason than you don't like the guideline is hard to understand. -
Token IDs: tensor([  101,   131,   131, 13646, 47798,   112,   188, 13086, 13246, 10105,
        38523, 27521, 13051, 17684, 10108, 15495,   119, 13646, 13028, 27874,
        10271, 59148, 57667, 30921, 10124, 10472,   169, 12820, 29748, 10142,
        23602, 36064, 10230, 23123,   117, 10380, 10108, 15348,   146, 10894,
        10115,   11

In [100]:
input_ids.shape

torch.Size([8000, 64])

In [101]:
attention_masks.shape

torch.Size([8000, 64])

In [102]:
labels.shape

torch.Size([8000])

In [103]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
val_input_ids = []
val_attention_masks = []

# For every sentence...
for comment in comments2:
    encoded_dict = tokenizer.encode_plus(
                        comment,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    # Add the encoded sentence to the list.    
    val_input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    val_attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
val_input_ids = torch.cat(val_input_ids, dim=0)
val_attention_masks = torch.cat(val_attention_masks, dim=0)
val_labels = torch.tensor(labels2)

# Print sentence 0, now as a list of IDs.
print('Original: ', comments2[0])
print('Token IDs:', val_input_ids[0])

Original:  Este usuario ni siquiera llega al rango de    hereje   . Por lo tanto debería ser quemado en la barbacoa para purificar su alma y nuestro aparato digestivo mediante su ingestión.    Skipe linkin 22px   Honor, valor, leltad.      17:48 13 mar 2008 (UTC)
Token IDs: tensor([  101, 12515, 82849, 10414, 10294, 39190, 10113, 39492, 10164, 39715,
        10104, 19353, 10381,   119, 12399, 10406, 12921, 96621, 10493, 29826,
        11272, 10110, 10109, 18121, 10537, 83592, 10220, 32385, 66240, 10198,
        39215,   193, 75036, 32500, 18010, 80592, 32413, 11244, 18229, 10198,
        11600, 32413, 11482,   119, 51874, 11355, 26192, 10245, 10306, 10410,
        10686, 26354,   117, 18094,   117, 10141, 92608,   119, 10273,   131,
        11300, 10249, 12318,   102])


In [104]:
val_attention_masks.shape

torch.Size([2000, 64])

In [105]:
val_input_ids.shape

torch.Size([2000, 64])

In [106]:
val_labels.shape

torch.Size([2000])

In [107]:
test_input_ids = []
test_attention_masks = []

# For every sentence...
for comment in test_comments:
    encoded_dict = tokenizer.encode_plus(
                        comment,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    test_input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    test_attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)
test_labels = torch.tensor(test_labels)

print('Original: ', test_comments[0])
print('Token IDs:', test_input_ids[0])

Original:  Doctor Who adlı viki başlığına 12. doctor olarak bir viki yazarı kendi adını eklemiştir. Şahsen düzelttim. Onaylarsanız sevinirim. Occipital 
Token IDs: tensor([   101,  17376,  14516,  19165,  56324,  10116,  24542,  91727,  10186,
           119,  26937,  11772,  10561,  56324,  10116,  82867,  10713,  32720,
         42702,  16334,  19343,  61716,  18330,    119, 102884,  10917,    172,
         78653,  12683,  10147,    119,  44798,  82350,  14434,  30471,  10126,
         60906,  23760,    119,    152,  28217,  55743,    102,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0])


In [108]:
"""
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)
testing_loader = DataLoader(testing_set, **test_params)

# Set the batch size.  
batch_size = 32  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
"""

"\ntrain_params = {'batch_size': TRAIN_BATCH_SIZE,\n                'shuffle': True,\n                'num_workers': 0\n                }\n\n\nval_params = {'batch_size': VALID_BATCH_SIZE,\n                'shuffle': True,\n                'num_workers': 0\n                }\n\n\ntest_params = {'batch_size': VALID_BATCH_SIZE,\n                'shuffle': True,\n                'num_workers': 0\n                }\n\ntraining_loader = DataLoader(training_set, **train_params)\nvalidation_loader = DataLoader(validation_set, **val_params)\ntesting_loader = DataLoader(testing_set, **test_params)\n\n# Set the batch size.  \nbatch_size = 32  \n\n# Create the DataLoader.\nprediction_data = TensorDataset(input_ids, attention_masks, labels)\nprediction_sampler = SequentialSampler(prediction_data)\nprediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)\n"

In [109]:
dataset = TensorDataset(input_ids, attention_masks, labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
test_dataset = TensorDataset(test_input_ids,test_attention_masks,test_labels)

train_dataloader = DataLoader(
            dataset,  # The training samples.
            sampler = RandomSampler(dataset), # Select batches randomly
            batch_size = 16 # Trains with this batch size.
        )

validation_dataloader = DataLoader(
            val_dataset,  # The training samples.
            sampler = RandomSampler(val_dataset), # Select batches randomly
            batch_size = 16 # Trains with this batch size.
        )

testing_dataloader = DataLoader(
            test_dataset,  # The training samples.
            sampler = RandomSampler(test_dataset), # Select batches randomly
            batch_size = 32 # Trains with this batch size.
        )

In [110]:
print(type(input_ids),type(attention_masks),type(labels))

<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [111]:
print(type(val_input_ids),type(val_attention_masks),type(val_labels))

<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [112]:
for batch in validation_dataloader:
    print(batch[0])

tensor([[  101, 42272, 42998,  ..., 10310, 84572,   102],
        [  101, 11045, 61197,  ..., 64424, 28581,   102],
        [  101, 33127, 10123,  ..., 10143, 10549,   102],
        ...,
        [  101, 64831, 11244,  ...,   114, 69579,   102],
        [  101, 11045, 10428,  ...,     0,     0,     0],
        [  101, 31301, 48985,  ..., 10305,   117,   102]])
tensor([[  101, 13740, 22037,  ..., 12216, 10369,   102],
        [  101,   246,   151,  ..., 59879, 11273,   102],
        [  101, 11045, 11381,  ..., 24446, 10343,   102],
        ...,
        [  101, 75294, 10133,  ...,     0,     0,     0],
        [  101, 11916, 13785,  ..., 10104, 81695,   102],
        [  101, 30247, 29392,  ...,   119,   102,     0]])
tensor([[   101,  11045,  10228,  ..., 107054,  10125,    102],
        [   101,  10734, 100025,  ...,  18938,  79118,    102],
        [   101,  10734, 100025,  ...,      0,      0,      0],
        ...,
        [   101,  42272,  42998,  ...,    119,  12845,    102],
       

In [113]:
print(len(validation_dataloader))

125


In [114]:
print(val_dataset)

<torch.utils.data.dataset.TensorDataset object at 0x000001592A9452E0>


In [115]:
print(len(train_dataloader))

500


In [116]:
"""
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        print("A")
        inp = tokenizer("Hello, my dog is cute", return_tensors="pt")
        output_1 = self.l1(**inp)#(input_ids, attention_mask)
        print("B")
        hidden_state = output_1[0]
        print("C")
        pooler = hidden_state[:, 0]
        print("D")
        pooler = self.pre_classifier(pooler)
        print("E")
        pooler = torch.nn.ReLU()(pooler)
        print("F")
        pooler = self.dropout(pooler)
        print("G")
        output = self.classifier(pooler)
        print("H")
        print("down")
        print("output: ", output)
        return output
    """

'\nclass DistillBERTClass(torch.nn.Module):\n    def __init__(self):\n        super(DistillBERTClass, self).__init__()\n        self.l1 = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")\n        self.pre_classifier = torch.nn.Linear(768, 768)\n        self.dropout = torch.nn.Dropout(0.3)\n        self.classifier = torch.nn.Linear(768, 2)\n\n    def forward(self, input_ids, attention_mask):\n        print("A")\n        inp = tokenizer("Hello, my dog is cute", return_tensors="pt")\n        output_1 = self.l1(**inp)#(input_ids, attention_mask)\n        print("B")\n        hidden_state = output_1[0]\n        print("C")\n        pooler = hidden_state[:, 0]\n        print("D")\n        pooler = self.pre_classifier(pooler)\n        print("E")\n        pooler = torch.nn.ReLU()(pooler)\n        print("F")\n        pooler = self.dropout(pooler)\n        print("G")\n        output = self.classifier(pooler)\n        print("H")\n        print("down")\n        print("output: ",

In [117]:
torch.cuda.set_device(0)

In [118]:
device = torch.device("cpu")
#model = DistilBertModel.from_pretrained("distilbert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.to(device)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['distilbert.transformer.layer.3.attention.v_lin.weight', 'distilbert.transformer.layer.5.output_layer_norm.weight', 'distilbert.embeddings.LayerNorm.weight', 'distilbert.transformer.layer.2.attention.q_lin.weight', 'distilbert.transformer.layer.5.ffn.lin2.bias', 'distilbert.transformer.layer.3.attention.k_lin.weight', 'distilbert.transformer.layer.3.attention.v_lin.bias', 'distilbert.transformer.layer.3.output_layer_norm.bias', 'distilbert.transformer.layer.1.attention.k_lin.weight', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'distilbert.transformer.layer.2.attention.v_lin.bias', 'distilbert.transformer.layer.1.output_layer_norm.weight', 'distilbert.transformer.layer.4.sa_layer_norm.bias', 'd

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [119]:
loss_function = torch.nn.BCELoss()
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


In [120]:
#def calcuate_accu(big_idx, targets): #calculate accuracy of the model
    #n_correct = (big_idx==targets).sum().item()
    #return n_correct

In [121]:
"""
#def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        print(list(data.keys()))
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)
        # ids = data['ids'].cuda().long()
        # mask = data['mask'].cuda().long()
        # targets = data['targets'].cuda().long()
        outputs = model(ids, mask)
        loss = loss_function(outputs,targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        #nb_tr_steps += 1
        #nb_tr_examples+=targets.size(0)
        
        #if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 
"""

'\n#def train(epoch):\n    tr_loss = 0\n    n_correct = 0\n    nb_tr_steps = 0\n    nb_tr_examples = 0\n    model.train()\n    for _,data in enumerate(training_loader, 0):\n        print(list(data.keys()))\n        ids = data[\'ids\'].to(device, dtype = torch.long)\n        mask = data[\'mask\'].to(device, dtype = torch.long)\n        targets = data[\'targets\'].to(device, dtype = torch.long)\n        # ids = data[\'ids\'].cuda().long()\n        # mask = data[\'mask\'].cuda().long()\n        # targets = data[\'targets\'].cuda().long()\n        outputs = model(ids, mask)\n        loss = loss_function(outputs,targets)\n        tr_loss += loss.item()\n        big_val, big_idx = torch.max(outputs.data, dim=1)\n        n_correct += calcuate_accu(big_idx, targets)\n\n        #nb_tr_steps += 1\n        #nb_tr_examples+=targets.size(0)\n        \n        #if _%5000==0:\n            loss_step = tr_loss/nb_tr_steps\n            accu_step = (n_correct*100)/nb_tr_examples \n            print(f"Tra

In [122]:
"""
for epoch in range(3):
    epoch_loss = 0.0
    for (b_ix, batch) in enumerate(train_dataloader):
      print(batch)
      optimizer.zero_grad()
      ids = batch['input_ids']        # tensor
      mask = batch[1]  # tensor
      lbls = batch[2]               # tensor
      outputs = model(ids,attention_mask=mask, targets=lbls)
      loss = outputs[0]
      epoch_loss += loss.item()  # accumulate batch loss
      loss.backward()
      optimizer.step()
      if b_ix % 5 == 0:  # 200 items is 20 batches of 10
        print(" batch = %5d curr batch loss = %0.4f " % \
        (b_ix, loss.item()))
      # if b_ix >= xx: break  # to save time for demo
    print("end epoch = %4d  epoch loss = %0.4f " % \
      (epoch, epoch_loss))
    print("Training complete ")
"""

'\nfor epoch in range(3):\n    epoch_loss = 0.0\n    for (b_ix, batch) in enumerate(train_dataloader):\n      print(batch)\n      optimizer.zero_grad()\n      ids = batch[\'input_ids\']        # tensor\n      mask = batch[1]  # tensor\n      lbls = batch[2]               # tensor\n      outputs = model(ids,attention_mask=mask, targets=lbls)\n      loss = outputs[0]\n      epoch_loss += loss.item()  # accumulate batch loss\n      loss.backward()\n      optimizer.step()\n      if b_ix % 5 == 0:  # 200 items is 20 batches of 10\n        print(" batch = %5d curr batch loss = %0.4f " %         (b_ix, loss.item()))\n      # if b_ix >= xx: break  # to save time for demo\n    print("end epoch = %4d  epoch loss = %0.4f " %       (epoch, epoch_loss))\n    print("Training complete ")\n'

In [123]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [124]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [125]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [126]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [127]:
import random
import numpy as np

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Perform a forward pass (evaluate the model on this training batch).
        # In PyTorch, calling `model` will in turn call the model's `forward` 
        # function and pass down the arguments. The `forward` function is 
        # documented here: 
        # https://huggingface.co/transformers/model_doc/bert.html#bertforsequenceclassification
        # The results are returned in a results object, documented here:
        # https://huggingface.co/transformers/main_classes/output.html#transformers.modeling_outputs.SequenceClassifierOutput
        # Specifically, we'll get the loss (because we provided labels) and the
        # "logits"--the model outputs prior to activation.
        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoc

    for batch in validation_dataloader:
        
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using 
        # the `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():        

            # Forward pass, calculate logit predictions.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        # Get the loss and "logits" output by the model. The "logits" are the 
        # output values prior to applying an activation function like the 
        # softmax.
        loss = result.loss
        logits = result.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    40  of    500.    Elapsed: 0:03:58.
  Batch    80  of    500.    Elapsed: 0:07:41.
  Batch   120  of    500.    Elapsed: 0:11:22.
  Batch   160  of    500.    Elapsed: 0:15:02.
  Batch   200  of    500.    Elapsed: 0:18:42.
  Batch   240  of    500.    Elapsed: 0:22:21.
  Batch   280  of    500.    Elapsed: 0:26:00.
  Batch   320  of    500.    Elapsed: 0:29:39.
  Batch   360  of    500.    Elapsed: 0:33:18.
  Batch   400  of    500.    Elapsed: 0:36:57.
  Batch   440  of    500.    Elapsed: 0:40:35.
  Batch   480  of    500.    Elapsed: 0:44:14.

  Average training loss: 0.29
  Training epcoh took: 0:46:03

Running Validation...
  Accuracy: 0.85
  Validation Loss: 0.48
  Validation took: 0:03:02

Training...
  Batch    40  of    500.    Elapsed: 0:03:38.
  Batch    80  of    500.    Elapsed: 0:07:16.
  Batch   120  of    500.    Elapsed: 0:10:55.
  Batch   160  of    500.    Elapsed: 0:14:33.
  Batch   200  of    500.    Elapsed: 0:18:12.
  Batch   240  of    5

In [128]:
import pandas as pd

# Display floats with two decimal places.
pd.set_option('precision', 2)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur.,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.29,0.48,0.85,0:46:03,0:03:02
2,0.2,0.8,0.63,0:45:31,0:03:01
3,0.17,0.59,0.82,0:45:25,0:02:59
4,0.14,0.71,0.78,0:45:23,0:03:00


In [129]:
import matplotlib.pyplot as plt
% matplotlib inline

import seaborn as sns

# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4])

plt.show()

UsageError: Line magic function `%` not found.


In [130]:
print('Predicting labels for {:,} test sentences...'.format(len(test_input_ids)))

# Put model in evaluation mode
model.eval()
t0 = time.time()
# Tracking variables 
predictions , true_labels = [], []

# Predict 
for step,batch in enumerate(testing_dataloader):
        # Progress update every 40 batches.
  if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
    elapsed = format_time(time.time() - t0)
    print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(testing_dataloader), elapsed))

  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')


Predicting labels for 3,000 test sentences...
  Batch    40  of     94.    Elapsed: 0:01:48.
  Batch    80  of     94.    Elapsed: 0:03:37.
    DONE.


In [131]:
print('Toxic samples: %d of %d (%.2f%%)' % (df3.toxic.sum(), len(df3.toxic), (df3.toxic.sum() / len(df3.toxic) * 100.0)))

Toxic samples: 699 of 3000 (23.30%)


In [132]:
print(predictions)

[array([[ 0.24871208, -0.5706789 ],
       [ 0.9866019 , -1.3104534 ],
       [ 2.6334898 , -2.9709299 ],
       [ 1.1350842 , -1.4524695 ],
       [ 1.9563595 , -2.3189394 ],
       [ 3.08615   , -3.36703   ],
       [ 0.0901994 , -0.39536664],
       [ 3.0230162 , -3.316258  ],
       [ 2.1108963 , -2.4529583 ],
       [ 0.00368916, -0.28003964],
       [ 2.827805  , -3.1599944 ],
       [ 2.5646427 , -2.9203513 ],
       [ 1.5777411 , -1.9322644 ],
       [ 2.0924344 , -2.45437   ],
       [-0.3094897 ,  0.05432946],
       [-0.4735841 ,  0.21962158],
       [ 2.986885  , -3.2851648 ],
       [ 0.74665934, -1.0782636 ],
       [ 2.8025713 , -3.150106  ],
       [ 2.8188126 , -3.1432073 ],
       [ 0.33537692, -0.6387907 ],
       [ 2.4267008 , -2.8125076 ],
       [ 0.5340896 , -0.8449312 ],
       [ 1.6906191 , -2.0481403 ],
       [ 2.4781735 , -2.836065  ],
       [ 1.4660608 , -1.8114294 ],
       [-0.12761389, -0.14030115],
       [ 2.169318  , -2.550057  ],
       [ 1.8786501 

In [133]:
print(len(predictions),len(true_labels))

94 94


In [134]:
print((true_labels[0][0]))
print((true_labels[93]))


1
[0 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1]


In [135]:
print((predictions[0][0]))
print((predictions[0]))


[ 0.24871208 -0.5706789 ]
[[ 0.24871208 -0.5706789 ]
 [ 0.9866019  -1.3104534 ]
 [ 2.6334898  -2.9709299 ]
 [ 1.1350842  -1.4524695 ]
 [ 1.9563595  -2.3189394 ]
 [ 3.08615    -3.36703   ]
 [ 0.0901994  -0.39536664]
 [ 3.0230162  -3.316258  ]
 [ 2.1108963  -2.4529583 ]
 [ 0.00368916 -0.28003964]
 [ 2.827805   -3.1599944 ]
 [ 2.5646427  -2.9203513 ]
 [ 1.5777411  -1.9322644 ]
 [ 2.0924344  -2.45437   ]
 [-0.3094897   0.05432946]
 [-0.4735841   0.21962158]
 [ 2.986885   -3.2851648 ]
 [ 0.74665934 -1.0782636 ]
 [ 2.8025713  -3.150106  ]
 [ 2.8188126  -3.1432073 ]
 [ 0.33537692 -0.6387907 ]
 [ 2.4267008  -2.8125076 ]
 [ 0.5340896  -0.8449312 ]
 [ 1.6906191  -2.0481403 ]
 [ 2.4781735  -2.836065  ]
 [ 1.4660608  -1.8114294 ]
 [-0.12761389 -0.14030115]
 [ 2.169318   -2.550057  ]
 [ 1.8786501  -2.2302272 ]
 [ 2.0285375  -2.4051163 ]
 [ 0.59976137 -0.9301604 ]
 [ 2.5813186  -2.9546692 ]]


In [136]:
#for item in true_labels:
    #true_labels.extend(item.split())

print(true_labels[0])
print(type(true_labels[0]))
print(true_labels)

[1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0]
<class 'numpy.ndarray'>
[array([1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0], dtype=int64), array([0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64), array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64), array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1], dtype=int64), array([1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0], dtype=int64), array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 0], dtype=int64), array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64), array([0, 0, 0, 0, 0, 0, 0, 1, 0, 

In [137]:
for i in range(len(true_labels)):
    true_labels[i] = true_labels[i].tolist()

In [150]:
test_labels = test_labels.tolist()

In [138]:
print(type(true_labels[0]))

<class 'list'>


In [None]:
print(true_labels)

In [None]:
def flatten_list(_2d_list):
    flat_list = []
    # Iterate through the outer list
    for element in _2d_list:
        if type(element) is list:
            # If the element is of type list, iterate through the sublist
            for item in element:
                flat_list.append(item)
        else:
            flat_list.append(element)
    return flat_list

nested_list = true_labels
true_labels = flatten_list(nested_list)
print(true_labels)

In [None]:
print(test_labels)

In [152]:
n = len(test_labels)
print(n)

3000


In [153]:
n_pred =  len(true_labels)
print(n_pred)

3000


In [157]:
print(type(test_labels), type(true_labels))

<class 'list'> <class 'list'>


In [155]:
print(type(test_labels[0]), type(true_labels[0]))

<class 'int'> <class 'int'>


In [156]:
print(len(test_labels),len(true_labels))

3000 3000


In [159]:
print(true_labels[0])

1


In [160]:
print(test_labels[0])

0


In [170]:
class Metrics:
    true_positives = 0
    true_negatives = 0
    false_positves = 0 
    false_negatives = 0
    precision = 0 
    recall = 0

    def confusion_matrix(self):
        for i in range(n):
            if test_labels[i]==1 and true_labels[i]==1:
                self.true_positives +=1
            elif test_labels[i]==0 and true_labels[i]==0:
                self.true_negatives += 1 
            elif test_labels[i]==0 and true_labels[i]==1:
                self.false_positves += 1 
            else:
                self.false_negatives += 1 
        return self.true_positives, self.true_negatives, self.false_positves, self.false_negatives
    
    def precision_recall(self):
        self.precision = self.true_positives/(self.true_positives+self.false_positves)
        self.recall = self.true_positives/(self.true_positives+self.false_negatives)
        print("precision : ",self.precision, "\nRecall : ",self.recall)
    
    def f1_score(self):
        f1 = 2*(self.precision*self.recall)/(self.precision+self.recall)
        print('f1 Score : ', f1)

model = Metrics()
print(model.confusion_matrix())
model.precision_recall()
model.f1_score()
        

(156, 1758, 543, 543)
precision :  0.22317596566523606 
Recall :  0.22317596566523606
f1 Score :  0.22317596566523606
