`pip install keras`
`pip install transformers`
`pip install ipywidgets`
`pip install torchvision`

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re, os

from transformers import BertTokenizer

In [2]:
df = pd.read_csv('../data/cleandata/cleaned.csv')
df

Unnamed: 0,text,category
0,donald trump sends embarrassing new year eve m...,1
1,drunk bragging trump staffer started russian c...,1
2,sheriff david clarke internet joke threatening...,1
3,trump obsessed obama coded website image chris...,1
4,pope francis called donald trump christmas spe...,1
...,...,...
44893,fully committed nato back new u approach afgha...,0
44894,lexisnexis withdrew product chinese marketlond...,0
44895,minsk cultural hub authoritiesminsk reuters sh...,0
44896,vatican upbeat possibility pope francis visiti...,0


In [3]:
df.describe()

Unnamed: 0,category
count,44898.0
mean,0.522985
std,0.499477
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [4]:
df['category'].value_counts()

1    23481
0    21417
Name: category, dtype: int64

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

input_ids = []
lengths = []

documents = df.text.values
categories = df.category.values

# tokenize each document. since BERT doensn't like token length > 512 there might be a huge output oops
for doc in documents:
    encoded_sent = tokenizer.encode(doc, add_special_tokens=True)
    input_ids.append(encoded_sent) #add encoded document to input
    lengths.append(len(encoded_sent)) #record truncated length of document

print("DONE")

Token indices sequence length is longer than the specified maximum sequence length for this model (523 > 512). Running this sequence through the model will result in indexing errors


DONE


In [10]:
# apparently truncating the model down to a much smaller max length is good? Im giving it a try
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 128

input_ids = pad_sequences(input_ids, maxlen = MAX_LEN, dtype = "long",
                          value = 0, truncating = "post", padding = "post")

print("DONE")

DONE


In [12]:
# create attention masks -- tell the model the difference between actual token vs padding toekn
attention_masks = []

for doc in input_ids:
    att_mask = [int(token_id > 0) for token_id in doc]
    attention_masks.append(att_mask)
    
print("DONE")

DONE


In [16]:
# split the dataset 9:1 training:validation

from sklearn.model_selection import train_test_split

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, categories,
                                                                                   random_state = 2018, test_size = 0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, categories,
                                             random_state=2018, test_size=0.1)

In [21]:
#convert inputs/categories into pytorch data types
import torch
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [23]:
# creating an iterator for the dataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [24]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

ImportError: 
BertForSequenceClassification requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
