In [None]:
# Install dependencies first so the imports below can resolve them
!pip install transformers
!pip install tokenizers

# Imports
import urllib.request

import numpy as np
import pandas as pd
import tensorflow as tf
import tokenizers
from google.colab import drive
from transformers import *
from transformers import TFAutoModel

# Google environment: mount Google Drive under /content/gdrive
drive.mount('/content/gdrive')

# While we're here, we might as well check what GPU we have (note that I am using Colab Pro):
!nvidia-smi

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Sat Aug 22 06:19:53 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.57       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                     

In [None]:
# I would normally do data visualization, but I'm short on time today. Begin data wrangling! :)

from transformers import AutoTokenizer, TFAutoModelWithLMHead

MAX_LEN = 256  # every text is padded/truncated to this many token ids


def encode_texts(texts, tokenizer, max_len=MAX_LEN):
    """Tokenize `texts` and return fixed-length ids plus the matching attention mask.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing `encode(text) -> list[int]` and `pad_token_id`.
        max_len: length every sequence is padded / truncated to (post-padding,
            post-truncation, as before).

    Returns:
        (input_ids, attention_mask): two arrays of shape (len(texts), max_len).
        `attention_mask` is 1 for real tokens and 0 for padding.
    """
    # Pad with the tokenizer's own pad id. Id 0 is NOT padding for this tokenizer:
    # every encoded sequence starts with id 0 (the start-of-sequence token).
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    encoded = [tokenizer.encode(text) for text in texts]

    # Keras has some quick and easy preprocessing for padding (and it returns an array)
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=max_len, dtype="long", value=pad_id,
        truncating="post", padding="post")

    # Build the mask from the true sequence lengths instead of `id > 0`, which
    # wrongly masked out the leading start-of-sequence token (id 0).
    lengths = np.minimum(np.array([len(ids) for ids in encoded], dtype="int64"), max_len)
    attention_mask = (np.arange(max_len)[None, :] < lengths[:, None]).astype("int32")

    return input_ids, attention_mask


# We start by loading the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("jplu/tf-xlm-roberta-base")

# Next, we load our CSV's using pandas
train_df = pd.read_csv('/content/gdrive/My Drive/ignitionhacks/training_data.csv')
val_df = pd.read_csv('/content/gdrive/My Drive/ignitionhacks/val.csv')

# Now we can preprocess our data (making sure that we do the same to validation and train).
trainenc, trainattn = encode_texts(train_df.Text, tokenizer)
valenc, valattn = encode_texts(val_df.Text, tokenizer)

# Finally, we just put all of our training and validation data into a tf.dataset so it loads faster (from experience, Colab I/O is painfully slow)
# Each element is (input_ids, attention_mask, label).
train = tf.data.Dataset.from_tensor_slices((trainenc, trainattn, train_df.Sentiment))
val = tf.data.Dataset.from_tensor_slices((valenc, valattn, val_df.Sentiment))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


KeyboardInterrupt: ignored

In [None]:
# Finally (though it didn't take *that* long), we are ready to train. 

class roBERTaClassifier(tf.keras.Model):
    """Keras model that stacks a dense sigmoid head on top of a transformer encoder.

    Args:
        bert: the encoder model; it is called with the token ids and the optional
            attention / token-type / position / head masks.
        num_classes: number of units in the dense output layer.
    """

    def __init__(self, bert: TFBertModel, num_classes: int):
        super().__init__()
        self.bert = bert
        self.classifier = tf.keras.layers.Dense(num_classes, activation='sigmoid')

    @tf.function
    def call(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
        """Run the encoder and return the sigmoid activations of the dense head."""
        encoder_kwargs = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
        )
        encoder_outputs = self.bert(input_ids, **encoder_kwargs)

        # Element 1 of the encoder outputs feeds the classifier
        # (presumably the pooled sequence representation — TODO confirm for the checkpoint in use).
        features = encoder_outputs[1]
        return self.classifier(features)

model = roBERTaClassifier(TFBertModel.from_pretrained("jplu/tf-xlm-roberta-base"), 1)

# We can define some metrics to better evaluate our model
metrics = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.Precision(name="precision"),
    tf.keras.metrics.AUC(name="auc")
]

# Finally, we can train after compiling our model with some hyperparameters. Again, we will be using Keras since it's quick and easy. 
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-7)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics = metrics)
!mkdir /content/gdrive/My\ Drive/ignitionhacks/best
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/gdrive/My Drive/ignitionhacks/best',
    save_weights_only=False,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

batch_size=1024

model.fit(train, validation_data=val, epochs=55, batch_size=batch_size, steps_per_epoch=len(train)//batch_size, callbacks=model_checkpoint_callback) # Yes I know, massive batch size. We're definitely reducing our accuracy with such a massive batch size, but the dataset's just... huge and time is limited.
!mkdir /content/gdrive/My\ Drive/ignitionhacks/final
model.save('/content/gdrive/My Drive/ignitionhacks/')
#model.fit((trainenc, trainattn), train.Sentiment, validation_data=((valenc,valattn), val.Sentiment), batch_size=32, epochs=150)
# Hyperparameters were tuned using intuition

# Now we play the waiting game.

In [None]:
# Finally, we can do inference based on contestant_judgement (private test data)
test_csv = pd.read_csv('/content/gdrive/My Drive/ignitionhacks/contestant_judgment.csv')

# We can copy paste all of our preprocessing from before
testenc=[]
testattn=[]

for i in test_csv.Text:
  enc = tokenizer.encode(i)
  testenc.append(enc)

testenc = tf.keras.preprocessing.sequence.pad_sequences(testenc, maxlen=128, dtype="long", value=0, truncating="post", padding="post")

for i in testenc:
  att=[int(x > 0) for x in i]
  testattn.append(att)

test = tf.data.Dataset.from_tensor_slices((testenc, testattn))

print('encoded')

# Make predictions using the train dataset
predictions = model.predict(test)

print('predicted')

# Create a new column in our dataset for our predictions
test_csv['Sentiment'] = predictions

# Save to csv for final submission!
test_csv.to_csv('submisision.csv')
!mkdir /content/gdrive/My\ Drive/ignitionhacks/
!cp submission.csv /content/gdrive/My\ Drive/ignitionhacks/

print('done')

# Sidenote: I'm not sure why, but this takes by far the longest? Odd.

[[0, 87, 661, 106013, 1639, 26292, 214, 1257, 1374, 555, 2706, 32854, 275, 8, 1042, 106, 442, 26950, 40575, 619, 25133, 74, 116, 442, 25, 7, 5773, 759, 8, 1042, 38, 2], [0, 468, 8912, 3521, 79520, 20, 3395, 28282, 4589, 31182, 4, 27150, 56, 136, 11522, 16487, 166204, 7, 111, 61261, 2], [0, 110, 3917, 17, 13319, 3444, 70, 9742, 47, 3564, 2], [0, 1374, 13025, 2749, 12248, 3390, 2673, 169006, 67921, 38, 24044, 214, 125177, 9393, 6, 128258, 104711, 32149, 14804, 42261, 38, 3610, 1760, 83, 26368, 1236, 3229, 2583, 35691, 33233, 6863, 48800, 7, 38, 6, 128258, 27204, 34162, 32149, 2], [0, 1374, 420, 21135, 47327, 1836, 621, 64507, 2], [0, 1374, 238, 30513, 6696, 7039, 9563, 4163, 5368, 38, 3493, 1295, 2367, 87, 25, 272, 49782, 54133, 6, 88507, 63175, 335, 145688, 23373, 18057, 46754, 83435, 90827, 15969, 15969, 6, 239019, 11305, 2], [0, 1374, 206, 416, 62809, 7260, 113, 2037, 113, 2037, 41866, 3229, 6863, 17, 5, 18, 2], [0, 17, 765, 10, 3525, 5161, 12, 2589, 18, 2515, 720, 111, 2510, 88142, 5

In [None]:
# List the contents of the current working directory
!ls