# FAST AI Library Notebook

In this notebook, you can use your own dataset and train it on FastAI's ULMFiT model. 

Author: Steven Smit

In [1]:
!pip install torch
!pip install fastai



In [0]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
from sklearn.metrics import f1_score
from google.colab import drive
from sklearn.model_selection import train_test_split


In [3]:
#Mount your Google drive to the notebook. Follow the link, accept permissions, copy the token and paste it in the prompt that appears and press "Enter"
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Data Format

When you upload your dataset to your Google Drive, please place the documents in a column called "text" and the labels in a column called "label". The labels can be a text or numerical label.

In [0]:
tasks = ['amazon_movies', 'amazon_health', 'amazon_books', 'semeval_task_a', 'semeval_task_ce']
sub_tasks = ['fair_balanced_100', 'fair_balanced_300', 'fair_balanced_1000'] #'fair_unbalanced_100', 'fair_unbalanced_300', 'fair_unbalanced_1000'

MAX_SEQ_LENGTH = 128

# Model configs
SAVE_CHECKPOINTS_STEPS = 10000
SAVE_SUMMARY_STEPS = 100

# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 10.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1


In [0]:
task = tasks[0]
subtask = sub_tasks[0]

df_trn = pd.read_csv("/content/gdrive/My Drive/Data Science/Low Shot NLP/datasets/"+task+"/"+subtask+"/train.csv")
df_val = pd.read_csv("/content/gdrive/My Drive/Data Science/Low Shot NLP/datasets/"+task+"/"+subtask+"/validate.csv")
df_tst = pd.read_csv("/content/gdrive/My Drive/Data Science/Low Shot NLP/datasets/"+task+"/"+subtask+"/test.csv")

df_trn = df_trn[["text", "label"]]
df_val = df_val[["text", "label"]]
df_tst = df_tst[["text", "label"]]

df_trn['text'] = df_trn['text'].astype(str)
df_val['text'] = df_val['text'].astype(str)
df_tst['text'] = df_tst['text'].astype(str)

df_trn = df_trn.sample(frac=1)
df_val = df_val.sample(frac=1)
df_tst = df_tst.sample(frac=1)

In [7]:
df_trn.head()

Unnamed: 0,text,label
200,This movie should be sent free and have a $20....,Negative
76,This film is a bit of a Mad Max nuclear wastel...,Positive
29,ARNOLD SCHWARZENEGGER ! I HAD NOT SAID WHITCH ...,Positive
296,what they should have done was created the une...,Negative
284,"I looked forward to this movie for a year, and...",Negative


## Downsampling Classes

Run this cell if you'd like to downsample your classes to make them equally balanced. It will downsample to the class with the lowest documents. 

In [0]:
lowest_class_num = min(data["label"].value_counts())
print("Smallest class size: {}".format(lowest_class_num))
downsampled_data = data.groupby("label").apply(lambda x: x.sample(n=lowest_class_num, replace=False)).reset_index(drop=True).loc[:, ["text", "label"]]
downsampled_data = downsampled_data.sample(frac=1).reset_index(drop=True)

Smallest class size: 749


In [0]:
data = downsampled_data
data = data[["label", "text"]]

In [0]:
# split data into training and validation set

def my_round(x, base=.05, prec=2):
    return round(base * round(float(x) / base), prec)


def split_dataset_into_even_class_distributions(X_data, Y_data, train_size=.7, test_size=.2, mini_batch_size=32):
    """
    Split a dataset into train, validate, and test sets that have the same distribution of classes as the original data.
    Also this function ensures the resulting datasets are multiples of the mini_batch_size

    :param X_data: Numpy 2d array or pandas dataframe, each row is a record
    :param Y_data: Pandas Series, the categorial class labels
    :param train_size: float 0-1, quantity of the original dataset to be in the train set (val is created from the residual)
    :param test_size: float 0-1, quantity of the original dataset to be in the test set (val is created from the residual)
    :param mini_batch_size: Int, the size of the mini batches to ensure each dataset is a multiple of that values

    :return: 6 data sets of X and Y
    """

    dist = Counter(val for val in Y_data)
    print('Total class distribution:', dict(dist))

    Y_data.reset_index(drop=True, inplace=True)
    X_data.reset_index(drop=True, inplace=True)

    train_ix = []
    validate_ix = []
    test_ix = []
    for cls in dist.keys():
        num_train = my_round(dist[cls] * train_size, base=mini_batch_size)
        num_test = my_round(dist[cls] * test_size, base=mini_batch_size)

        cls_targets = Y_data[Y_data == cls]

        full_ixs = np.random.choice(cls_targets.index, size=num_train, replace=False)

        train_ix += list(full_ixs)

        cls_targets = cls_targets[~cls_targets.index.isin(full_ixs)]

        full_ixs = np.random.choice(cls_targets.index, size=num_test, replace=False)

        test_ix += list(full_ixs)

        cls_targets = cls_targets[~cls_targets.index.isin(full_ixs)]

        validate_ix += list(cls_targets.index)

    X_train = X_data[train_ix]
    Y_train = Y_data[train_ix].reset_index(drop=True)

    X_validate = X_data[validate_ix]
    Y_validate = Y_data[validate_ix].reset_index(drop=True)

    X_test = X_data[test_ix]
    Y_test = Y_data[test_ix].reset_index(drop=True)

    print('Train class distribution:', dict(Counter(val for val in Y_train)))
    print('Validate class distribution:', dict(Counter(val for val in Y_validate)))
    print('Test class distribution:', dict(Counter(val for val in Y_test)))

    return X_train, Y_train, X_validate, Y_validate, X_test, Y_test

# Split
X_train, Y_train, X_validate, Y_validate, X_test, Y_test = split_dataset_into_even_class_distributions(X_data=data['text'],
                                                                                                       Y_data=data['label'], 
                                                                                                       train_size=.7, 
                                                                                                       test_size=.2,
                                                                                                       mini_batch_size=1)

df_trn = pd.DataFrame(np.array([Y_train, X_train]).T, columns=['label', 'text'])
df_val = pd.DataFrame(np.array([Y_validate, X_validate]).T, columns=['label', 'text'])
df_tst = pd.DataFrame(np.array([Y_test, X_test]).T, columns=['label', 'text'])

Total class distribution: {5: 749, 1: 749, 3: 749}
Train class distribution: {5: 524, 1: 524, 3: 524}
Validate class distribution: {5: 75, 1: 75, 3: 75}
Test class distribution: {5: 150, 1: 150, 3: 150}


In [0]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, test_df = df_tst, vocab=data_lm.train_ds.vocab, bs=32)

learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7)

In [0]:
# train the learner object with learning rate = 1e-3
# finetune the language model for 10 epochs. Feel free to run this again if you feel it requires more finetuning.
num_epochs = 10
learn.fit(num_epochs, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.215666,4.59604,0.235893,00:21
1,5.013382,4.43996,0.243839,00:21
2,4.850436,4.325118,0.250112,00:21
3,4.71006,4.227103,0.253147,00:21
4,4.595787,4.153209,0.253504,00:21
5,4.498746,4.090568,0.257165,00:21
6,4.400766,4.043637,0.258906,00:21
7,4.330935,4.0066,0.26125,00:21
8,4.278691,3.98338,0.260513,00:21
9,4.225272,3.959489,0.262679,00:21


In [0]:
learn.save_encoder('ft_enc')
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)
learn.load_encoder('ft_enc')

In [0]:
#Train the classifier for 10 epochs. 
#If you're not happy with the final validation accuracy, then tweak num_epochs and run this cell as many times as you need.
#accuracy represents validation accuracy
num_epochs = 40
learn.fit(num_epochs, 1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,0.961685,0.878216,0.6,00:25
1,0.954972,0.888503,0.6,00:25
2,0.944702,0.879545,0.608889,00:23
3,0.938524,0.86743,0.617778,00:24
4,0.93274,0.859503,0.613333,00:24
5,0.920457,0.850879,0.613333,00:24
6,0.93231,0.841222,0.64,00:26
7,0.944704,0.850787,0.608889,00:26
8,0.930698,0.842357,0.622222,00:24
9,0.920778,0.830192,0.635556,00:24


KeyboardInterrupt: ignored

In [0]:
#Obtain the predictions and the targets to plot the confusion matrix.
preds, targets = learn.predict(item=data_clas.test_ds)

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

ValueError: ignored

In [0]:
#Print the f1 score of this model
f1_score(targets, predictions, average="micro")

0.7226890756302521