# FAST AI Library Notebook

In this notebook, you can use your own dataset and train it on FastAI's ULMFiT model. 

Author: Steven Smit

In [1]:
!pip install torch
!pip install fastai



In [0]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os
from sklearn.metrics import f1_score
from google.colab import drive
from sklearn.model_selection import train_test_split


In [3]:
#Mount your Google drive to the notebook. Follow the link, accept permissions, copy the token and paste it in the prompt that appears and press "Enter"
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Data Format

When you upload your dataset to your Google Drive, please place the documents in a column called "text" and the labels in a column called "label". The labels can be a text or numerical label.

In [0]:
tasks = ['amazon_movies', 'amazon_health', 'amazon_books', 'semeval_task_a', 'semeval_task_ce']
sub_tasks = ['fair_balanced_100', 'fair_balanced_300', 'fair_balanced_1000'] #'fair_unbalanced_100', 'fair_unbalanced_300', 'fair_unbalanced_1000'

MAX_SEQ_LENGTH = 128

# Model configs
SAVE_CHECKPOINTS_STEPS = 10000
SAVE_SUMMARY_STEPS = 100

# Compute train and warmup steps from batch size
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 10.0
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1


In [0]:
task = tasks[0]
subtask = sub_tasks[0]

df_trn = pd.read_csv("/content/gdrive/My Drive/Data Science/Low Shot NLP/datasets/"+task+"/"+subtask+"/train.csv")
df_val = pd.read_csv("/content/gdrive/My Drive/Data Science/Low Shot NLP/datasets/"+task+"/"+subtask+"/validate.csv")
df_tst = pd.read_csv("/content/gdrive/My Drive/Data Science/Low Shot NLP/datasets/"+task+"/"+subtask+"/test.csv")

df_trn = df_trn[["text", "label"]]
df_val = df_val[["text", "label"]]
df_tst = df_tst[["text", "label"]]

df_trn['text'] = df_trn['text'].astype(str)
df_val['text'] = df_val['text'].astype(str)
df_tst['text'] = df_tst['text'].astype(str)

df_trn = df_trn.sample(frac=1)
df_val = df_val.sample(frac=1)
df_tst = df_tst.sample(frac=1)

In [6]:
df_trn.head()

Unnamed: 0,text,label
102,This has been mentioned ad nauseam by other re...,Neutral
25,It's hard to know where to begin this review -...,Positive
66,Although a lot of people may view 'Con Air' as...,Positive
177,The car chase is one of the most inexplicable ...,Neutral
213,Some movies are bad and worth watching 'cos th...,Negative


In [0]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, test_df = df_tst, vocab=data_lm.train_ds.vocab, bs=32)

learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.7)

In [8]:
# train the learner object with learning rate = 1e-3
# finetune the language model for 10 epochs. Feel free to run this again if you feel it requires more finetuning.
num_epochs = 10
learn.fit(num_epochs, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,6.497739,7.252922,0.000264,00:01
1,6.168694,7.193324,0.000223,00:00
2,6.355566,7.123382,0.000304,00:00
3,6.304291,7.047851,0.000264,00:00
4,6.335325,6.969794,0.000223,00:00
5,6.329931,6.887304,0.000223,00:00
6,6.294367,6.804756,0.000244,00:00
7,6.271507,6.714931,0.000325,00:00
8,6.191869,6.627574,0.000223,00:00
9,6.121831,6.539783,0.000203,00:00


In [0]:
learn.save_encoder('ft_enc')
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)
learn.load_encoder('ft_enc')

In [10]:
#Train the classifier for 10 epochs. 
#If you're not happy with the final validation accuracy, then tweak num_epochs and run this cell as many times as you need.
#accuracy represents validation accuracy
num_epochs = 40
learn.fit(num_epochs, 1e-4)

epoch,train_loss,valid_loss,accuracy,time
0,5.764927,5.711463,0.0,00:00
1,5.76286,5.728346,0.0,00:00
2,5.755872,5.764743,0.0,00:00
3,5.752809,5.799076,0.0,00:00
4,5.757849,5.831359,0.0,00:00
5,5.762232,5.865104,0.0,00:00
6,5.758498,5.857911,0.0,00:00
7,5.755494,5.878794,0.0,00:00
8,5.753711,5.883905,0.0,00:00
9,5.758022,5.875309,0.0,00:00


In [11]:
#Obtain the predictions and the targets to plot the confusion matrix.
preds, targets = learn.predict(item=)

predictions = np.argmax(preds, axis = 1)
pd.crosstab(predictions, targets)

ValueError: ignored

In [0]:
#Print the f1 score of this model
f1_score(targets, predictions, average="micro")

0.7226890756302521