In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook uses [fastai v2](https://docs.fast.ai/tutorial.text) to apply ULMFiT-style transfer learning to the classification of IMDb movie reviews.
Steps -
* Specify path to the data using fast.ai URLs.
* Create the LM [DataLoaders](https://docs.fast.ai/data.core.html#DataLoaders) using TextDataLoaders.
* Fit the LM.
* Finetune the LM using language_model_learner.
* Save the encoder.
* Create the classifier.
* Load the pretrained weights (encoder) into the classifier.
* Fit the classifier.
* Finetune the classifier.
* Use it to classify reviews.

In [None]:
from fastai.text.all import *

In [None]:
# Fetch the IMDb dataset referenced by fastai's URL registry and keep the
# returned location for the cells below.
path = untar_data(url=URLs.IMDB)
# Show the top-level contents of the dataset folder.
path.ls()

In [None]:
# Peek inside the training split of the dataset.
train_dir = path/'train'
train_dir.ls()

**Get the text into DataLoaders using TextDataLoaders.** Here `valid_pct` specifies the fraction of the texts (0.1 = 10%) to be held out for validation.

In [None]:
# DataLoaders for the language model, built from the text files under `path`.
# `is_lm=True` requests language-model batches; `valid_pct=0.1` holds out 10%
# of the items for validation.
dls_lm = TextDataLoaders.from_folder(
    path,
    valid_pct=0.1,
    is_lm=True,
)

In [None]:
# Preview at most 5 samples from a language-model batch to sanity-check tokenization.
dls_lm.show_batch(max_n=5)

`xxmaj` and `xxbos` are special tokens added during tokenization: `xxmaj` indicates that the next word begins with a capital letter, and `xxbos` marks the beginning of a text (here, a review).

**Group together the model, the DataLoaders, and a loss function to get a Learner for a Language Model using language_model_learner.**

In [None]:
# Language-model learner: LM DataLoaders + AWD_LSTM architecture, reporting
# accuracy and perplexity, with weight decay 0.1.
# `path=path` sets the learner's working directory (presumably where saved
# models end up -- confirm against the fastai Learner docs).
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    metrics=[accuracy, Perplexity()],
    wd=0.1,
    path=path,
)
# Switch the learner to half-precision (fp16) training.
learn = learn.to_fp16()

**We first train only the last layer, keeping the body intact. The body of a pretrained model is frozen by default.**

In [None]:
# One epoch with the one-cycle schedule at a peak learning rate of 1e-2;
# the learner has not been unfrozen yet at this point.
learn.fit_one_cycle(n_epoch=1, lr_max=1e-2)

Saving the state of the model. *we can load it later using learn = learn.load('1epoch')*

In [None]:
# Checkpoint the learner after the first epoch under the name '1epoch'
# so that it can be restored later with learn.load('1epoch').
learn.save('1epoch')

**Now, we finetune the model after unfreezing.**

In [None]:
# Unfreeze the learner, then fine-tune for 10 epochs at a lower
# peak learning rate (1e-3) than the initial frozen epoch.
learn.unfreeze()
learn.fit_one_cycle(n_epoch=10, lr_max=1e-3)

We save everything except the last layer of the model; this part is called the 'encoder'.

In [None]:
# Persist the fine-tuned encoder under the name 'finetuned'; the classifier
# loads it later with load_encoder('finetuned').
learn.save_encoder('finetuned')

Testing out our model on a text to generate words 

In [None]:
# Prompt and sampling settings for a quick qualitative check of the language model.
TEXT = "I liked this movie because"
N_WORDS = 40
N_SENTENCES = 2

# Generate N_SENTENCES independent continuations of the prompt, N_WORDS words each.
preds = []
for sample_idx in range(N_SENTENCES):
    continuation = learn.predict(TEXT, N_WORDS, temperature=0.75)
    preds.append(continuation)

# Print one generated continuation per line.
generated_text = "\n".join(preds)
print(generated_text)

**Loading text as DataLoaders for the classification model.** We pass the vocab of the LM as `text_vocab` so that the classifier uses the same vocabulary as the language model.

In [None]:
# DataLoaders for the classifier: the 'test' folder serves as the validation
# set, and the language model's vocab is reused via `text_vocab`.
imdb_path = untar_data(URLs.IMDB)
dls_clas = TextDataLoaders.from_folder(
    imdb_path,
    valid='test',
    text_vocab=dls_lm.vocab,
)

**Define our classifier using text_classifier_learner**

In [None]:
# Classifier learner on the classification DataLoaders, using the AWD_LSTM
# architecture and reporting accuracy. Note that this rebinds `learn`, which
# previously held the language-model learner.
learn = text_classifier_learner(
    dls_clas,
    AWD_LSTM,
    metrics=accuracy,
    drop_mult=0.5,
)

**Load the previously saved encoder.**

In [None]:
# Load the encoder saved earlier under the name 'finetuned' into the classifier
# and rebind `learn` to the value returned by load_encoder.
learn = learn.load_encoder('finetuned')

**Train the final layer of the classifier**

In [None]:
# One epoch at a peak learning rate of 2e-2, run before any layer group
# is explicitly unfrozen.
learn.fit_one_cycle(n_epoch=1, lr_max=2e-2)

**Now we train the classifier layers with different learning rates and gradual unfreezing**, an explanation of why this is done - [fast.ai NLP course lecture 9](https://youtu.be/5gCQvuznKn0)

In [None]:
# Gradual unfreezing, step 1: freeze up to the second-to-last parameter group.
learn.freeze_to(-2)
# Learning rates range from lr_max / 2.6**4 up to lr_max, passed as a slice.
lr_max = 1e-2
learn.fit_one_cycle(1, slice(lr_max/(2.6**4), lr_max))

In [None]:
# Gradual unfreezing, step 2: freeze up to the third-to-last parameter group.
learn.freeze_to(-3)
# Peak learning rate halved relative to the previous step; same 2.6**4 spread.
lr_max = 5e-3
learn.fit_one_cycle(1, slice(lr_max/(2.6**4), lr_max))

**Finally, unfreeze and train the whole model**

In [None]:
# Final stage: unfreeze the learner and train for 2 epochs with learning
# rates ranging from lr_max / 2.6**4 up to lr_max.
learn.unfreeze()
lr_max = 1e-3
learn.fit_one_cycle(2, slice(lr_max/(2.6**4), lr_max))

In [None]:
# Display sample results from the trained classifier for a quick visual check.
learn.show_results()

**Try out the model on the Kaggle IMDB data**

In [None]:
# Load the Kaggle "IMDB Dataset of 50K Movie Reviews" CSV from the input directory.
df_data = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
# Only the last expression of a cell is rendered automatically, so the summary
# has to be displayed explicitly; as a bare mid-cell expression its result was
# silently discarded. `display` is provided by the Jupyter/IPython kernel.
display(df_data.describe())
# First two rows, rendered as the cell's output.
df_data.head(2)

In [None]:
# Classify one review from the Kaggle CSV and compare with its recorded label.
# Assumes the CSV has exactly the columns `review` (text) and `sentiment` (label),
# as in the Kaggle "IMDB Dataset of 50K Movie Reviews" -- verify with df_data.columns.
# With only two columns, positional index 2 is out of bounds and index 1 is the
# label rather than the text, so the columns are addressed by name instead.
sample_review = df_data.loc[3, 'review']
sample_sentiment = df_data.loc[3, 'sentiment']
print(sample_review)
print(sample_sentiment)
# The prediction must be made on the review text, not on the label string.
learn.predict(sample_review)