# FastAI ULMFiT


## 0. Setup

In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Installing and importing the necessary libraries
!pip install fastai --quiet
!pip install kaggle --quiet

from fastai.text.all import *

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Never truncate long cell values when a DataFrame is displayed
pd.set_option('display.max_colwidth', None)

# Mount Google Drive so the dataset files can be read from '/content/gdrive'
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## 1. Import the data

In [None]:
# Load the titles and their labels from the mounted Google Drive
# (folder 'My Drive/EAI6010/Datasets/').
# Assumes both files hold one record per line, so that line i of the label file
# is the class id of line i of the title file -- TODO confirm against the dataset.
text_path = 'gdrive/My Drive/EAI6010/Datasets/title_StackOverflow.txt'
label_path = 'gdrive/My Drive/EAI6010/Datasets/label_StackOverflow.txt'


def _read_lines(path):
    """Return the lines of the UTF-8 text file at `path` without line endings.

    Blank lines at the end of the file are discarded; everything else is kept
    verbatim so that line numbers stay meaningful.
    """
    with open(path, encoding='utf-8') as f:
        lines = [line.rstrip('\n') for line in f]
    while lines and not lines[-1].strip():
        lines.pop()
    return lines


# The titles are free text, not CSV: parsing them with pd.read_csv lets commas
# and quotes inside a title be interpreted as CSV syntax, and
# on_bad_lines='skip' then silently drops rows, which shifts the titles
# relative to the labels. Reading raw lines keeps the two files aligned.
df_text = pd.DataFrame({'text': _read_lines(text_path)})
df_label = pd.DataFrame({'label': [int(line) for line in _read_lines(label_path)]})

# Fail loudly instead of pairing titles with the wrong labels
if len(df_text) != len(df_label):
    raise ValueError(
        f'Number of titles ({len(df_text)}) does not match '
        f'number of labels ({len(df_label)})'
    )

df = pd.concat([df_label, df_text], axis=1, sort=False)
print('Length of dataset: '+str(len(df.index)))
df.head()

Length of dataset: 20000


Unnamed: 0,label,text
0,18,How do I fill a DataSet or a DataTable from a LINQ query resultset ?
1,18,How do you page a collection with LINQ?
2,3,Best Subversion clients for Windows Vista (64bit)
3,3,Visual Studio Setup Project - Per User Registry Settings
4,7,How do I most elegantly express left join with aggregate SQL as LINQ query


In [None]:
# Tag names listed in class-id order: the first entry belongs to id 1,
# the last one to id 20.
tag_names = [
    'wordpress', 'oracle', 'svn', 'apache', 'excel',
    'matlab', 'visual-studio', 'cocoa', 'osx', 'bash',
    'spring', 'hibernate', 'scala', 'sharepoint', 'ajax',
    'qt', 'drupal', 'linq', 'haskell', 'magento',
]

# Numeric class id -> tag name
mapping = dict(enumerate(tag_names, start=1))

# Replace the numeric class ids with their tag names
# (an id that is not a key of `mapping` becomes NaN)
df['label'] = df['label'].map(mapping)

# Preview the relabelled rows
df.head()

Unnamed: 0,label,text
0,linq,How do I fill a DataSet or a DataTable from a LINQ query resultset ?
1,linq,How do you page a collection with LINQ?
2,svn,Best Subversion clients for Windows Vista (64bit)
3,svn,Visual Studio Setup Project - Per User Registry Settings
4,visual-studio,How do I most elegantly express left join with aggregate SQL as LINQ query


## 2. Create train & validation datasets and FastAI data bunch

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; stratifying on the label keeps the
# class proportions the same in both parts.
df_trn, df_val = train_test_split(df, test_size=0.3, stratify=df['label'])

# Row/column counts of the two parts
(df_trn.shape, df_val.shape)

((14000, 2), (6000, 2))

In [None]:
# Preview the first rows of the training split
df_trn.head()

Unnamed: 0,label,text
9593,excel,bash string to date
11386,scala,Secure AJAX connection / null character SSL cert attack?
4119,ajax,SVN and accented characters
12825,drupal,Access properties file programatically with Spring?
14160,apache,Saving settings in matlab


In [None]:
# Corpus for language-model fine-tuning: the titles of both splits, no labels.
# Rows without a title are dropped first, because a missing value would
# otherwise be tokenized as the literal word "nan" and end up in the corpus.
df_lm = pd.concat([df_trn, df_val], axis=0)[['text']].dropna(subset=['text'])
df_lm.head()

Unnamed: 0,text
9593,bash string to date
11386,Secure AJAX connection / null character SSL cert attack?
4119,SVN and accented characters
12825,Access properties file programatically with Spring?
14160,Saving settings in matlab


In [None]:
# DataBlock for the self-supervised (language-model) task: the 'text' column is
# tokenized as language-model input and 10% of the rows are held out at random.
lm_text_block = TextBlock.from_df('text', is_lm=True)
dls_lm = DataBlock(
    blocks=lm_text_block,
    get_x=ColReader('text'),
    splitter=RandomSplitter(valid_pct=0.1),
)

In [None]:
# Turn the DataBlock into DataLoaders over the title corpus
# (batches of 64, sequence length 72). Note that the name `dls_lm` is rebound
# here from the DataBlock to the resulting DataLoaders.
dls_lm = dls_lm.dataloaders(df_lm, bs=64, seq_len=72)

In [None]:
# Inspect three samples: `text` is the input, `text_` the target shifted by one token
dls_lm.show_batch(max_n = 3)

Unnamed: 0,text,text_
0,xxbos xxmaj how to have an nsmenu with dynamic actions xxbos i need to block my feed xxunk xxbos xxmaj using nstask : app freezing after returning output xxbos nan xxbos xxmaj see binary instead of hex in xxmaj binary file while in a shell like bash . xxbos xxmaj what 's wrong with this linq query ? ? xxbos xxmaj how do you create inputs for custom options in the xxmaj,xxmaj how to have an nsmenu with dynamic actions xxbos i need to block my feed xxunk xxbos xxmaj using nstask : app freezing after returning output xxbos nan xxbos xxmaj see binary instead of hex in xxmaj binary file while in a shell like bash . xxbos xxmaj what 's wrong with this linq query ? ? xxbos xxmaj how do you create inputs for custom options in the xxmaj magento
1,items from a list xxbos visual xxmaj studio - xxmaj how to change the return value of a method in the debugger ? xxbos xxmaj scala vs. xxmaj groovy vs. xxmaj clojure xxbos xxmaj table view not updating according to bindings - xxmaj part xxmaj xxunk xxbos xxmaj custom xxunk xxbos xxmaj bash script xxunk xxunk command xxbos xxmaj how do i update all svn : externals references after a server migration,from a list xxbos visual xxmaj studio - xxmaj how to change the return value of a method in the debugger ? xxbos xxmaj scala vs. xxmaj groovy vs. xxmaj clojure xxbos xxmaj table view not updating according to bindings - xxmaj part xxmaj xxunk xxbos xxmaj custom xxunk xxbos xxmaj bash script xxunk xxunk command xxbos xxmaj how do i update all svn : externals references after a server migration ?
2,an actor ? xxbos xxmaj do you always use a second - level cache in xxmaj hibernate ? xxbos xxmaj how can i connect the pressed ( ) signal of 32 buttons to a single function without declaring 32 slots ? xxbos nan xxbos xxmaj transpose a file in bash xxbos nan xxbos xxup linq - xxmaj add property to results xxbos xxmaj excel xxup odbc and 64 bit server xxbos xxmaj,actor ? xxbos xxmaj do you always use a second - level cache in xxmaj hibernate ? xxbos xxmaj how can i connect the pressed ( ) signal of 32 buttons to a single function without declaring 32 slots ? xxbos nan xxbos xxmaj transpose a file in bash xxbos nan xxbos xxup linq - xxmaj add property to results xxbos xxmaj excel xxup odbc and 64 bit server xxbos xxmaj strange


## 3. Create and Train the Language Model

In [None]:
# Language-model learner on the AWD_LSTM architecture, reporting accuracy and
# perplexity. `path='models'` makes the learner write its files under that
# directory.
learn_lm = language_model_learner(
    dls_lm, AWD_LSTM, path='models', metrics=[accuracy, Perplexity()]
)

# Fine-tune for 5 epochs; the callback checkpoints the model under the name
# 'best_lm' each time the monitored validation loss improves.
best_lm_checkpoint = SaveModelCallback(fname='best_lm')
learn_lm.fine_tune(5, cbs=[best_lm_checkpoint])

# Store the encoder so the classifier can load it later
learn_lm.save_encoder('finetuned_encoder')


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,6.262281,5.280181,0.209527,196.405411,00:17


Better model found at epoch 0 with valid_loss value: 5.280180931091309.


epoch,train_loss,valid_loss,accuracy,perplexity,time
0,5.145847,4.552901,0.284903,94.907318,00:18
1,4.757207,4.217676,0.302047,67.875534,00:18
2,4.500891,4.112182,0.312572,61.079857,00:19
3,4.362444,4.07921,0.315213,59.098751,00:19
4,4.299832,4.071228,0.314779,58.628918,00:19


Better model found at epoch 0 with valid_loss value: 4.552900791168213.
Better model found at epoch 1 with valid_loss value: 4.217675685882568.
Better model found at epoch 2 with valid_loss value: 4.112182140350342.
Better model found at epoch 3 with valid_loss value: 4.079209804534912.
Better model found at epoch 4 with valid_loss value: 4.07122802734375.


## 4. Using the Language Model to Train the Classifier

In [None]:
# DataLoaders for the classification task. The text block reuses the language
# model's vocabulary (vocab=dls_lm.vocab), the target is the 'label' column,
# and 10% of the rows are held out at random for validation.
clas_blocks = (TextBlock.from_df('text', vocab=dls_lm.vocab), CategoryBlock)
clas_datablock = DataBlock(
    blocks=clas_blocks,
    get_x=ColReader('text'),
    get_y=ColReader('label'),
    splitter=RandomSplitter(valid_pct=0.1),
)
dls_clas = clas_datablock.dataloaders(df, bs=64)

In [None]:
# Inspect three (tokenized title, category) samples from the classification dataloader
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos xxmaj how can i use xxunk to encode xxunk files within a shell script ? xxmaj i 'm trying to set xxunk information via variables with spaces in them . xxmaj xxunk xxunk out . xxmaj maybe xxmaj i 'm being xxunk with bash ?,ajax
1,xxbos xxmaj how do i put a xxmaj file ( excel ) online ( apache xxmaj server ) with xxmaj password xxmaj protection but with the xxmaj option for xxmaj users to alter the xxmaj file and save the changes ?,spring
2,xxbos xxmaj magento - xxmaj create new user returns xxup xxunk xxrep 3 0 ] [ 2002 ] xxmaj ca n't connect to local mysql server through socket ' / var / lib / mysql / xxunk ' xxmaj help ?,haskell


In [None]:
# Classifier learner: AWD_LSTM architecture on the classification dataloaders,
# reporting accuracy on the validation set.
learn_clas = text_classifier_learner(dls_clas, AWD_LSTM, metrics=accuracy)

In [None]:
# Load the encoder that the language-model learner saved as 'finetuned_encoder'.
# The location is derived from that learner (its path plus its model directory)
# instead of being hard-coded to Colab's /content directory, so the cell also
# works from another working directory. The path is made absolute because a
# relative name would be looked up under this learner's own model directory.
encoder_file = (learn_lm.path / learn_lm.model_dir / 'finetuned_encoder').resolve()
learn_clas.load_encoder(encoder_file)

<fastai.text.learner.TextLearner at 0x7944b133aa70>

In [None]:
# Gradual unfreezing: start with the classifier frozen, then release more of it
# step by step, running a short one-cycle fit after every step.
# Each entry: (call that sets what is trainable, its arguments, epochs, learning rate(s))
unfreeze_schedule = (
    (learn_clas.freeze, (), 1, slice(1e-2)),                # step 1: frozen, train the last layer(s)
    (learn_clas.freeze_to, (-2,), 1, slice(1e-3/2, 1e-2)),  # step 2: last two groups, smaller rates
    (learn_clas.freeze_to, (-3,), 1, slice(1e-4/2, 1e-3)),  # step 3: last three groups, smaller still
    (learn_clas.unfreeze, (), 2, slice(1e-5, 1e-4)),        # step 4: everything trainable
)

for set_trainable, call_args, n_epochs, lr in unfreeze_schedule:
    set_trainable(*call_args)
    learn_clas.fit_one_cycle(n_epochs, lr)


epoch,train_loss,valid_loss,accuracy,time
0,3.005572,2.909444,0.088,00:22


epoch,train_loss,valid_loss,accuracy,time
0,2.984871,2.947498,0.0695,00:24


epoch,train_loss,valid_loss,accuracy,time
0,2.961846,2.918704,0.0995,00:23


epoch,train_loss,valid_loss,accuracy,time
0,2.942377,2.91791,0.0905,00:25
1,2.935766,2.913846,0.0985,00:26


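The model accuracy is very poor: roughly 0.09–0.10 on 20 classes, which is barely above the 0.05 that random guessing would give if the classes are balanced. To improve this, we can try increasing the number of epochs, changing `seq_len`, or changing the batch size. However, a result this close to chance usually points to a data problem rather than a tuning problem: in the `df_trn.head()` output above the labels do not match the titles (for example, "bash string to date" is labelled `excel`), so the alignment between the title file and the label file should be checked first.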

In [None]:
# Save the classifier's current weights under the name 'gradual_unfreeze_final'
# (the call returns the path of the written .pth file)
learn_clas.save('gradual_unfreeze_final')


Path('models/gradual_unfreeze_final.pth')

## 5. Analyzing our results

In [None]:
# Show sample validation rows: `category` is the recorded label, `category_` the prediction
learn_clas.show_results()

Unnamed: 0,text,category,category_
0,xxbos xxmaj i xxunk a function updating database table using xxmaj spring 's jdbctemplate and for some reason there was exception that connection is read only xxunk can not update any database related changes . xxmaj how to resolve these problem ?,excel,linq
1,xxbos xxup c : \ xxmaj documents and xxmaj settings \ user \ xxmaj my xxmaj documents \ xxunk \ xxunk xxunk xxunk \ xxunk : xxmaj the module has not been deployed .,wordpress,linq
2,"xxbos xxmaj configuring xxmaj apache to route "" get / user / foo "" to / user / xxunk and "" put / user / foo "" to / user / xxunk",haskell,linq
3,xxbos xxmaj spring 3.0 - xxmaj unable to locate xxmaj spring namespacehandler for xxup xml schema namespace [ http : / / xxrep 3 w xxunk / schema / security ],haskell,cocoa
4,xxbos xxmaj excel 2 xxrep 3 0 xxup vba : xxmaj errors xxmaj raised within xxmaj class xxmaj debug xxmaj as xxmaj if xxmaj raised at xxmaj property xxmaj call,oracle,cocoa
5,xxbos xxmaj visual xxmaj studio - xxmaj new xxmaj filter instead of xxmaj new xxmaj folder when using xxmaj create xxmaj project xxmaj from xxmaj existing xxmaj source xxmaj wizard,qt,visual-studio
6,xxbos xxmaj using sub - types xxmaj and xxmaj return xxmaj types in xxmaj scala to xxmaj process a xxmaj generic xxmaj object xxmaj into a xxmaj specific xxmaj one,scala,apache
7,xxbos xxmaj how do i get output to show up in the xxmaj messages pane of the xxmaj error xxmaj list for xxmaj visual xxmaj studio 2005 ?,ajax,visual-studio
8,xxbos xxmaj how do i stop xxmaj visual xxmaj studio from launching a new browser window every single time i hit the xxmaj start xxmaj debugging button ?,oracle,visual-studio


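The model struggles to classify the titles correctly: most predictions shown above differ from the recorded category, which is consistent with the low validation accuracy. Note, however, that in several rows (for example rows 5, 7 and 8, all about Visual Studio) the prediction agrees with the title while the recorded category does not, which suggests that the labels themselves are unreliable. Increasing the number of epochs might help, but only once the labels are trustworthy.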

## 6. Predictions

## 7. Export the model

In [None]:
# Serialize the trained classifier to a single file ...
export_name = 'text_classifier.pkl'
learn_clas.export(export_name)

# ... and read it back as a learner that can be used for inference later
learn_infer = load_learner(export_name)


In [None]:
# Print the current working directory, against which the relative paths above are resolved
print(os.getcwd())

/content


In [None]:
# Mount Google Drive again, this time at '/content/drive'.
# NOTE(review): Drive is already mounted at '/content/gdrive' in the setup
# section -- confirm whether this second mount point is actually needed.
from google.colab import drive
drive.mount('/content/drive')

This model is saved at sample_data/text_classifier.pkl