<a href="https://colab.research.google.com/github/Pratkashyap/Emission-Tracker/blob/dev/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup (Colab): install a PyTorch nightly build from the cu92 wheel
# index, then fastai. Neither package is version-pinned, so a fresh run may
# resolve different versions than the recorded output (torch-nightly 1.0.0.dev20181206).
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
[?25l  Downloading https://download.pytorch.org/whl/nightly/cu92/torch_nightly-1.0.0.dev20181206-cp36-cp36m-linux_x86_64.whl (576.2MB)
[K    100% |████████████████████████████████| 576.2MB 28kB/s 
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.0.0.dev20181206
Collecting wrapt<1.11.0,>=1.10.0 (from thinc<6.13.0,>=6.12.1->spacy>=2.0.18->fastai)
  Downloading https://files.pythonhosted.org/packages/a0/47/66897906448185fcb77fc3c2b1bc20ed0ecca81a0f2f88eda3fc5a34fc3d/wrapt-1.10.11.tar.gz
Building wheels for collected packages: wrapt
  Building wheel for wrapt (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/48/5d/04/22361a593e70d23b1f7746d932802efe1f0e523376a74f321e
Successfully built wrapt
[31mspacy 2.0.18 has requirement numpy>=1.15.0, but you'll have numpy 1.14.6 which is incompatible.[0m
Installing collect

In [0]:
# import libraries
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped; shuffling uses a fixed seed so the document order is repeatable.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Two-column frame: integer newsgroup id ('label') first, raw post body ('text') second.
df = pd.DataFrame(dict(label=dataset.target, text=dataset.data))

In [5]:
# (rows, columns) of the unfiltered frame
df.shape

(11314, 2)

In [0]:
# Keep only labels 1 and 10 (noted by the author as 'comp.graphics' and
# 'rec.sport.hockey'), then renumber the rows from 0.
df = df.loc[df['label'].isin([1, 10])].reset_index(drop=True)


In [7]:
# (rows, columns) after restricting to the two selected labels
df.shape

(1184, 2)

In [8]:
# A quick look at the target distribution: number of documents per remaining label.

df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

In [0]:
# DATA PROCESSING
# Clean the text by keeping only ASCII letters; every other character becomes a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text unchanged for this character class.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)

In [10]:
# Next, remove stop-words from the text data.
# The NLTK stop-word corpus has to be downloaded once before first use, as shown below.

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords 
stop_words = stopwords.words('english')  # English stop-word list used by the filtering cell below

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# Strip English stop-words from every document in df['text'].

# Set membership is constant-time; stop_words itself is a list and would be
# scanned once per token.
stop_word_set = set(stop_words)

# tokenization: split each document on whitespace
tokenized_doc = df['text'].apply(lambda x: x.split())

# remove stop-words
# NOTE(review): the comparison is case-sensitive, exactly as before. If the NLTK
# list is lower-case only, capitalised stop-words (e.g. "The") are kept — confirm
# whether that is intended.
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

# de-tokenization: iterate positionally instead of looking rows up by label
# (tokenized_doc[i]), so the result no longer depends on df having a 0..n-1 index.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['text'] = detokenized_doc

In [0]:
# Split the cleaned dataset into training and validation sets in a 60:40 ratio.

from sklearn.model_selection import train_test_split

# Stratifying on the label keeps the class balance the same in both parts;
# the fixed random_state makes the split repeatable.
df_trn, df_val = train_test_split(
    df,
    test_size=0.4,
    stratify=df['label'],
    random_state=12,
)

In [13]:
# (rows, columns) of the training and validation frames
df_trn.shape, df_val.shape

((710, 2), (474, 2))

In [0]:
# The data has to be prepared twice with fastai: once for the language model
# and once for the classifier. Both bunches are built from the same
# training/validation frames.

# Language model data
data_lm = TextLMDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
)

# Classifier model data — reuses the language-model vocabulary and a batch size of 32
data_clas = TextClasDataBunch.from_df(
    path="",
    train_df=df_trn,
    valid_df=df_val,
    vocab=data_lm.train_ds.vocab,
    bs=32,
)

In [0]:
# Use the data_lm object created earlier to fine-tune a pre-trained language model.
# The learner 'learn' is built from data_lm with URLs.WT103 passed as the
# pre-trained model and drop_mult=0.7; per the original author's note this creates
# the model and downloads the pre-trained weights, ready for fine-tuning.

learn = language_model_learner(data_lm, pretrained_model=URLs.WT103, drop_mult=0.7)

In [16]:
# Train the language-model learner for one cycle (first argument 1) with learning rate 1e-2.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,7.820796,6.333888,0.130485


In [0]:
# Save the fine-tuned encoder under the name 'ft_enc' so the classifier can load it later.
learn.save_encoder('ft_enc')

In [0]:
# Build a classifier from the data_clas object created earlier and load the
# fine-tuned encoder saved as 'ft_enc'.
# Note: 'learn' is rebound here — from this cell on it refers to the classifier,
# not the language-model learner.

learn = text_classifier_learner(data_clas, drop_mult=0.7)
learn.load_encoder('ft_enc')

In [19]:
# Fit the classifier for one cycle with learning rate 1e-2 (same settings as the language model).

learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy
1,0.508184,0.272862,0.938819


In [20]:
# Predictions can be pulled straight out of the learner object
# (for the validation set, per the original author's note).

# get predictions
preds, targets = learn.get_preds()

# Predicted class = index of the highest score in each row of preds.
predictions = np.argmax(preds, axis = 1)

# Confusion matrix. The axes are named explicitly so the table reads on its own
# instead of showing pandas' default 'row_0' / 'col_0' headers.
pd.crosstab(predictions, targets, rownames=['predicted'], colnames=['actual'])

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,211,6
1,23,234
