In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *

import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from fastai.rnn_reg import *
from fastai.rnn_train import *
from fastai.nlp import *
from fastai.lm_rnn import *

import dill as pickle
import spacy


# Root directory of the labelled Enron e-mail data.
# NOTE(review): absolute, machine-specific path — change it to your own copy of the data.
PATH='/home/wk/myProjects/data/Enron/tag/'

# Sub-directories (relative to PATH) for the training and held-out documents.
TRN_PATH = 'train/'
VAL_PATH = 'test/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

bs = 32      # batch size handed to TextData.from_splits
bptt = 500   # presumably the back-prop-through-time length; passed to get_model
em_sz = 300  # size of each embedding vector
nh = 500     # number of hidden activations per layer
nl = 3       # number of layers

# The 'test/' directory is used for both the validation and the test entry.
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)

# Load the pickled text field used as `text_field` for the datasets below.
# The file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): unpickling runs arbitrary code — only load a TEXT.pkl you created yourself.
with open(f'{PATH}models/TEXT.pkl', 'rb') as text_file:
    TEXT = pickle.load(text_file)

# Adam optimizer factory with betas=(0.7, 0.99).
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [2]:
class EmailDataset(torchtext.data.Dataset):
    """Labelled e-mail corpus read from one sub-directory per label.

    Every ``*.txt`` file found under ``<path>/<label>/`` becomes one example:
    its ``text`` is the file content with newlines replaced by spaces, and its
    ``label`` is the name of the sub-directory it was found in.
    """

    def __init__(self, path, text_field, label_field,
                 labels=('deleted_items', 'sent'), **kwargs):
        """Read all examples for one split.

        Args:
            path: directory that contains one sub-directory per label.
            text_field: field attached to the ``text`` attribute.
            label_field: field attached to the ``label`` attribute.
            labels: names of the sub-directories to read; each name doubles
                as the label value. Defaults to the two folders used so far.
            **kwargs: forwarded unchanged to the parent constructor.

        Raises:
            AssertionError: if a label directory contains no ``*.txt`` file.
        """
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        for label in labels:
            fnames = glob(os.path.join(path, label, '*.txt'))
            # Trace of the directory being scanned (printed once per label).
            print(path)
            assert fnames, f"can't find any '*.txt' files under {path}/{label}"
            for fname in fnames:
                # Flatten each e-mail to a single line before it is turned into an example.
                with open(fname, 'r') as myfile:
                    text = myfile.read().replace('\n', ' ')
                examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Return the length of the example's ``text`` (used as the sort key)."""
        return len(ex.text)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Create the train and test datasets via the parent ``splits``.

        ``root`` is passed as the first positional argument of the parent
        ``splits``; ``validation`` is always passed as ``None``. Extra keyword
        arguments are forwarded unchanged.
        """
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In [3]:
# Field for the label column; labels are not sequences (sequential=False).
EMAIL_LABEL = data.Field(sequential=False)

# Build the datasets from the 'train' and 'test' folders under PATH.
splits = EmailDataset.splits(
    text_field=TEXT,
    label_field=EMAIL_LABEL,
    root=PATH,
    train="train",
    test="test",
)

/home/wk/myProjects/data/Enron/tag/train
/home/wk/myProjects/data/Enron/tag/train
/home/wk/myProjects/data/Enron/tag/test
/home/wk/myProjects/data/Enron/tag/test


In [4]:
# Wrap the dataset splits in a TextData object, passing the batch size `bs`.
md2 = TextData.from_splits(PATH, splits, bs)

In [5]:
# Spot-check example 1480 of the first split: show its label and its tokens joined with spaces.
t = splits[0].examples[1480]
t.label, ' '.join(t.text)

('deleted_items',
 'what happened to you at cat"s? we could not find you. ckl -----original message----- from: @@sndr_email@@ sent: monday, october 22, 2001 10:59 am to: @@recr_email1@@ subject: re: thestreet: trusts keeping enron off balance this is unbelievable. seems like the worst is yet to come. if they have to end up issuing more stock this thing can go to $10. -----original message----- from: @@recr_email1@@ sent: monday, october 22, 2001 10:27 am to: @@sndr_email@@ ; bass, eric; plauche, stephen subject: fw: thestreet: trusts keeping enron off balance hey plauch, still think it is impossible for this stock to take on a one-handle. just read below. ckl -----original message----- from: @@sndr_email@@ sent: monday, october 22, 2001 8:33 am to: @@recr_email1@@ subject: thestreet: trusts keeping enron off balance << file: trusts keeping enron off balance.htm >>')

In [6]:
# Spot-check example 39200 of the first split: show its label and its tokens joined with spaces.
t = splits[0].examples[39200]
t.label, ' '.join(t.text)

('sent',
 'i\'ve heard that the turbines are supposed to be moved out of westlb today. the contract with ge is not quite finished, although it is close. kay ---------------------- forwarded by kay mann/corp/enron on @@othr_dt@@ 08:54 am --------------------------- enron global finance from: @@sndr_email@@ :44 am to: @@recr_email1@@ @@recr_email2@@ @@recr_email3@@ : subject: austin project purchase option ---------------------- forwarded by catherine clark/hou/ect on @@othr_dt@@ 08:45 am --------------------------- "taylor, rob" < @@othr_em@@ > on @@othr_dt@@ 12:55:59 pm to: @@recr_email1@@ @@recr_email2@@ @@recr_email3@@ : subject: austin project purchase option attached is a revised draft of the purchase option assignment and assumption agreement for the lm6000s to be used in the austin project. the attached reflects the comments of winston & strawn. also attached is an execution copy of the agreement. serial numbers and other identifying information should be inserted on schedule 1 t

In [7]:
# Build the model from the data object using the sizes set in the config cell.
# NOTE(review): the positional 1500 is presumably the maximum sequence length
# expected by get_model — confirm against the fastai signature and name it.
m3 = md2.get_model(opt_fn, 1500, bptt, emb_sz=em_sz, n_hid=nh, n_layers=nl,
                   dropout=0.1, dropouti=0.4, wdrop=0.5, dropoute=0.05, dropouth=0.3)
# Regularisation function: seq2seq_reg with alpha=2 and beta=1.
m3.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
# Load the previously saved encoder named 'adam3_10_enc'
# (plain string: the original f-string had no placeholders).
m3.load_encoder('adam3_10_enc')

In [8]:
# Presumably the gradient-clipping threshold used by the learner — confirm.
m3.clip = 25.0
# Five learning rates, smallest first (presumably one per layer group — confirm).
lrs = np.array([1e-4, 1e-4, 1e-4, 1e-3, 1e-2])

In [9]:
# Stage 1: call freeze_to(-1) (presumably leaves only the last layer group trainable — confirm),
# then fit once at the base learning rates, reporting accuracy.
m3.freeze_to(-1)
m3.fit(lrs, 1, metrics=[accuracy])
# Stage 2: unfreeze and fit once more at twice the base learning rates.
m3.unfreeze()
m3.fit(lrs*2, 1, metrics=[accuracy], cycle_len=1)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      0.665167   0.757712   0.531123  



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      0.526882   0.573084   0.572593  



[array([0.57308]), 0.5725932383262583]

In [10]:
# Longer run at 1.5x the base learning rates (the recorded run shows 8 epochs).
# cycle_save_name='enron_cls' is the name the later load_cycle('enron_cls', 3) call refers to.
m3.fit(lrs*1.5, 4, metrics=[accuracy], cycle_len=2, cycle_save_name='enron_cls')

HBox(children=(IntProgress(value=0, description='Epoch', max=8), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                      
    0      0.329249   0.123595   0.968644  
    1      0.279512   0.263209   0.959642                      
    2      0.29833    0.092091   0.977908                      
    3      0.284962   0.082588   0.977989                      
    4      0.290142   0.111513   0.973378                      
    5      0.253702   0.092737   0.973733                      
    6      0.267484   0.150458   0.974341                      
    7      0.234196   0.07471    0.98029                       



[array([0.07471]), 0.9802902733258252]

In [19]:
# Reload the weights saved under 'enron_cls'; the 3 presumably selects the saved cycle — confirm.
# NOTE(review): execution count jumps from In [10] to In [19]; re-check with Restart & Run All.
m3.load_cycle('enron_cls', 3)

In [20]:
# Accuracy computed from whatever predict_with_targs() returns, unpacked as positional arguments.
# NOTE(review): the recorded result (0.9148) is lower than the 0.9803 shown at the end of
# training — check which data and which saved weights this evaluates.
accuracy_np(*m3.predict_with_targs())

0.9148366363149437

In [16]:
# NOTE(review): leftover IPython source introspection, and run out of order (In [16] after In [20]); remove before sharing.
??m3

In [21]:
# NOTE(review): leftover IPython source introspection for predict_with_targs; remove before sharing.
??m3.predict_with_targs

In [22]:
# Keep the raw result of predict_with_targs() for inspection in the next cell.
a=m3.predict_with_targs()

In [26]:
# Length of the second element of the result (7468 in the recorded run) — presumably the number of targets.
len(a[1])

7468