### Goal: analyze sentiment and build a language model 

In [1]:
from fastai.text import *
import html 

### Data Processing 

The goal is to write the data to CSV files with no header row and no index column, where the first column is the label (0 = neg, 1 = pos) and the second column is the review text.

In [2]:
# Define the relevant paths and constants.

BOS = "xbos"  # marker placed at the start of every document
FLD = "fld"   # marker placed in front of each text field of a document

PATH = Path("../../Data/imdb/aclImdb")            # dataset root; train/ and test/ are read from here
CLASS_PATH = Path("../../Data/imdb/imdb_class/")  # files for the sentiment (classification) model
LM_PATH = Path("../../Data/imdb/imdb_lm")         # files for the language model

# parents=True: create missing intermediate directories instead of raising
# FileNotFoundError on a fresh checkout; exist_ok=True keeps the cell re-runnable.
CLASS_PATH.mkdir(parents=True, exist_ok=True)
LM_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Possible classes: negative, positive, and unsupervised (reviews without a known sentiment).
CLASSES = ['neg', 'pos', 'unsup']

def get_texts(path, encoding="utf-8"):
    """Read every review file below ``path/<class>`` for each class in CLASSES.

    Parameters
    ----------
    path : pathlib.Path
        Directory holding one sub-directory per entry of CLASSES.
    encoding : str, optional
        Codec used to decode the files. Defaults to UTF-8, which is portable
        across platforms (a platform code page such as Windows "ANSI" is not).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The texts and, aligned with them, the labels. A label is the index of
        the file's class inside CLASSES.
    """
    texts, labels = [], []
    for idx, label in enumerate(CLASSES):
        # sorted(): glob order depends on the filesystem; a fixed order makes
        # a later seeded shuffle of these arrays reproducible across machines.
        for fname in sorted((path/label).glob("*.*")):
            # read_text opens, reads and closes the file (no leaked handle).
            texts.append(fname.read_text(encoding=encoding))
            labels.append(idx)
            if len(texts) % 1000 == 0:
                print(len(texts))  # progress: running count of files read
    return np.array(texts), np.array(labels)

# Read the reviews below train/ and test/; the test/ reviews serve as the
# validation set throughout the rest of the notebook.
trn_texts, trn_labels = get_texts(PATH/'train')
val_texts, val_labels = get_texts(PATH/'test')

# Last expression of the cell: shows the number of documents in each split.
len(trn_texts), len(val_texts)

### Sentiment Model Data

In [5]:
# Shuffle both sets. The seed is fixed and the two permutations are drawn in
# this order (train first, then validation), so a re-run yields the same order.
np.random.seed(42)
trn_idx = np.random.permutation(len(trn_texts))
val_idx = np.random.permutation(len(val_texts))

# Apply the same permutation to the texts and their labels so they stay aligned.
trn_texts, trn_labels = trn_texts[trn_idx], trn_labels[trn_idx]
val_texts, val_labels = val_texts[val_idx], val_labels[val_idx]

In [6]:
# Build the DataFrames with the column order (label, text).
col_names = ['label', 'text']
df_trn = pd.DataFrame({'text': trn_texts, 'label': trn_labels}, columns=col_names)
df_val = pd.DataFrame({'text': val_texts, 'label': val_labels}, columns=col_names)

# Write header-less, index-less CSVs. Label 2 is 'unsup' (reviews without a
# sentiment label), so those rows are left out of the classifier training set.
df_trn[df_trn['label'] != 2].to_csv(CLASS_PATH/'train.csv', header=False, index=False)
df_val.to_csv(CLASS_PATH/'val.csv', header=False, index=False)

# classes.txt lists the classes, one per line. write_text/read_text close the
# file themselves, so the content is flushed to disk before it is read back.
(CLASS_PATH/'classes.txt').write_text(''.join(f'{o}\n' for o in CLASSES))
(CLASS_PATH/'classes.txt').read_text().splitlines(keepends=True)

['neg\n', 'pos\n', 'unsup\n']

### Language Model Data

In [7]:
# Pool every review from both sets and split it 90% / 10% for the language model.
# random_state pins the split so it does not depend on the state of the global
# NumPy RNG, i.e. on which cells happened to run before this one.
trn_split, val_split = sklearn.model_selection.train_test_split(
    np.concatenate([trn_texts, val_texts]), test_size=0.1, random_state=42)
len(trn_split), len(val_split)

(90000, 10000)

In [8]:
# Keep the CSV layout consistent with the classifier files (label first, text
# second) by filling the label column with zeros.
df_trn = pd.DataFrame({'label': [0] * len(trn_split), 'text': trn_split}, columns=col_names)
df_val = pd.DataFrame({'label': [0] * len(val_split), 'text': val_split}, columns=col_names)

# No header row and no index column, as for the classifier CSVs.
df_trn.to_csv(LM_PATH/'train.csv', header=False, index=False)
df_val.to_csv(LM_PATH/'val.csv', header=False, index=False)

### Tokenizing and Numericalizing Language Model 

In [57]:
chunksize = 24000  # rows per chunk; passed to pd.read_csv so the CSV is iterated chunk by chunk

re1 = re.compile(r'  +')  # matches a run of two or more spaces


def fixup(x):
    """Clean up one raw document string.

    Applies a fixed sequence of literal substitutions (left-over entity
    fragments, ``<br />`` tags, escaped newlines/quotes, ``@.@``/``@-@``
    markers, backslashes), then HTML-unescapes the result and collapses every
    run of two or more spaces into a single space.
    """
    # Order matters: each substitution sees the output of the previous one.
    substitutions = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '),
        ('#36;', '$'), ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"),
        ('\\"', '"'), ('<unk>', 'u_n'), (' @.@ ', '.'), (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)


def get_texts(df, n_lbls=1):
    """
    Tokenize one DataFrame chunk and return ``(tokens, labels)``.

    The first ``n_lbls`` columns are read as int64 labels. Column ``n_lbls``
    is the main text; it is prefixed with a newline, the BOS marker and
    ``FLD 1``. Every further column is appended as an extra field introduced
    by ``FLD`` and the number ``column_index - n_lbls``. Each assembled string
    is cleaned with ``fixup`` before tokenization.

    NOTE(review): this definition reuses the name of the earlier
    ``get_texts(path)`` file reader and replaces it once this cell has run.
    NOTE(review): the first extra column is numbered 1, the same number as the
    main text field -- confirm that this is intended.
    """
    label_block = df.iloc[:, range(n_lbls)]
    labels = label_block.values.astype(np.int64)

    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)
    # Additional text columns (e.g. intro, conclusion), if the chunk has any.
    for col in range(n_lbls + 1, len(df.columns)):
        texts += f' {FLD} {col - n_lbls} ' + df[col].astype(str)
    cleaned = list(texts.apply(fixup).values)

    tok = Tokenizer().proc_all_mp(partition_by_cores(cleaned))
    return tok, list(labels)

def get_all(df, n_lbls):
    """Tokenize every chunk yielded by ``df`` and concatenate the results.

    ``df`` is iterated chunk by chunk; each chunk is handed to ``get_texts``
    together with ``n_lbls``. The chunk index is printed as a progress marker.
    Returns ``(tokens, labels)`` as two flat lists covering all chunks.
    """
    all_tok, all_labels = [], []
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)
        chunk_tok, chunk_labels = get_texts(chunk, n_lbls)
        all_tok.extend(chunk_tok)
        all_labels.extend(chunk_labels)
    return all_tok, all_labels

In [58]:
# Re-open the language-model CSVs as chunked readers (chunksize rows at a time).
# header=None because the files were written without a header row.
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'val.csv', header=None, chunksize=chunksize)

# Tokenize every chunk; the 1 is the number of leading label columns.
# get_all prints the index of each chunk as it is processed.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)


0
1
2
3
0


In [60]:
# Cache the tokenized documents so tokenization does not have to be repeated.
(LM_PATH/'tmp').mkdir(exist_ok=True)
# The documents have different lengths, so the nested lists are ragged. An
# explicit object array is required for that on current NumPy releases and
# matches what older releases produced implicitly.
np.save(LM_PATH/'tmp'/'tok_trn.npy', np.array(tok_trn, dtype=object))
np.save(LM_PATH/'tmp'/'tok_val.npy', np.array(tok_val, dtype=object))

In [3]:
# Reload the cached tokens. They are stored as object arrays (one token list
# per document), which np.load only reads when allow_pickle=True is passed.
# Only enable this for files produced by this notebook: unpickling untrusted
# data can execute arbitrary code.
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

In [4]:
# Numericalize, step 1: count how often each token occurs in the training documents.
freq = Counter(token for doc in tok_trn for token in doc)
# Display the 25 most frequent tokens with their counts.
freq.most_common(25)

[('the', 1208310),
 ('.', 992858),
 (',', 986179),
 ('and', 588051),
 ('a', 584103),
 ('of', 524990),
 ('to', 485562),
 ('is', 393480),
 ('it', 341658),
 ('in', 337461),
 ('i', 308662),
 ('this', 270704),
 ('that', 261284),
 ('"', 237440),
 ("'s", 221400),
 ('-', 188072),
 ('was', 180574),
 ('\n\n', 179647),
 ('as', 165776),
 ('with', 159485),
 ('for', 159023),
 ('movie', 157898),
 ('but', 150419),
 ('film', 144111),
 ('you', 124600)]

In [5]:
# Limit the vocabulary: at most max_vocab of the most common tokens, and only
# those occurring strictly more than min_freq times.
max_vocab = 60000
min_freq = 2

# itos (index -> token): '_unk_' is index 0 and '_pad_' is index 1, followed
# by the retained tokens in descending frequency order.
itos = ['_unk_', '_pad_'] + [o for o, c in freq.most_common(max_vocab) if c > min_freq]

# stoi (token -> index): tokens that are not in the vocabulary map to 0 ('_unk_').
stoi = collections.defaultdict(lambda: 0, ((tok, idx) for idx, tok in enumerate(itos)))

len(itos)

60002

In [6]:
# Replace every token by its vocabulary index. Documents differ in length, so
# the result is a ragged list of lists: dtype=object keeps one id list per
# document and is required for ragged input on current NumPy releases.
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn], dtype=object)
val_lm = np.array([[stoi[o] for o in p] for p in tok_val], dtype=object)

In [7]:
#Visualize tokenized and numericalized data 
#' '.join(str(o) for o in tok_trn[0])
#' '.join(str(o) for o in trn_lm[0])

In [10]:
# The ids are meaningless without the vocabulary that produced them, so itos
# is saved alongside the numericalized documents.
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
# The context manager closes (and flushes) the pickle file before any later
# cell reads it back.
with (LM_PATH/'tmp'/'itos.pkl').open('wb') as f:
    pickle.dump(itos, f)

In [4]:
# Reload the numericalized documents. They are object arrays (one id list per
# document), which np.load only reads when allow_pickle=True is passed.
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy', allow_pickle=True)
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy', allow_pickle=True)
# Reload the vocabulary; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook.
with (LM_PATH/'tmp'/'itos.pkl').open('rb') as f:
    itos = pickle.load(f)

### Pretrained Network

In [5]:
# The model must use the same sizes as the pretrained network.
# em_sz is the embedding width; nh and nl are handed to get_model later
# (presumably hidden size and number of layers -- confirm against fastai).
em_sz, nh, nl = 400, 1150, 3

# Folder of the pretrained model and the weights file inside it.
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

# Dict of layer name -> weights. The map_location callable returns each
# storage unchanged, which keeps the loaded tensors on the CPU.
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc:storage)
enc_wgts = to_np(wgts['0.encoder.weight'])
# Mean over the rows of the embedding matrix; used as the embedding for
# tokens that do not occur in the pretrained vocabulary.
row_m = enc_wgts.mean(0)

# Load the pretrained model's itos and build the matching stoi; tokens that
# are missing from it map to -1. The context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
with (PRE_PATH/'itos_wt103.pkl').open('rb') as f:
    itos2 = pickle.load(f)
stoi2 = collections.defaultdict(lambda:-1 , {v:o for o,v in enumerate(itos2)})

In [6]:
# Re-order the pretrained embedding rows so that row i belongs to itos[i].
new_w = np.zeros((len(itos), em_sz), dtype=np.float32)
for i, w in enumerate(itos):
    r = stoi2[w]  # index of this IMDB token in the wikitext vocabulary, or -1
    if r >= 0:
        new_w[i] = enc_wgts[r]
    else:
        # Token unknown to the pretrained model: fall back to the mean embedding.
        new_w[i] = row_m

In [7]:
# Replace the pretrained embedding matrix with the one re-ordered for this vocabulary.
wgts['0.encoder.weight'] = T(new_w)

# Because of the way embedding dropout is done, it needs a separate copy.
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))

# The decoder uses the same weights (stored here as another copy).
wgts['1.decoder.weight'] = T(np.copy(new_w))


### Language Model

In [8]:
# Hyper-parameters for language-model fine-tuning.
wd = 1e-7  # passed to learner.fit as wds (presumably weight decay -- confirm)
bptt = 50  # passed to the data loaders and LanguageModelData as bptt
bs = 30  # batch size passed to the data loaders and LanguageModelData
# Adam with explicitly chosen betas; the remaining optimizer arguments are supplied later.
opt_fn = partial(optim.Adam, betas=(0.8,0.99))

# Total number of token ids across all training documents.
t= len(np.concatenate(trn_lm))

In [9]:
# Concatenate all documents of each split into one long id sequence and wrap
# it in a LanguageModelLoader using the batch size and bptt defined earlier.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
# NOTE(review): the positional 1 is presumably the padding index ('_pad_' sits
# at itos[1]) -- confirm against LanguageModelData's signature.
md = LanguageModelData(PATH, 1, len(itos), trn_dl, val_dl, bs=bs, bptt=bptt)

In [10]:
# Five dropout rates, all scaled by a common factor of 0.7.
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

# Build the learner with the sizes of the pretrained network; each entry of
# `drops` feeds one of the dropout keyword arguments.
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

# Report accuracy in addition to the loss.
learner.metrics = [accuracy]
# NOTE(review): presumably leaves only the last layer group trainable --
# confirm freeze_to semantics in fastai.
learner.freeze_to(-1)

In [11]:
# Load the pretrained weights (with the embedding matrices re-ordered for this
# vocabulary) into the model.
learner.model.load_state_dict(wgts)

In [12]:
# Base learning rate; lrs is the value handed to the fit / lr_find calls.
lr=1e-3
lrs = lr
# First fit, at half the base learning rate, run while the model is still
# frozen by the earlier freeze_to(-1) call.
# NOTE(review): use_clr=(32,2) presumably configures fastai's cyclical
# learning-rate schedule -- confirm the meaning of the two values.
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)


HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss   accuracy                       
    0      4.755649   4.505801   0.255577  



[array([4.5058]), 0.25557697920344635]

In [13]:
# Checkpoint the model after the first fit under the name 'lm_last_ft'.
learner.save('lm_last_ft')


In [14]:
# Restore the 'lm_last_ft' checkpoint (lets the notebook resume from here).
learner.load('lm_last_ft')


In [None]:
# NOTE(review): unfreeze() presumably makes every layer trainable -- confirm.
learner.unfreeze()
# Learning-rate search between lrs/10 and lrs*10 with linear spacing, followed
# by a plot of what the scheduler recorded.
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)
learner.sched.plot()

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

 26%|██▌       | 4328/16686 [14:47<42:14,  4.88it/s, loss=4.65]

In [None]:
# Main fine-tuning run at the full base learning rate: one cycle with cycle_len=15.
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=15)


In [None]:
# Save two different artifacts because only the RNN encoder portion will be
# reused later; the decoder is what makes the model a language model.
learner.save('lm1')
learner.save_encoder('lm1_enc')


In [None]:
# Plot the loss tracked by the learner's scheduler (presumably the training
# loss per iteration -- confirm).
learner.sched.plot_loss()


### Tokenizing and Numericalizing Classification Model 


In [None]:
# Open the classifier CSVs as chunked readers. The directory constant defined
# in this notebook is CLASS_PATH, and the validation file written there is
# 'val.csv'; header=None because the files have no header row.
df_trn = pd.read_csv(CLASS_PATH/'train.csv', header=None,
                     chunksize=chunksize)
df_val = pd.read_csv(CLASS_PATH/'val.csv', header=None,
                     chunksize=chunksize)
# Tokenize every chunk; the 1 is the number of leading label columns.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

In [None]:
# Cache the classifier tokens and labels under CLASS_PATH/tmp.
(CLASS_PATH/'tmp').mkdir(exist_ok=True)
# Token lists are ragged (documents differ in length), hence the object arrays.
np.save(CLASS_PATH/'tmp'/'tok_trn.npy', np.array(tok_trn, dtype=object))
np.save(CLASS_PATH/'tmp'/'tok_val.npy', np.array(tok_val, dtype=object))
np.save(CLASS_PATH/'tmp'/'trn_labels.npy', trn_labels)
# The classifier section loads val_labels.npy as well, so it has to be saved too.
np.save(CLASS_PATH/'tmp'/'val_labels.npy', val_labels)


In [None]:
# Reload the cached classifier tokens (object arrays, so allow_pickle=True is
# needed) from CLASS_PATH, the directory constant this notebook defines.
tok_trn = np.load(CLASS_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(CLASS_PATH/'tmp'/'tok_val.npy', allow_pickle=True)
# Reuse the language model's vocabulary so that the ids match the fine-tuned
# encoder. The context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this notebook.
with (LM_PATH/'tmp'/'itos.pkl').open('rb') as f:
    itos = pickle.load(f)
# Tokens outside the vocabulary map to index 0.
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [None]:
# Numericalize the classifier documents with the language model's vocabulary.
# The id lists are ragged, so they are stored as object arrays.
trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn], dtype=object)
val_clas = np.array([[stoi[o] for o in p] for p in tok_val], dtype=object)
# Cache them under CLASS_PATH, the directory constant this notebook defines.
np.save(CLASS_PATH/'tmp'/'trn_ids.npy', trn_clas)
np.save(CLASS_PATH/'tmp'/'val_ids.npy', val_clas)

### Classifier

In [None]:
# Reload the numericalized classifier documents from CLASS_PATH (the directory
# constant this notebook defines). They are object arrays with one id list per
# document, so allow_pickle=True is required.
trn_clas = np.load(CLASS_PATH/'tmp'/'trn_ids.npy', allow_pickle=True)
val_clas = np.load(CLASS_PATH/'tmp'/'val_ids.npy', allow_pickle=True)

# Labels are plain integer arrays; squeeze drops their length-1 axes.
# NOTE: val_labels.npy must exist in CLASS_PATH/tmp next to trn_labels.npy.
trn_labels = np.squeeze(np.load(CLASS_PATH/'tmp'/'trn_labels.npy'))
val_labels = np.squeeze(np.load(CLASS_PATH/'tmp'/'val_labels.npy'))