In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.text import *
import numpy as np
import pickle
import sentencepiece as spm
from tqdm import tqdm

In [3]:
# Show the fastai and torch versions this notebook was executed with.
import fastai, torch
fastai.__version__ , torch.__version__

('1.0.57', '1.0.0')

In [4]:
!nvidia-smi

Sat Aug  8 08:24:03 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 430.64       Driver Version: 430.64       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-PCIE...  Off  | 00000001:00:00.0 Off |                    0 |
| N/A   34C    P0    41W / 250W |     11MiB / 16160MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [5]:
# Select CUDA device 0 as the current device for this process.
torch.cuda.set_device(0)

In [6]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enma-2/language_model


In [7]:
# Base directory that every relative path in this notebook is resolved against.
path = Path('./')

In [8]:
def handle_all_caps(t: str) -> str:
    """Run fastai's `replace_all_caps` token rule over a raw string.

    The text is split on whitespace, the tokens are handed to
    `replace_all_caps`, and the result is re-joined with single spaces.
    """
    return ' '.join(replace_all_caps(t.split()))

def handle_upper_case_first_letter(t: str) -> str:
    """Run fastai's `deal_caps` token rule over a raw string.

    The text is split on whitespace, the tokens are handed to `deal_caps`,
    and the result is re-joined with single spaces.
    """
    return ' '.join(deal_caps(t.split()))

def lower_case_everything(t: str) -> str:
    """Return `t` converted to lower case."""
    lowered = t.lower()
    return lowered

In [9]:
class CodeMixedMalayalamTokenizer(BaseTokenizer):
    """Tokenizer backend that segments text with a SentencePiece model.

    The model file is located relative to the notebook-level `path` and is
    loaded each time an instance is constructed.
    """

    def __init__(self, lang:str):
        self.lang = lang
        model_file = str(path/"../tokenizer/mlen_spm.model")
        processor = spm.SentencePieceProcessor()
        processor.Load(model_file)
        self.sp = processor

    def tokenizer(self, t:str) -> List[str]:
        """Encode `t` into its list of SentencePiece pieces."""
        pieces = self.sp.EncodeAsPieces(t)
        return pieces

In [10]:
# Load the trained SentencePiece model and list its pieces in id order, so that
# position i of `itos` is exactly the piece SentencePiece assigns to id i.
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../tokenizer/mlen_spm.model"))
# Ask the model for its own size rather than hard-coding 15000: the list then
# always covers the whole vocabulary, even if the model is retrained with a
# different vocab size.
itos = [sp.IdToPiece(i) for i in range(sp.GetPieceSize())]

In [11]:
# Sanity check: number of pieces collected from the SentencePiece model.
len(itos)

15000

In [12]:
# Peek at the first 20 entries of the piece list.
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 '▁the',
 ',',
 'ർ',
 'ൽ',
 's',
 'ൻ',
 '▁of',
 '▁',
 'ൾ',
 '▁in']

In [13]:
# Wrap the SentencePiece piece list in a fastai Vocab, so a token's index in
# the vocab is its SentencePiece id.
# 15,000 is the vocab size that we chose in sentencepiece.
# NOTE(review): the printed `itos` has 'xxbos' at index 1 and 'xxpad' at index 3,
# while the model summary further down reports `padding_idx=1` — confirm the
# special-token order is what fastai expects.
mlen_vocab = Vocab(itos)

In [14]:
# Build the fastai Tokenizer, using CodeMixedMalayalamTokenizer as the token-splitting backend.
tokenizer = Tokenizer(lang='mlen', tok_func=CodeMixedMalayalamTokenizer)

In [15]:
# Add the custom pre-tokenization rules after the ones already configured.
# The list is rebuilt instead of appended to in place, so that
#   * re-running this cell does not register the same rule twice, and
#   * the list object the tokenizer was created with is left untouched
#     (NOTE(review): it is presumably fastai's shared default rule list — confirm).
# NOTE(review): lower-casing is listed first; if rules run in list order, the two
# caps handlers only ever see lower-cased text — confirm that is intended.
custom_pre_rules = [lower_case_everything, handle_all_caps, handle_upper_case_first_letter]
tokenizer.pre_rules = [
    rule for rule in tokenizer.pre_rules if rule not in custom_pre_rules
] + custom_pre_rules

In [16]:
# Inspect the tokenizer configuration: special tokens, pre-rules and post-rules.
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html>,
  <function fastai.text.transform.replace_rep>,
  <function fastai.text.transform.replace_wrep>,
  <function fastai.text.transform.spec_add_spaces>,
  <function fastai.text.transform.rm_useless_spaces>,
  <function __main__.lower_case_everything>,
  <function __main__.handle_all_caps>,
  <function __main__.handle_upper_case_first_letter>],
 [<function fastai.text.transform.replace_all_caps>,
  <function fastai.text.transform.deal_caps>])

In [17]:
# Smoke-test the full tokenizer pipeline on one short code-mixed sentence.
tokens = tokenizer.process_all(['Tell me about TOUR self, mujhe jaanna hai'])

In [18]:
# Concatenate the pieces of the first (and only) text to eyeball the segmentation.
''.join(tokens[0])

'▁tell▁me▁about▁tour▁self,▁mujhe▁jaanna▁hai'

In [19]:
# Display the base path used for the dataset location below.
path

PosixPath('.')

In [20]:
# Build the language-model DataBunch from the train/valid text folders, reusing
# the SentencePiece-backed tokenizer and the fixed vocabulary defined earlier.
data_lm = TextLMDataBunch.from_folder(
    path=path/'../dataset_preparation',
    train='train_uncased',
    valid='valid_uncased',
    vocab=mlen_vocab,
    tokenizer=tokenizer,
)

In [21]:
# Batch size in effect (none was passed explicitly when building the DataBunch).
data_lm.batch_size

64

In [22]:
# data_lm.save()

In [23]:
# Display a few sample sequences from the DataBunch to verify tokenization.
data_lm.show_batch()

idx,text
0,▁the ▁fort ▁is ▁spread ▁over ▁an ▁area ▁of ▁about ▁7 ▁acres ▁at ▁a ▁height ▁of ▁about ▁150 ▁feet ▁above ▁sea ▁level . ▁etanu ▁nutanduk ൾ kku ▁mu ൻ p ▁chandragiri ▁pune ▁kola ttu nadi nteyu ▁tu lu nadi nteyu ▁ati ർ tti airunnu . ▁tu lu nadi n ▁vijaynagar ▁samrajyam ▁kilo tak i appo ൾ ▁kola ttu rajakanma ർ k ▁chandragiri ute ▁a dhi sh tva ▁nashtapettu . ▁16 -
1,"de ▁in ▁tamil ▁as ▁bharathan ▁aa vara m ▁poo . ▁e ൺ patu kaute ▁tutkkati ൽ ▁bharat ൻ ▁pal ▁yu g ma ▁chalachchitra slu ▁ni ർ mmitchu . ▁‘ cha maram , ▁m ർ m maram , ▁pass ng ൾ , ▁ente ▁upa san ' ▁enniv ▁iti ൽ ▁chilat an . ▁iv ▁kalapa r mai ▁aduttu parya tta kka v ▁allengi l ▁vanijy ▁vijaya ng ൾ ▁irunnu . ▁ma lyalachalachitra"
2,"▁high ▁cho le ster ol ▁in ▁the ▁body . ▁regular ▁exercise ▁and ▁diet ▁can ▁lower ▁cho le ster ol . ▁for ▁some , ▁medica tion ▁may ▁be ▁effective . ▁this ▁condition ▁can ▁be ▁detect ed ▁by ▁accident ▁or ▁duri ng ▁an ▁ult ra so und ▁sc an ▁of ▁the ▁abdomen . ▁live r ▁function ▁test s ▁that ▁examine ▁the ▁function ▁of ▁the ▁live r ▁can ▁help ▁determine ▁the ▁spread ▁of"
3,"▁on ▁28 ▁february ▁2013 ▁due ▁to ▁in ability ▁to ▁fulfill ▁his ▁res po n si bi l ities ▁due ▁to ▁old ▁age . ▁ce le s tine ▁v , ▁who ▁resigned ▁in ▁12 94 , ▁was ▁the ▁last ▁pope ▁to ▁resign ▁vol unt ari ly . ▁after ▁that , ▁gre go rio s ▁ xi i i ▁resigned ▁at ▁the ▁end ▁of ▁the ▁in famous ▁western ▁partition , ▁but ▁was"
4,s ▁ahl ▁al - bay t ▁khu ർ aa ൻ ▁• ▁nabi charya ▁• ▁hadith ▁fi kh h ▁• ▁sharia t ▁hanafi ▁malik i ▁sha fi ▁hambali ▁sunni ▁• ▁shia ▁suphy ▁• ▁salph y ▁prasthana ▁masjid ▁al - haram ▁• ▁masjid ▁al - na ba wi ▁masjid ▁al - a q sa ▁kal ▁• ▁tat va chint ▁vastuvidy ▁• ▁muslim ▁palli k ൾ ▁hijera ▁wa ർ sha ▁• ▁aaghosha ng


In [24]:
# Number of batches in the training DataLoader.
len(data_lm.train_dl)

1991

In [25]:
# Number of batches in the validation DataLoader.
len(data_lm.valid_dl)

948

In [26]:
# The DataBunch vocabulary should be the same size as the SentencePiece piece list.
len(data_lm.vocab.itos)

15000

In [27]:
# AWD-LSTM language model created without pretrained weights (pretrained=False).
learn = language_model_learner(data_lm, AWD_LSTM, pretrained=False)

In [28]:
# Force a garbage-collection pass; the displayed value is what gc.collect() returns.
# NOTE(review): `gc` is not imported explicitly in this notebook — presumably it
# arrives via `from fastai.text import *`; confirm.
gc.collect()

0

In [29]:
# Display the model architecture.
learn.model

SequentialRNN(
  (0): AWD_LSTM(
    (encoder): Embedding(15000, 400, padding_idx=1)
    (encoder_dp): EmbeddingDropout(
      (emb): Embedding(15000, 400, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(400, 1152, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(1152, 1152, batch_first=True)
      )
      (2): WeightDropout(
        (module): LSTM(1152, 400, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
      (2): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (decoder): Linear(in_features=400, out_features=15000, bias=True)
    (output_dp): RNNDropout()
  )
)

In [30]:
# Train with the one-cycle schedule (10 epochs, 1e-2 learning rate) and keep a
# 'best_model' checkpoint that is rewritten whenever valid_loss improves.
learn.fit_one_cycle(
    10,
    1e-2,
    callbacks=[
        callbacks.SaveModelCallback(learn, every='improvement', monitor='valid_loss', name='best_model'),
    ],
)

epoch,train_loss,valid_loss,accuracy,time
0,5.276674,5.313937,0.241897,03:19
1,4.879024,4.787274,0.273538,03:19
2,4.699088,4.745184,0.274538,03:19
3,4.659143,4.548,0.293113,03:22
4,4.380394,4.373798,0.311567,03:19
5,4.425856,4.213296,0.327607,03:19
6,4.068779,4.044174,0.347572,03:19
7,3.885477,3.908516,0.363612,03:19
8,3.86753,3.838514,0.372529,03:19
9,3.756617,3.825342,0.374248,03:19


Better model found at epoch 0 with valid_loss value: 5.313937187194824.
Better model found at epoch 1 with valid_loss value: 4.787274360656738.
Better model found at epoch 2 with valid_loss value: 4.745183944702148.
Better model found at epoch 3 with valid_loss value: 4.547999858856201.
Better model found at epoch 4 with valid_loss value: 4.37379789352417.
Better model found at epoch 5 with valid_loss value: 4.213296413421631.
Better model found at epoch 6 with valid_loss value: 4.0441741943359375.
Better model found at epoch 7 with valid_loss value: 3.9085159301757812.
Better model found at epoch 8 with valid_loss value: 3.8385143280029297.
Better model found at epoch 9 with valid_loss value: 3.8253417015075684.


In [31]:
# Reload the 'best_model' checkpoint saved by SaveModelCallback during training.
learn.load('best_model')

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (17336 items)
x: LMTextList
▁x x bo s ▁chandragiri ▁fort ▁is ▁located ▁on ▁the ▁banks ▁of ▁the ▁chandragiri ▁river , ▁southeast ▁of ▁kasaragod ▁district ▁in ▁north ▁kerala . ▁the ▁ru in ed ▁fort ▁is ▁a ▁beautiful ▁wind ow ▁to ▁the ▁river , ▁the ▁arabia n ▁sea ▁and ▁the ▁coconut ▁gro ves . ▁built ▁in ▁the ▁17 th ▁century , ▁the ▁fort ▁is ▁an ▁important ▁place ▁for ▁students ▁of ▁history ▁and ▁arch e ology . ▁the ▁fort ▁is ▁spread ▁over ▁an ▁area ▁of ▁about ▁7 ▁acres ▁at ▁a ▁height ▁of ▁about ▁150 ▁feet ▁above ▁sea ▁level . ▁etanu ▁nutanduk ൾ kku ▁mu ൻ p ▁chandragiri ▁pune ▁kola ttu nadi nteyu ▁tu lu nadi nteyu ▁ati ർ tti airunnu . ▁tu lu nadi n ▁vijaynagar ▁samrajyam ▁kilo tak i appo ൾ ▁kola ttu rajakanma ർ k ▁chandragiri ute ▁a dhi sh tva ▁nashtapettu . ▁16 - aam ▁nutand o e ▁vijaynagar ▁samrajyatine ▁shakti ▁ kshay ichu . ▁pinnite ▁be da nnu ർ ▁naikk anma ർ ▁ennariapadunn ▁i kke ri ▁naikk anma ർ ▁chandragiri ▁oru ▁swatantra ▁prad

In [32]:
# Text-generation settings for the sampling cell that follows.
# NOTE(review): the prompt is written in Malayalam script, whereas the sample
# batches shown earlier are romanized — confirm this prompt is intended.
TEXT = "മൈ നെയിം ഈസ്"
# Passed to learn.predict as its second argument.
# NOTE(review): presumably counts generated tokens (pieces), not whitespace-separated words — confirm.
N_WORDS = 40
# How many independent samples to draw.
N_SENTENCES = 2

In [33]:
# Draw N_SENTENCES continuations of TEXT and print them one per line.
generated = [learn.predict(TEXT, N_WORDS, temperature=0.9) for _ in range(N_SENTENCES)]
print("\n".join(generated))

മൈ നെയിം ഈസ് ൻ ? ▁ellam ▁kanunn ▁" fa ct ▁hi ൽ s ti s " ▁athva ▁i ▁pha l helm . ▁ dic ▁ja ൻ s , ▁a mp dy ൻ ▁ennigyne ▁na ൽ ka ppetunnat an . ▁ite ▁pinnite ▁sai fi
മൈ നെയിം ഈസ് ▁ ൻ ▁ , ▁ , a , ൻ , ▁ , - ▁ , ▁ , , , y . ▁matra ▁ennat ▁ennat ▁paddy ▁oru ▁pratyek ▁yantra ▁sam gri lute ▁savisheshat ▁enniva ▁pradhanpat ▁integrated at ▁athva ▁modern ▁us e


In [53]:
# Perplexity = exp(cross-entropy loss). 3.825342 is the valid_loss of the last
# epoch (epoch 9) in the training log, copied by hand.
# NOTE(review): this cell was executed out of order (In [53]); update the constant if the model is retrained.
np.exp(3.825342)

45.84847786475817

In [35]:
# Switch fastai's default device to CPU, put the model in eval mode, then export the learner.
# NOTE(review): presumably the CPU default is meant to make the exported learner
# usable on machines without a GPU — confirm.
defaults.device = torch.device('cpu')
learn.model.eval()
learn.export()

In [36]:
# Display the base path again.
path

PosixPath('.')

In [37]:
# Take the first sub-module of the model; per the printed architecture that is the AWD_LSTM encoder.
# NOTE(review): `get_model` presumably unwraps any parallel wrapper around the model — confirm.
encoder = get_model(learn.model)[0]

In [38]:
# Shape of the token-embedding matrix: (vocab size, embedding dimension).
encoder.state_dict()['encoder.weight'].shape

torch.Size([15000, 400])

In [39]:
# Pull the token-embedding matrix out of the encoder as a NumPy array.
# The tensor is moved to the CPU explicitly so this keeps working when the model
# lives on the GPU: recent PyTorch versions refuse to convert CUDA tensors to
# NumPy implicitly. The array may share memory with the (CPU) tensor.
embeddings = encoder.state_dict()['encoder.weight'].detach().cpu().numpy()

In [40]:
# Make sure `embeddings` is a plain NumPy array before handing it to pandas.
embeddings = np.array(embeddings)

In [41]:
# Each row is one token's embedding vector.
embeddings[0].shape

(400,)

In [42]:
# One DataFrame row per vocabulary entry, one column per embedding dimension.
df = pd.DataFrame(embeddings)

In [43]:
# Expect (vocab size, embedding dimension).
df.shape

(15000, 400)

In [44]:
# Write the embedding vectors as a tab-separated file with no header row and no index column.
# NOTE(review): presumably intended for an embedding-visualisation tool — confirm.
df.to_csv('ulmfit_embeddings.tsv', sep='\t', index=False, header=False)

In [45]:
# Preview the first few embedding rows.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,-0.433251,0.665263,0.378461,0.113008,-0.752892,0.607241,0.639262,0.0293,0.546293,0.011721,...,-0.400866,0.947655,0.491379,-0.141187,-0.025121,0.341715,0.395753,0.017537,0.3106,-0.256061
1,-0.119318,-0.046619,-0.106872,0.152677,0.211681,0.402921,0.264479,0.188654,0.24887,-0.022151,...,0.307206,0.438816,0.422814,-0.194155,-0.254277,-0.130695,0.052285,-0.129378,-0.459578,-0.228301
2,-0.097627,-0.042005,-0.076035,0.193454,0.264262,0.439931,0.303255,0.169267,0.248516,-0.019618,...,0.330348,0.397409,0.395272,-0.202634,-0.254562,-0.162562,0.029184,-0.122755,-0.471244,-0.218725
3,-0.122897,-0.010458,-0.069576,0.174884,0.191747,0.428591,0.281517,0.154555,0.252363,-0.028922,...,0.303281,0.411593,0.443173,-0.204942,-0.211248,-0.143436,0.035291,-0.158781,-0.44912,-0.246072
4,-0.131412,-0.065176,-0.118555,0.17579,0.228001,0.417385,0.270769,0.178421,0.247588,-0.013445,...,0.296731,0.444142,0.420692,-0.206707,-0.253003,-0.122248,0.038835,-0.153473,-0.45616,-0.217235


In [46]:
# Row count here must match the length of `itos` checked next.
df.shape

(15000, 400)

In [47]:
# Number of vocabulary pieces; should equal the number of embedding rows.
len(itos)

15000

In [48]:
# One-column DataFrame of the vocabulary pieces, in id order (the labels for the embedding rows).
df2 = pd.DataFrame(itos)

In [49]:
# Preview the first few labels.
df2.head()

Unnamed: 0,0
0,xxunk
1,xxbos
2,xxeos
3,xxpad
4,xxfld


In [50]:
# Expect one label row per vocabulary entry.
df2.shape

(15000, 1)

In [51]:
# Write the labels, one per line, in the same row order as the embedding vectors file.
df2.to_csv('ulmfit_embeddings_metadata.tsv', sep='\t', index=False, header=False)

In [52]:
# Spot-check: embedding vector for vocab index 1, to compare against row 1 of the exported DataFrame.
encoder.state_dict()['encoder.weight'][1]

tensor([-1.1932e-01, -4.6619e-02, -1.0687e-01,  1.5268e-01,  2.1168e-01,
         4.0292e-01,  2.6448e-01,  1.8865e-01,  2.4887e-01, -2.2151e-02,
         2.0020e-01, -4.4919e-01, -7.2489e-01,  1.1016e-01, -2.6833e-01,
        -4.9403e-02, -1.1349e-01, -4.5694e-01, -3.9021e-01,  7.0770e-02,
        -1.7506e-01, -3.8086e-01,  4.8634e-01, -2.1282e-01,  9.2620e-02,
         3.6558e-01,  3.8753e-01,  2.8944e-01,  2.4780e-01, -2.4298e-01,
        -2.0763e-01,  3.5341e-01, -6.1301e-01, -3.8906e-01, -2.7930e-01,
        -4.4005e-01, -4.8606e-01, -4.5191e-01,  3.0609e-01, -1.6278e-01,
         2.4819e-01, -2.4312e-01,  3.3502e-01, -3.6213e-01,  1.8861e-01,
        -3.3217e-01, -2.2812e-01, -4.3693e-01, -2.0935e-01,  1.6422e-01,
         4.7515e-01, -3.9716e-01,  2.5674e-01, -2.4196e-01, -2.1071e-01,
        -4.8333e-01,  1.7961e-01,  3.6507e-01,  3.8374e-01,  2.8003e-01,
         2.8068e-01, -8.3660e-02, -1.9428e-01,  3.1938e-01, -3.4581e-02,
         2.5673e-01, -1.9477e-01,  1.5686e-01,  2.7