# Set ups

In [1]:
from fastai.text.all import Path, DataBlock, TextBlock, ColReader, RandomSplitter, \
                                    CategoryBlock,language_model_learner, AWD_LSTM, text_classifier_learner,valley, slide
import pandas as pd
import pickle
import torch
torch.manual_seed(0)

<torch._C.Generator at 0x7f970e70fb50>

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
path = Path("/content/gdrive/MyDrive/sent")
path.mkdir(parents=True, exist_ok=True)

# Reading raw data and saving the final table

In [None]:
# Raw comments
df = pd.read_excel("/content/gdrive/MyDrive/sentipers.xlsx")
df.head()

Unnamed: 0,index,sid,text,polarity,file
0,0,rev-1,اینک قصد داریم پرینتر دیگری از پرینترهای لیزری کمپانی Hp را معرفی کنیم.,0,data/main/HP LaserJet M1132.xml
1,1,rev-2,پرینتری چند کاره از رده‌ی Entry Level یا سطح مبتدی.,0,data/main/HP LaserJet M1132.xml
2,2,rev-3,به هر صورت اکنون ما در دنیایی زندگی می‌کنیم، که کاربران پرینترها انتظارات بالاتری علاوه بر گرفتن پرینت ساده از دستگاه خود دارند.,0,data/main/HP LaserJet M1132.xml
3,3,rev-4,به صورتی که توانایی کپی کردن، اسکن، فکس، پرینت عکس، پرینت دورو، قابلیت اتصال از طریق Bluetooth و WiFi را نیز باید داشته باشد.,0,data/main/HP LaserJet M1132.xml
4,4,rev-5,به هر صورت معمولا چیزی که بیشتر کاربران از پرینتری پر کار در این سطح قیمت برای خانه و یا دفتر کار انتظار دارند، تولید پرینت های با کیفیت بالا، با سرعت زیاد و البته هزینه‌ی نگهداری پایین است.,2,data/main/HP LaserJet M1132.xml


In [None]:
def id(x):
   """
   Tagging comments based on polarity column.
   positive = 2
   negative = 0
   neutral = 1
   """
   if x > 0:
     r = 2
   elif x < 0:
     r = 0
   else: 
     r = 1
   return r
   
df['target'] = df.polarity.apply(lambda x: id(x))

In [None]:
df_f = df[['text', 'target']] # Clean the dataset
with open(path/'df', 'wb') as f: pickle.dump(df_f, f) # Save the cleaned dataset

# training a language model

In [4]:
def loader(df_path, is_lm= True, back=False, txt_c="text",target="target", 
           vocab_dir = None,
           seq_len=72, bs=64, sp_pr=0.01):
    if is_lm:
        # Dataloder for language model
        if (df_path.parent/f'dss_{back}.pkl').exists(): 
            res = torch.load(df_path.parent/f'dss_{back}.pkl').dataloaders(bs=bs)
        else:
            df = pickle.load(open(df_path,'rb'))
            dss = DataBlock(TextBlock.from_df(txt_c, is_lm=is_lm, 
                                              seq_len=seq_len, backwards=back)
                  ,get_items=ColReader(txt_c)
                  ,splitter= RandomSplitter(sp_pr)
                  ).datasets(df)
            torch.save(dss, df_path.parent/f"dss_{back}.pkl")
            res = dss.dataloaders(bs = bs)
        
    else:
        # Dataloder for sentiment analysis
        if vocab_dir is None:
            vocab = pickle.load(open(df_path.parent/'vocab_sen','rb'))
        else: vocab = pickle.load(open(vocab_dir,"rb"))
        df = pickle.load(open(df_path,'rb'))
        res = DataBlock(blocks=[TextBlock.from_df(txt_c,
                                                  vocab=vocab,
                                                  seq_len=seq_len,
                                                  is_lm=False, backwards=back),
                          CategoryBlock], 
                  get_x=ColReader(txt_c), 
                  get_y=ColReader(target),
                  splitter= RandomSplitter(sp_pr)).dataloaders(df, bs=bs)

    return res

dls = loader(path/'df',bs=128)

In [5]:
# Pre-trained model on wikipedia and it's list of vocabularies
fname = ["/content/gdrive/MyDrive/fa_func19_w.pth",
          "/content/gdrive/MyDrive/vocab_fa_func19"]

def lm(dls ,fname, is_lm=True, mal=0.1,encoder_dir=None):
    if is_lm:
        # language model learner
        lm= language_model_learner(dls, AWD_LSTM, 
                                  pretrained = False,
                                  drop_mult=mal, 
                                  metrics=[accuracy, perplexity]).to_fp16()

        lm.load_pretrained(*fname)
    else:
        # load pre-trained language model encoder
        encoder_dir = (path/'encoder') if encoder_dir is None else encoder_dir
        # text classifier learner
        lm = text_classifier_learner(dls_m, AWD_LSTM, 
                                  pretrained = False,
                                  drop_mult=mal, 
                                  metrics=[accuracy, perplexity]).to_fp16()
    return lm



# Fine tuen model

In [None]:
model = lm(dls, fname,mal=0.6)
model.freeze_to(-1)
model.fit_one_cycle(1, 1e-1,moms=(0.8,0.7,0.8))
model.unfreeze()
model.fit_one_cycle(8, slice(5e-3, 11e-2),moms=(0.8,0.7,0.8))

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.984418,4.120013,0.259766,61.560059,00:28
1,3.942671,4.611005,0.22433,100.58522,00:27
2,3.815038,4.103494,0.275949,60.551495,00:28
3,3.523463,3.910339,0.296596,49.915874,00:30
4,3.183938,3.843918,0.328404,46.708099,00:27
5,2.81499,3.791279,0.342634,44.313053,00:28
6,2.452304,3.797991,0.353237,44.611481,00:30
7,2.193605,3.821512,0.357422,45.673237,00:28


# save language model

In [None]:
def save(dls, model,path=path):
    if (path/'vocab_sen00').exists(): (path/'vocab_sen00').unlink()
    if (path/'encoder_f_00').exists(): (path/'encoder_f00').unlink()
    with open(path/'vocab_sen_00','wb') as v : pickle.dump(dls.vocab, v)
    model.save_encoder(path/'encoder_00')

save(dls, model)

# Train classifier

In [253]:
dls_m = loader(path/'df',is_lm=False,bs=64)
m = lm(dls_m, None,False,mal=0.3,encoder_dir="/content/gdrive/MyDrive/sent/encoder.pth")
m.freeze_to(-1)
m.fit_one_cycle(2,4e-2,moms=(0.8,0.7,0.8))
m.unfreeze()
m.fit_one_cycle(8, slice(6e-5, 4e-2), moms=(0.8,0.7, 0.8),wd=0.2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,0.89458,1.894804,0.538462,6.651241,00:19
1,0.804955,8.056778,0.410256,3155.10791,00:19
2,0.724384,1.872004,0.634615,6.501309,00:19
3,0.642418,1.021325,0.660256,2.77687,00:18
4,0.588705,1.474695,0.685897,4.369703,00:19
5,0.544844,1.515551,0.730769,4.551929,00:18
6,0.506347,2.47421,0.794872,11.872322,00:18
7,0.480327,1.29456,0.788462,3.649389,00:18


In [255]:
m.export()