In [1]:
from fastai.text import *
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb

In [2]:
import fastai, torch
fastai.__version__ , torch.__version__

('1.0.57', '1.0.0')

In [3]:
torch.cuda.set_device(0)

In [4]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [5]:
random_seed(42, True)

In [6]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enma/classification


In [7]:
path = Path('./')

In [8]:
df_train = pd.read_csv(path/'../hasoc_task_1/ml-Hasoc-offensive-train.csv', sep='\t', header=None)
df_train.head()

Unnamed: 0,0,1
0,Not_offensive,Ithu ikkayude script aanu.. Uttopiya
1,Not_offensive,Varunnathu manthriyo bhadano alla rajavanu ra...
2,Not_offensive,ഇഷ്ടമാണ്. But ചില സിനിമയിൽ over actigalle എന്...
3,Not_offensive,enna look a rajuettaaaaa............ promisin...
4,Offensive,ഇത് മൂഞ്ചും ഉറപ്പ് എന്ത് ഊള ട്രൈേലർ


In [9]:
df_valid = pd.read_csv(path/'../hasoc_task_1/ml-Hasoc-offensive-dev.csv', sep='\t', header=None)
df_valid.head()

Unnamed: 0,0,1
0,Offensive,നല്ല ഊമ്പിയ bgm ടു ട്ടു ടു ട്ടൂ...
1,Offensive,Lucifer njngal randum kayyum neeti sweekarich...
2,Not_offensive,Evideo oru Hollywood story varunnilleee. Oru ...
3,Not_offensive,ithre ullo mattavanmarude power 🤣🤣🤣 dislike d...
4,Not_offensive,Prathi poovan kozhi teaser kandittu


In [10]:
df_test = pd.read_csv(path/'../hasoc_task_1/ml_mixedscript_Hascoc_offensive_test_without_label.csv', header=None)
df_test.head()

Unnamed: 0,0,1
0,ml_1,Theatoril climax maathram kaanichal mathiyallo...
1,ml_2,Shah Rukh Khan inte FAN cinema de cheriya samy...
2,ml_3,Heavy Stills onnum oru rekshem illa adipoli fd...
3,ml_4,Eee trailer njan ethra pravishyam nokiyann eni...
4,ml_5,Ikka ethu engane sathikunu enna oru mass I lov...


In [11]:
df_train.shape, df_valid.shape, df_test.shape

((3200, 2), (400, 2), (400, 2))

In [12]:
Counter(df_train[0])

Counter({'Not_offensive ': 2633, 'Offensive': 567})

In [13]:
Counter(df_valid[0])

Counter({'Not_offensive ': 328, 'Offensive': 72})

In [14]:
df_train = pd.concat([df_train, df_valid])
df_train.shape

(3600, 2)

In [15]:
label_cols = [0]

In [16]:
def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

def lower_case_everything(t: str) -> str:
    return t.lower()

In [17]:
class CodeMixedMalayalamTokenizer(BaseTokenizer):
    def __init__(self, lang:str):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(path/"../tokenizer/mlen_spm.model"))
        
    def tokenizer(self, t:str) -> List[str]:
        return self.sp.EncodeAsPieces(t)

In [18]:
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../tokenizer/mlen_spm.model"))
itos = [sp.IdToPiece(int(i)) for i in range(25000)]

In [19]:
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 ',',
 'ൽ',
 '▁the',
 'ർ',
 '▁',
 'ൻ',
 's',
 '▁•',
 '▁of',
 'ൾ']

In [20]:
# 25,000 is the vocab size that we chose in sentencepiece
mlen_vocab = Vocab(itos)

In [21]:
tokenizer = Tokenizer(lang='mlen', tok_func=CodeMixedMalayalamTokenizer)

In [22]:
tokenizer.pre_rules.append(lower_case_everything)
tokenizer.pre_rules.append(handle_all_caps)
tokenizer.pre_rules.append(handle_upper_case_first_letter)

In [23]:
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html>,
  <function fastai.text.transform.replace_rep>,
  <function fastai.text.transform.replace_wrep>,
  <function fastai.text.transform.spec_add_spaces>,
  <function fastai.text.transform.rm_useless_spaces>,
  <function __main__.lower_case_everything>,
  <function __main__.handle_all_caps>,
  <function __main__.handle_upper_case_first_letter>],
 [<function fastai.text.transform.replace_all_caps>,
  <function fastai.text.transform.deal_caps>])

In [24]:
tokens = tokenizer.process_all(['Tell me about TOUR self, mujhe jaanna hai'])
''.join(tokens[0])

'▁tell▁me▁about▁tour▁self,▁mujhe▁jaanna▁hai'

In [25]:
data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=mlen_vocab)

In [26]:
data_lm.show_batch()

idx,text
0,▁a ▁ xxrep ▁12 ▁. ▁pro mis ing ▁tra il or ▁x x bo s ▁ഇത് ▁മൂ ഞ്ച ും ▁ഉറപ്പ ് ▁എന്ത ് ▁ഊ ള ▁ട്രൈ േ ല ർ ▁x x bo s ▁i k kha ▁me s s ▁a anu ▁ xxunk ▁mam ook kha ▁ xxunk ▁d q ▁x x bo s ▁സു രാജ േ ട്ട ന് ▁കോമഡി ▁മാത്രമല്ല ▁സ ീരിയ സ് ▁റോ ള ുകളും ▁വ ഴ ങ്ങ
1,bo s ▁1 ▁divas am ▁3 l ▁ku du tal ▁tavan e ▁kan una var ▁a ro kke ▁und ▁x x bo s ▁ni na kki ni ▁malayalam ▁in d ru sti yil ▁ki da nn ▁po la kka an ▁pattu menn ▁tho nn unnund o ▁bo sse ▁enna ▁chodya th inu ▁oru ▁a da ar ▁maru padi ▁pra the ek shichu ▁x x bo s ▁tha rik ida ▁sab
2,▁kanda ▁malaya ly ▁chu nk s ▁ud enkil ▁ xxrep ▁10 ▁. ▁adi ▁oru ▁like ▁x x bo s ▁adi poli ▁mass ▁super b ▁ki du ▁mammootty ▁poli ch ▁ xxrep ▁4 ▁ xxunk ▁mass ▁a ay itund ▁trai ler ▁pa kka ▁poli ▁x x bo s ▁i thu ▁raja yu de ▁3 ▁strong ▁all a ▁et tan de ▁lu ci fer ▁x x bo s ▁ചേ ട്ട ന്മാര േ
3,▁ xxrep ▁6 ▁. ▁x x bo s ▁kala kki ▁i thu ▁box ▁of i ce ▁a aghosha m ▁a ayirik kum ▁x x bo s ▁എത്ര ▁കണ്ട ിട്ടും ▁മതി യാവ ാത്ത ▁ഒരു ▁അ ▁ടാ ർ ▁it em ▁x x bo s ▁b g m ▁uru ▁raksha ▁ill ▁ xxrep ▁4 ▁a ▁ xxrep ▁4 ▁. ▁su shi n ▁shyam ▁touch ▁pole . ▁x x bo s ▁de c ▁12
4,▁x x bo s ▁5 ▁million ▁a yo ▁ne ▁no kkan ▁vann avar ▁e thra ▁paru ▁n de ▁x x bo s ▁ചാനല ിന്റെ ▁പേര് ▁ശ്രദ്ധ ിച്ച വർ ക്കു ▁ലൈ ക്ക് ▁അടി ക്കാനുള്ള ▁സ്ഥലം . . . ▁x x bo s ▁enik um ▁ente ▁family ▁k kum ▁ishtappett illa . . . ▁dialogue s ▁onnu m ▁prop er ▁a ayi ▁mana s ila ayi lla . . ▁ njan ▁prakasha


In [27]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.3, pretrained=False)

In [28]:
# Loading the pretrained language model on hindi wikipedia
learn.load('../../dataset_preparation/models/best_model', with_opt=True)

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (3600 items)
x: LMTextList
▁x x bo s ▁i thu ▁ ikkayu de ▁script ▁a anu . . ▁utt op iya,▁x x bo s ▁varunnat hu ▁man th ri yo ▁bha da no ▁all a ▁raja vanu ▁raja vu,▁x x bo s ▁ഇഷ്ട മാണ് . ▁but ▁ചില ▁സിനിമയിൽ ▁over ▁act i gal le ▁എന്ന് ▁തോന്ന ും . .,▁x x bo s ▁enna ▁look ▁a ▁raj u e tt ▁ xxrep ▁5 ▁a ▁ xxrep ▁12 ▁. ▁pro mis ing ▁tra il or,▁x x bo s ▁ഇത് ▁മൂ ഞ്ച ും ▁ഉറപ്പ ് ▁എന്ത ് ▁ഊ ള ▁ട്രൈ േ ല ർ
y: LMLabelList
,,,,
Path: .;

Valid: LabelList (400 items)
x: LMTextList
▁x x bo s ▁നല്ല ▁ഊ മ്പി യ ▁b g m ▁ടു ▁ ട്ടു ▁ടു ▁ ട്ട ൂ . . .,▁x x bo s ▁lu ci fer ▁ nj ngal ▁randu m ▁kay yu m ▁ne eti ▁swe e kar ichu . . ▁marichu . . ▁math am ▁parayunn a ▁sang i kal ▁ke ri ▁dis like ▁adukk uva aa . . . ▁oru ▁my rum ▁nada kkulla ▁ xxrep ▁5 ▁.,▁x x bo s ▁e vid eo ▁oru ▁hollywood ▁story ▁varunn ille e e . ▁oru ▁d b t .,▁x x bo s ▁i th re ▁ull o ▁mat tavan maru de ▁power ▁ xxunk ▁dis like ▁dis like,▁x x bo s ▁pra thi ▁poo van ▁ko zhi ▁tea s er ▁kand ittu

In [29]:
# Fine tuning the prtrained LM on current dataset

In [30]:
learn.lr_find()

epoch,train_loss,valid_loss,accuracy,time
0,6.475304,#na#,00:02,
1,6.444173,#na#,00:02,
2,6.067534,#na#,00:02,
3,5.764607,#na#,00:02,


LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


In [31]:
learn.recorder.plot()

In [32]:
learn.freeze()

In [33]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,5.785672,5.180454,0.227307,00:01


In [34]:
learn.save('fit_head', with_opt=True)

In [35]:
learn.load('fit_head', with_opt=True);

In [36]:
learn.unfreeze()

In [37]:
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.175614,4.917291,0.246577,00:02
1,4.909992,4.443356,0.294196,00:02
2,4.650443,4.187945,0.319345,00:02
3,4.44993,4.070615,0.33006,00:02
4,4.321851,4.05244,0.331548,00:02


In [38]:
learn.save('fine_tuned', with_opt=True)

In [39]:
learn.load('fine_tuned', with_opt=True);

In [40]:
learn.predict('Evideo oru Hollywood story',n_words=10)

'Evideo oru Hollywood story w ill ▁negative ▁ ▁. ▁oru ▁avatara ▁pirann'

In [41]:
learn.save_encoder('fine_tuned_enc')

In [42]:
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=mlen_vocab, bs=16)

In [43]:
data_clas.show_batch()

text,target
▁x x bo s ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man ju ▁war ri er ▁man,Not_offensive
▁x x bo s ▁നാ ണ മ ുണ്ടോ ടാ ▁ചെ റ്റ കളെ ▁ഇ മ്മാ തിരി ▁ഊ ള ▁പരിപാടി ▁കാണിക്ക ാൻ ▁ഒരു ▁ഊ ള ▁ട്രൈ ലെ ർ ▁അത് ▁നി ങ്ങൾക്കു തന്നെ ▁അറിയ ാം ▁എന്ന ിട്ടും ▁. ▁ഈ ▁പട മ ൊക്കെ ▁ഇറങ്ങ ുന്നത് ▁തന്നെ ▁ആർക്കും ▁അറിയ ില്ല ▁. ▁തീ യേറ്റ റിൽ ▁എത്തിയ ാൽ ▁ആളുകൾ ▁എടുത്ത ിട്ട ല ക്കും ▁അതു കഴിഞ്ഞ ാണ് ▁ആ മിന താ ത്താ ന്റെ ▁വരവ ് ▁ചരിത്രം ▁എങ്ങനെ ▁കാണിക്ക ുമെന്ന്,Offensive
▁x x bo s ▁i th inum ▁matra m ▁nall a ▁review ▁ki tta a an ▁en thu ▁the nga yaa ▁e e ▁pada thi lulla the nn ▁enik ▁maa tra ano ▁tho nn iya th . . ▁aake ▁rasa ma ayi tt ▁tho nn iya th ▁2 . . . ▁3 ▁scene s ▁matra m . . ▁oru ▁50 ▁percent ▁dia log ▁onnu m ▁enik ▁manassil a ayi lla,Offensive
▁x x bo s ▁i th inum ▁matra m ▁nall a ▁review ▁ki tta a an ▁en thu ▁the nga yaa ▁e e ▁pada thi lulla the nn ▁enik ▁maa tra ano ▁tho nn iya th . . ▁aake ▁rasa ma ayi tt ▁tho nn iya th ▁2 . . . ▁3 ▁scene s ▁matra m . . ▁oru ▁50 ▁percent ▁dia log ▁onnu m ▁enik ▁manassil a ayi lla,Offensive
▁x x bo s ▁ xxunk p rit vi ▁recent ▁interview ▁story ▁tre ad ▁par j irunnu ▁e th nd u ▁main ▁katha ▁th nn ea ▁app ol ▁vi ch rich tha ▁e ger u th nn ea ▁all ea ▁producer ▁ennu ▁enni tum ▁e ntha ▁e thu ▁ok ▁par nju ▁pada the ▁kollu n thu ▁ennu ▁e e ▁ci mai l ▁ ulla ▁age ru da ▁con fi d,Not_offensive


In [44]:
data_clas.sanity_check()

In [45]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5)

In [46]:
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3600 items)
x: TextList
▁x x bo s ▁i thu ▁ ikkayu de ▁script ▁a anu . . ▁utt op iya,▁x x bo s ▁varunnat hu ▁man th ri yo ▁bha da no ▁all a ▁raja vanu ▁raja vu,▁x x bo s ▁ഇഷ്ട മാണ് . ▁but ▁ചില ▁സിനിമയിൽ ▁over ▁act i gal le ▁എന്ന് ▁തോന്ന ും . .,▁x x bo s ▁enna ▁look ▁a ▁raj u e tt ▁ xxrep ▁5 ▁a ▁ xxrep ▁12 ▁. ▁pro mis ing ▁tra il or,▁x x bo s ▁ഇത് ▁മൂ ഞ്ച ും ▁ഉറപ്പ ് ▁എന്ത ് ▁ഊ ള ▁ട്രൈ േ ല ർ
y: CategoryList
Not_offensive ,Not_offensive ,Not_offensive ,Not_offensive ,Offensive
Path: .;

Valid: LabelList (400 items)
x: TextList
▁x x bo s ▁നല്ല ▁ഊ മ്പി യ ▁b g m ▁ടു ▁ ട്ടു ▁ടു ▁ ട്ട ൂ . . .,▁x x bo s ▁lu ci fer ▁ nj ngal ▁randu m ▁kay yu m ▁ne eti ▁swe e kar ichu . . ▁marichu . . ▁math am ▁parayunn a ▁sang i kal ▁ke ri ▁dis like ▁adukk uva aa . . . ▁oru ▁my rum ▁nada kkulla ▁ xxrep ▁5 ▁.,▁x x bo s ▁e vid eo ▁oru ▁hollywood ▁story ▁varunn ille e e . ▁oru ▁d b t .,▁x x bo s ▁i th re ▁ull o ▁mat tavan maru de ▁power ▁ xxunk ▁dis like ▁dis li

In [47]:
learn.freeze()

In [48]:
learn.loss_func.func

CrossEntropyLoss()

In [49]:
mcc = MatthewsCorreff()

In [50]:
learn.metrics = [mcc, accuracy]

In [51]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.417806,0.334152,0.429945,0.86,00:02


In [52]:
learn.save('first-full')

In [53]:
learn.load('first-full');

In [54]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.321491,0.223157,0.687972,0.915,00:03


In [55]:
learn.save('second-full')

In [56]:
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.246178,0.178636,0.776569,0.9375,00:05
1,0.213831,0.085446,0.922528,0.9775,00:05
2,0.150778,0.04723,0.965953,0.99,00:05
3,0.11208,0.030141,0.974499,0.9925,00:05
4,0.080028,0.02811,0.983021,0.995,00:05


Better model found at epoch 0 with accuracy value: 0.9375.
Better model found at epoch 1 with accuracy value: 0.9775000214576721.
Better model found at epoch 2 with accuracy value: 0.9900000095367432.
Better model found at epoch 3 with accuracy value: 0.9925000071525574.
Better model found at epoch 4 with accuracy value: 0.9950000047683716.


In [57]:
learn.load('final')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3600 items)
x: TextList
▁x x bo s ▁i thu ▁ ikkayu de ▁script ▁a anu . . ▁utt op iya,▁x x bo s ▁varunnat hu ▁man th ri yo ▁bha da no ▁all a ▁raja vanu ▁raja vu,▁x x bo s ▁ഇഷ്ട മാണ് . ▁but ▁ചില ▁സിനിമയിൽ ▁over ▁act i gal le ▁എന്ന് ▁തോന്ന ും . .,▁x x bo s ▁enna ▁look ▁a ▁raj u e tt ▁ xxrep ▁5 ▁a ▁ xxrep ▁12 ▁. ▁pro mis ing ▁tra il or,▁x x bo s ▁ഇത് ▁മൂ ഞ്ച ും ▁ഉറപ്പ ് ▁എന്ത ് ▁ഊ ള ▁ട്രൈ േ ല ർ
y: CategoryList
Not_offensive ,Not_offensive ,Not_offensive ,Not_offensive ,Offensive
Path: .;

Valid: LabelList (400 items)
x: TextList
▁x x bo s ▁നല്ല ▁ഊ മ്പി യ ▁b g m ▁ടു ▁ ട്ടു ▁ടു ▁ ട്ട ൂ . . .,▁x x bo s ▁lu ci fer ▁ nj ngal ▁randu m ▁kay yu m ▁ne eti ▁swe e kar ichu . . ▁marichu . . ▁math am ▁parayunn a ▁sang i kal ▁ke ri ▁dis like ▁adukk uva aa . . . ▁oru ▁my rum ▁nada kkulla ▁ xxrep ▁5 ▁.,▁x x bo s ▁e vid eo ▁oru ▁hollywood ▁story ▁varunn ille e e . ▁oru ▁d b t .,▁x x bo s ▁i th re ▁ull o ▁mat tavan maru de ▁power ▁ xxunk ▁dis like ▁dis li

In [58]:
df_test = df_valid.copy()
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'query': list(df_test[1]), 'actual_label': list(df_test[0]), 'predicted_label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train[0]))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['predicted_label'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,query,actual_label,predicted_label,Offensive,Not_offensive
0,നല്ല ഊമ്പിയ bgm ടു ട്ടു ടു ട്ടൂ...,Offensive,Offensive,0.999782,0.000217882
1,Lucifer njngal randum kayyum neeti sweekarich...,Offensive,Offensive,0.982383,0.0176174
2,Evideo oru Hollywood story varunnilleee. Oru ...,Not_offensive,Not_offensive,0.0211621,0.978838
3,ithre ullo mattavanmarude power 🤣🤣🤣 dislike d...,Not_offensive,Not_offensive,0.0543507,0.945649
4,Prathi poovan kozhi teaser kandittu,Not_offensive,Not_offensive,0.00234966,0.99765


In [59]:
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

0.995

In [60]:
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

0.9830208371799483

In [61]:
f1_score(df_result['actual_label'], df_result['predicted_label'], labels=['Not_offensive ', 'Offensive'], pos_label='Offensive')

0.9859154929577464

In [62]:
df_test = pd.read_csv(path/'../hasoc_task_1/ml_mixedscript_Hascoc_offensive_test_without_label.csv', header=None)
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'id': list(df_test[0]), 'text': list(df_test[1]), 'label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train[0]))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['label'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,id,text,label,Offensive,Not_offensive
0,ml_1,Theatoril climax maathram kaanichal mathiyallo...,Not_offensive,0.000304782,0.999695
1,ml_2,Shah Rukh Khan inte FAN cinema de cheriya samy...,Not_offensive,0.0220021,0.977998
2,ml_3,Heavy Stills onnum oru rekshem illa adipoli fd...,Not_offensive,0.000517501,0.999483
3,ml_4,Eee trailer njan ethra pravishyam nokiyann eni...,Not_offensive,0.0072398,0.99276
4,ml_5,Ikka ethu engane sathikunu enna oru mass I lov...,Not_offensive,5.54634e-05,0.999945


In [63]:
df_result[df_result['label']=='Offensive'].shape

(64, 5)

In [64]:
df_result.to_csv('test_res_2nd.csv', index=False)

In [65]:
learn.predict('Dislike adikunna mammunikale')

(Category Offensive, tensor(1), tensor([4.1480e-04, 9.9959e-01]))