In [1]:
from fastai.text import *
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb

In [2]:
import fastai, torch
fastai.__version__ , torch.__version__

('1.0.57', '1.0.0')

In [3]:
torch.cuda.set_device(0)

In [4]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [5]:
random_seed(42, True)

In [6]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enma/classification_task_1


In [7]:
path = Path('./')

In [8]:
df_train = pd.read_csv(path/'../dc_fire/malayalam_train.tsv', sep='\t')
df_train.head()

Unnamed: 0,text,category
0,hoo mammokka police vesham aaha anthas,Positive
1,Oru rekshayum illa...kidilam kannu nananjupoyi,Positive
2,Ikka waiting.........,Positive
3,Raju Ettante Oro Shorttum Ijathi ppwli,Positive
4,Ettan fansil netti poya aarenkilum undo? #...,Positive


In [9]:
df_valid = pd.read_csv(path/'../dc_fire/malayalam_dev.tsv', sep='\t')
df_valid.head()

Unnamed: 0,text,category
0,speechless 🤐. ikkaaa,not-malayalam
1,Raja sollunathu mattuthaam seyyvaa seyyunnath...,not-malayalam
2,Im Prithiviraj fan from tamilnadu... Love it,not-malayalam
3,mohanlal sir - look ..... kiddo...,Positive
4,Kandathil vech mungiya pdam Rating 1.1/5,Negative


In [10]:
df_test = pd.read_csv(path/'../dc_fire/malayalam_test.tsv', sep='\t')
df_test.head()

Unnamed: 0,id,text
0,ml_sen_1,Bollywood film Newton inte remake aano?
1,ml_sen_2,endukond viewrs koodunnilla ?? ippozhum 2.8m a...
2,ml_sen_3,Mara paazhu mega mairananil ninnum ethil koodu...
3,ml_sen_4,Video nay cang xem cang thit
4,ml_sen_5,Sunny chechiye kaanan vannathu njan maathram aano


In [11]:
df_train.shape, df_valid.shape, df_test.shape

((4851, 2), (540, 2), (1348, 2))

In [12]:
Counter(df_train['category'])

Counter({'Mixed_feelings ': 289,
         'Negative ': 549,
         'Positive ': 2022,
         'not-malayalam ': 647,
         'unknown_state ': 1344})

In [13]:
Counter(df_valid['category'])

Counter({'Mixed_feelings ': 44,
         'Negative ': 51,
         'Positive ': 224,
         'not-malayalam ': 60,
         'unknown_state ': 161})

In [14]:
df_train = pd.concat([df_train, df_valid])
df_train.shape

(5391, 2)

In [15]:
label_cols = ['category']
text_cols = ['text']

In [16]:
def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

def lower_case_everything(t: str) -> str:
    return t.lower()

In [17]:
class CodeMixedMalayalamTokenizer(BaseTokenizer):
    def __init__(self, lang:str):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(path/"../tokenizer/mlen_spm.model"))
        
    def tokenizer(self, t:str) -> List[str]:
        return self.sp.EncodeAsPieces(t)

In [18]:
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../tokenizer/mlen_spm.model"))
itos = [sp.IdToPiece(int(i)) for i in range(25000)]

In [19]:
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 ',',
 'ൽ',
 '▁the',
 'ർ',
 '▁',
 'ൻ',
 's',
 '▁•',
 '▁of',
 'ൾ']

In [20]:
# 25,000 is the vocab size that we chose in sentencepiece
mlen_vocab = Vocab(itos)

In [21]:
tokenizer = Tokenizer(lang='mlen', tok_func=CodeMixedMalayalamTokenizer)

In [22]:
tokenizer.pre_rules.append(lower_case_everything)
tokenizer.pre_rules.append(handle_all_caps)
tokenizer.pre_rules.append(handle_upper_case_first_letter)

In [23]:
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html>,
  <function fastai.text.transform.replace_rep>,
  <function fastai.text.transform.replace_wrep>,
  <function fastai.text.transform.spec_add_spaces>,
  <function fastai.text.transform.rm_useless_spaces>,
  <function __main__.lower_case_everything>,
  <function __main__.handle_all_caps>,
  <function __main__.handle_upper_case_first_letter>],
 [<function fastai.text.transform.replace_all_caps>,
  <function fastai.text.transform.deal_caps>])

In [24]:
tokens = tokenizer.process_all(['Tell me about TOUR self, mujhe jaanna hai'])
''.join(tokens[0])

'▁tell▁me▁about▁tour▁self,▁mujhe▁jaanna▁hai'

In [25]:
data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=mlen_vocab, label_cols=label_cols, text_cols=text_cols)

In [26]:
data_lm.show_batch()

idx,text
0,x bo s ▁et tan ▁fan sil ▁ne tti ▁po ya ▁a ar enkil um ▁und o xxunk ▁ xxunk ▁mad ura raja ▁ xxunk ▁wait ing ▁ xxunk ▁x x bo s ▁wait ing ▁to ▁see ▁mam mu kaa s ▁unda a ▁ xxunk ▁x x bo s ▁last ▁aa ▁cha di ▁adi ▁ xxrep ▁4 ▁. ▁u ff . ▁fr m ▁an ▁et tan ▁fan ▁x x bo
1,▁chol lan ▁a van ▁varu nu u . . . ▁ xxunk ▁lu ci fer ▁x x bo s ▁positive ▁ki ttiya l ▁pinne ▁para ya nda llo ▁ xxrep ▁4 ▁. ▁et tan ▁fan ▁aa ya ▁ njan ▁than me ▁kannu ▁tha lli ▁poi ▁book ing s ▁kand u ▁x x bo s ▁annu m ▁ennu m ▁ennu m ▁i kka ▁mass ▁ xxunk ▁x x bo s ▁ver
2,▁6000 ▁su scrib e ▁ ulla dhi l ▁it t ▁22 k ▁su scrib e 4 14 k view ▁ 91 k ▁like ▁ xxrep ▁4 ▁a ▁ki yya ▁i kka ▁mass ▁x x bo s ▁ki du ▁b g m ▁for ▁i kka ▁ki du ▁movie ▁super ▁x x bo s ▁mam mo o kka aa ▁ xxrep ▁4 ▁. ▁ni gha l ▁mu tha anu ▁ xxrep ▁4
3,x bo s ▁la le ttan ▁marana ▁mass . . in tha ▁movie ▁ba yang ara ▁hit ▁adi kku m . . . lo ve ▁from ▁tamilnadu ▁x x bo s ▁oru ▁mass ▁political - ▁family ▁block bu ster ▁hit ▁aa ka tte yenn ▁aa sham si kkunnu ▁x x bo s ▁tra il or ▁kand ▁8 ▁nila yil ▁pot tum ▁enn ▁tho n unnu ▁mu mba the ▁pole ▁tho
4,▁chari th ram ▁aka nam enkil ▁mara kkar ▁vara nam ▁x x bo s ▁prithviraj . ▁ni gha l ▁oru ▁marana ▁mass ▁samba vam ▁x x bo s ▁kerala ▁die ▁hard ▁mohanlal ▁fan s ▁like ▁here ▁x x bo s ▁da ▁ mw one ▁madhura ja ed at re um ▁vari lla ▁ke tto ▁. nte ▁va ka ▁ella r kum ▁chu ka va nda nam ▁x x bo s


In [27]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.3, pretrained=False)

In [28]:
learn.load('../../dataset_preparation/models/best_model', with_opt=True)

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (5391 items)
x: LMTextList
▁x x bo s ▁ho o ▁mam mo kka ▁police ▁vesha m ▁aa ha ▁ant has,▁x x bo s ▁oru ▁re k shay um ▁i lla . . . ki di lam ▁kannu ▁na na nju poy i,▁x x bo s ▁i kka ▁wait ing ▁ xxrep ▁9 ▁.,▁x x bo s ▁raj u ▁etta nte ▁oro ▁short tum ▁i ja thi ▁ pp w li,▁x x bo s ▁et tan ▁fan sil ▁ne tti ▁po ya ▁a ar enkil um ▁und o xxunk ▁ xxunk ▁mad ura raja ▁ xxunk ▁wait ing ▁ xxunk
y: LMLabelList
,,,,
Path: .;

Valid: LabelList (540 items)
x: LMTextList
▁x x bo s ▁speech less ▁ xxunk . ▁i kka aa,▁x x bo s ▁raja ▁so llu nath u ▁mat tu tha am ▁se y y va a ▁se y yu n nath ▁mat tum ▁tha a ▁so l va a,▁x x bo s ▁im ▁pri thi vi raj ▁fan ▁from ▁tamilnadu . . . ▁love ▁it,▁x x bo s ▁mohanlal ▁sir ▁- ▁look ▁ xxrep ▁5 ▁. ▁ki d do . . .,▁x x bo s ▁kanda thil ▁ve ch ▁mu ng iya ▁p dam ▁ra ting ▁1 .1 ▁/ ▁5
y: LMLabelList
,,,,
Path: .;

Test: LabelList (1348 items)
x: LMTextList
▁x x bo s ▁bollywood ▁film ▁new ton ▁inte ▁re ma ke ▁aan o xxunk,▁x 

In [29]:
learn.freeze()

In [30]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,5.645034,5.009266,0.243973,00:02


In [31]:
learn.save('fit_head', with_opt=True)

In [32]:
learn.load('fit_head', with_opt=True);

In [33]:
learn.unfreeze()

In [34]:
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,4.950925,4.676664,0.272991,00:03
1,4.659377,4.209945,0.324107,00:03
2,4.402082,3.995071,0.346577,00:03
3,4.225903,3.907428,0.35558,00:03
4,4.105953,3.893269,0.356399,00:03


In [35]:
learn.save('fine_tuned', with_opt=True)

In [36]:
learn.load('fine_tuned', with_opt=True);

In [37]:
learn.predict('Evideo oru Hollywood story',n_words=10)

'Evideo oru Hollywood story ho p ▁e nu kku ▁aa thu ▁nigeria i .'

In [38]:
learn.save_encoder('fine_tuned_enc')

In [39]:
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=mlen_vocab, bs=128, label_cols=label_cols, text_cols=text_cols)

In [40]:
data_clas.show_batch()

text,target
▁x x bo s ▁cast ▁ xxunk ▁r ▁mammootty ▁ xxunk ▁r ▁unni ▁muk und an ▁ xxunk ▁r ▁pra chi ▁te h lan ▁ xxunk ▁r ▁siddique ▁ xxunk ▁r ▁achutha n ▁ xxunk ▁r ▁anu si tara ▁ xxunk ▁r ▁in iya ▁ xxunk ▁r ▁kani ha ▁ xxunk ▁r ▁tar un ▁ar ora ▁ xxunk ▁r ▁suresh ▁krishna ▁ xxunk ▁r ▁mani ku ttan ▁ xxunk ▁r,unknown_state
▁x x bo s ▁sh ▁ xxrep ▁5 ▁o ▁ xxrep ▁4 ▁. ▁9 ▁mani kku ▁kaa nan ▁tho da ng iya dha a ▁ xxrep ▁4 ▁. ▁12 : 30 ▁a ayi ▁ xxrep ▁4 ▁. ▁e thra ▁ vattam ▁kand u ▁enna dh inum ▁kana kki laa ▁ xxrep ▁4 ▁. ▁nir tha an ▁pattu nila lo ▁ xxrep ▁5 ▁. ▁oru ▁katta ▁mohanlal ▁prithviraj ▁aaradhak an de,Positive
▁x x bo s ▁all a ▁ xxrep ▁4 ▁. ▁mana s ila ly ▁ xxrep ▁4 ▁o ▁ xxrep ▁7 ▁. ▁vi swa s ichu ▁ko o de varunn a vare ▁je e e vana an ▁ xxrep ▁5 ▁. ▁e etu ▁ xxrep ▁4 ▁. ▁jeevan ▁ xxrep ▁5 ▁. ▁kodu tha yaa alu m ▁ xxrep ▁6 ▁. ▁sam raksh ich ir kku m ▁ xxrep ▁7,unknown_state
▁x x bo s ▁ull ath ▁para ya llo oo ▁si dhi q ▁o zhi ch ▁bha a kki ▁el lla aam ▁chali ▁actor ors ▁ xxrep ▁4 ▁. ▁e ntha a avu mo oo ▁en th ▁ xxrep ▁4 ▁o ▁ xxrep ▁4 ▁. ▁com ment ▁no o ki ii . . . ▁fay ankara m ▁ennu ▁para ya an ▁e e e ▁tra il or il ▁onnu m,Negative
▁x x bo s ▁kuru pu uti varu nav r de ▁an ak il ▁kod tha ▁ke e chan ▁trai ler ▁i kka aa ▁u ff ▁in gal ▁e e e ja thi ▁mass ▁f d f s ▁ura ppa chu u ▁sing a sa ▁tha li va a ▁ xxrep ▁4 ▁. ▁mad ura raja ▁ xxunk ▁raja ef fect ▁ xxunk ▁ri p ▁you tu be ▁ thu,Negative


In [41]:
data_clas.sanity_check()

In [42]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5)

In [43]:
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (5391 items)
x: TextList
▁x x bo s ▁ho o ▁mam mo kka ▁police ▁vesha m ▁aa ha ▁ant has,▁x x bo s ▁oru ▁re k shay um ▁i lla . . . ki di lam ▁kannu ▁na na nju poy i,▁x x bo s ▁i kka ▁wait ing ▁ xxrep ▁9 ▁.,▁x x bo s ▁raj u ▁etta nte ▁oro ▁short tum ▁i ja thi ▁ pp w li,▁x x bo s ▁et tan ▁fan sil ▁ne tti ▁po ya ▁a ar enkil um ▁und o xxunk ▁ xxunk ▁mad ura raja ▁ xxunk ▁wait ing ▁ xxunk
y: CategoryList
Positive ,Positive ,Positive ,Positive ,Positive 
Path: .;

Valid: LabelList (540 items)
x: TextList
▁x x bo s ▁speech less ▁ xxunk . ▁i kka aa,▁x x bo s ▁raja ▁so llu nath u ▁mat tu tha am ▁se y y va a ▁se y yu n nath ▁mat tum ▁tha a ▁so l va a,▁x x bo s ▁im ▁pri thi vi raj ▁fan ▁from ▁tamilnadu . . . ▁love ▁it,▁x x bo s ▁mohanlal ▁sir ▁- ▁look ▁ xxrep ▁5 ▁. ▁ki d do . . .,▁x x bo s ▁kanda thil ▁ve ch ▁mu ng iya ▁p dam ▁ra ting ▁1 .1 ▁/ ▁5
y: CategoryList
not-malayalam ,not-malayalam ,not-malayalam ,Positive ,Negative 
Path: .;

Test: Label

In [44]:
learn.freeze()

In [45]:
learn.loss_func.func

CrossEntropyLoss()

In [46]:
mcc = MatthewsCorreff()

In [47]:
learn.metrics = [mcc, accuracy]

In [48]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,1.220221,1.039873,0.433369,0.616667,00:02


In [49]:
learn.save('first-full')

In [50]:
learn.load('first-full');

In [51]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,1.044585,0.812918,0.61871,0.731481,00:02


In [52]:
learn.save('second-full')

In [53]:
learn.load('second-full')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (5391 items)
x: TextList
▁x x bo s ▁ho o ▁mam mo kka ▁police ▁vesha m ▁aa ha ▁ant has,▁x x bo s ▁oru ▁re k shay um ▁i lla . . . ki di lam ▁kannu ▁na na nju poy i,▁x x bo s ▁i kka ▁wait ing ▁ xxrep ▁9 ▁.,▁x x bo s ▁raj u ▁etta nte ▁oro ▁short tum ▁i ja thi ▁ pp w li,▁x x bo s ▁et tan ▁fan sil ▁ne tti ▁po ya ▁a ar enkil um ▁und o xxunk ▁ xxunk ▁mad ura raja ▁ xxunk ▁wait ing ▁ xxunk
y: CategoryList
Positive ,Positive ,Positive ,Positive ,Positive 
Path: .;

Valid: LabelList (540 items)
x: TextList
▁x x bo s ▁speech less ▁ xxunk . ▁i kka aa,▁x x bo s ▁raja ▁so llu nath u ▁mat tu tha am ▁se y y va a ▁se y yu n nath ▁mat tum ▁tha a ▁so l va a,▁x x bo s ▁im ▁pri thi vi raj ▁fan ▁from ▁tamilnadu . . . ▁love ▁it,▁x x bo s ▁mohanlal ▁sir ▁- ▁look ▁ xxrep ▁5 ▁. ▁ki d do . . .,▁x x bo s ▁kanda thil ▁ve ch ▁mu ng iya ▁p dam ▁ra ting ▁1 .1 ▁/ ▁5
y: CategoryList
not-malayalam ,not-malayalam ,not-malayalam ,Positive ,Negative 
Path: .;

Test: Label

In [54]:
learn.unfreeze()
learn.fit_one_cycle(5, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.870312,0.520581,0.739501,0.814815,00:03
1,0.847661,0.503152,0.759501,0.827778,00:03
2,0.689732,0.224443,0.903708,0.931481,00:03
3,0.513973,0.090372,0.971418,0.97963,00:03
4,0.350574,0.076525,0.979178,0.985185,00:03


In [55]:
learn.save('final')

In [56]:
learn.load('final')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (5391 items)
x: TextList
▁x x bo s ▁ho o ▁mam mo kka ▁police ▁vesha m ▁aa ha ▁ant has,▁x x bo s ▁oru ▁re k shay um ▁i lla . . . ki di lam ▁kannu ▁na na nju poy i,▁x x bo s ▁i kka ▁wait ing ▁ xxrep ▁9 ▁.,▁x x bo s ▁raj u ▁etta nte ▁oro ▁short tum ▁i ja thi ▁ pp w li,▁x x bo s ▁et tan ▁fan sil ▁ne tti ▁po ya ▁a ar enkil um ▁und o xxunk ▁ xxunk ▁mad ura raja ▁ xxunk ▁wait ing ▁ xxunk
y: CategoryList
Positive ,Positive ,Positive ,Positive ,Positive 
Path: .;

Valid: LabelList (540 items)
x: TextList
▁x x bo s ▁speech less ▁ xxunk . ▁i kka aa,▁x x bo s ▁raja ▁so llu nath u ▁mat tu tha am ▁se y y va a ▁se y yu n nath ▁mat tum ▁tha a ▁so l va a,▁x x bo s ▁im ▁pri thi vi raj ▁fan ▁from ▁tamilnadu . . . ▁love ▁it,▁x x bo s ▁mohanlal ▁sir ▁- ▁look ▁ xxrep ▁5 ▁. ▁ki d do . . .,▁x x bo s ▁kanda thil ▁ve ch ▁mu ng iya ▁p dam ▁ra ting ▁1 .1 ▁/ ▁5
y: CategoryList
not-malayalam ,not-malayalam ,not-malayalam ,Positive ,Negative 
Path: .;

Test: Label

In [57]:
df_test = df_valid.copy()
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'query': list(df_test['text']), 'actual_label': list(df_test['category']), 'predicted_label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['category']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['predicted_label'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,query,actual_label,predicted_label,unknown_state,not-malayalam,Mixed_feelings,Negative,Positive
0,speechless 🤐. ikkaaa,not-malayalam,not-malayalam,0.00321969,0.983089,0.000970477,0.000786809,0.0119337
1,Raja sollunathu mattuthaam seyyvaa seyyunnath...,not-malayalam,not-malayalam,4.22216e-07,0.999999,1.17635e-07,3.19529e-08,4.0478e-08
2,Im Prithiviraj fan from tamilnadu... Love it,not-malayalam,not-malayalam,3.41896e-05,0.997794,0.000224901,7.11995e-05,0.00187623
3,mohanlal sir - look ..... kiddo...,Positive,Positive,0.0148137,0.131026,0.00122068,0.00103087,0.851909
4,Kandathil vech mungiya pdam Rating 1.1/5,Negative,Negative,0.0996198,0.00818416,0.00108131,0.856447,0.0346682


In [58]:
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

0.9851851851851852

In [59]:
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

0.9791775467684853

In [60]:
f1_score(df_result['actual_label'], df_result['predicted_label'], average='weighted')

0.9851470733984172

In [61]:
df_test = pd.read_csv(path/'../dc_fire/malayalam_test.tsv', sep='\t')
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'id': list(df_test['id']), 'text': list(df_test['text']), 'category': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['category']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['category'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,id,text,category,unknown_state,not-malayalam,Mixed_feelings,Negative,Positive
0,ml_sen_1,Bollywood film Newton inte remake aano?,Mixed_feelings,0.0315095,0.0116891,0.943262,0.00713557,0.00640415
1,ml_sen_2,endukond viewrs koodunnilla ?? ippozhum 2.8m a...,unknown_state,0.937249,0.00180892,0.00215666,0.00473083,0.054055
2,ml_sen_3,Mara paazhu mega mairananil ninnum ethil koodu...,unknown_state,0.581574,0.0235574,0.157017,0.11001,0.127841
3,ml_sen_4,Video nay cang xem cang thit,unknown_state,0.709561,0.207912,0.00976879,0.0402459,0.032512
4,ml_sen_5,Sunny chechiye kaanan vannathu njan maathram aano,Positive,0.451647,0.00063324,0.00140113,0.00090685,0.545412


In [62]:
Counter(df_result['category'])

Counter({'Mixed_feelings ': 55,
         'Negative ': 146,
         'Positive ': 588,
         'not-malayalam ': 188,
         'unknown_state ': 371})

In [63]:
df_result.to_csv('dc_fire_full.csv', index=False)

In [66]:
df_result.shape

(1348, 8)

In [67]:
df_result[df_result['category']=='unknown_state '].shape

(371, 8)