In [1]:
import pandas as pd
from sacremoses import MosesTokenizer

Create a Moses tokenizer

In [2]:
mtk = MosesTokenizer()

## Import cleaned data

In [3]:
train = pd.read_csv('train_clean.csv')
val = pd.read_csv('val_clean.csv')

train

Unnamed: 0,review_id,review,rating,review_clean,count
0,0,Ga disappointed neat products .. Meletot Hilsn...,1,ga disappointed neat products . meletot hilsny...,11
1,1,"Rdtanya replace broken glass, broken chargernya",1,rdtanya replace broken glass. broken chargernya,6
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1,nyesel bngt dsni shopping antecedent photo mes...,42
3,3,Sent a light blue suit goods ga want a refund,1,sent light blue suit goods ga want refund,8
4,4,Pendants came with dents and scratches on its ...,1,pendants came dents scratches surface. coating...,11
...,...,...,...,...,...
111873,146802,Product is good. But next time don’t put a sti...,5,product good. next time put sticker. ruined na...,12
111874,146803,Thanks you very satisfactory,5,thanks satisfactory,2
111875,146806,Excellent product quality delivery speed is ve...,5,excellent product quality delivery speed good ...,10
111876,146807,thanks gan,5,thanks gan,2


Apply to Moses tokenizer and combine them back

In [4]:
train['review_clean'] = train['review_clean'].apply(mtk.tokenize)
train

Unnamed: 0,review_id,review,rating,review_clean,count
0,0,Ga disappointed neat products .. Meletot Hilsn...,1,"[ga, disappointed, neat, products, ., meletot,...",11
1,1,"Rdtanya replace broken glass, broken chargernya",1,"[rdtanya, replace, broken, glass., broken, cha...",6
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1,"[nyesel, bngt, dsni, shopping, antecedent, pho...",42
3,3,Sent a light blue suit goods ga want a refund,1,"[sent, light, blue, suit, goods, ga, want, ref...",8
4,4,Pendants came with dents and scratches on its ...,1,"[pendants, came, dents, scratches, surface., c...",11
...,...,...,...,...,...
111873,146802,Product is good. But next time don’t put a sti...,5,"[product, good., next, time, put, sticker., ru...",12
111874,146803,Thanks you very satisfactory,5,"[thanks, satisfactory]",2
111875,146806,Excellent product quality delivery speed is ve...,5,"[excellent, product, quality, delivery, speed,...",10
111876,146807,thanks gan,5,"[thanks, gan]",2


In [5]:
train['review_clean'] = train['review_clean'].str.join(' ')
train

Unnamed: 0,review_id,review,rating,review_clean,count
0,0,Ga disappointed neat products .. Meletot Hilsn...,1,ga disappointed neat products . meletot hilsny...,11
1,1,"Rdtanya replace broken glass, broken chargernya",1,rdtanya replace broken glass. broken chargernya,6
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1,nyesel bngt dsni shopping antecedent photo mes...,42
3,3,Sent a light blue suit goods ga want a refund,1,sent light blue suit goods ga want refund,8
4,4,Pendants came with dents and scratches on its ...,1,pendants came dents scratches surface. coating...,11
...,...,...,...,...,...
111873,146802,Product is good. But next time don’t put a sti...,5,product good. next time put sticker. ruined na...,12
111874,146803,Thanks you very satisfactory,5,thanks satisfactory,2
111875,146806,Excellent product quality delivery speed is ve...,5,excellent product quality delivery speed good ...,10
111876,146807,thanks gan,5,thanks gan,2


Do the same thing for validation set

In [6]:
val['review'] = val['review_clean'].apply(mtk.tokenize).str.join(' ')
val

Unnamed: 0,review_id,review,rating,review_clean,count
0,8164,quality ugly. thin material,1,quality ugly. thin material,4
1,4416,poor product aand poor quality .,1,poor product aand poor quality.,5
2,11894,control lights miring2. somewhat disappointed....,1,control lights miring2. somewhat disappointed....,12
3,9314,buy usable,1,buy usable,2
4,7670,wrong item delivered frowning _ face ️,1,wrong item delivered frowning_face ️,5
...,...,...,...,...,...
4995,120696,product quality excellent product quality exce...,5,product quality excellent product quality exce...,9
4996,110609,nice pants alus material appropriate size,5,nice pants alus material appropriate size,6
4997,143247,products already received appropriate orders. ...,5,products already received appropriate orders. ...,8
4998,115772,suitable children sgttt pompuan,5,suitable children sgttt pompuan,4


Check train class distribution

In [7]:
train.groupby('rating').count()

Unnamed: 0_level_0,review_id,review,review_clean,count
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,12803,12803,12803,12803
2,10260,10260,10260,10260
3,27478,27478,27478,27478
4,30931,30931,30931,30931
5,30406,30406,30406,30406


## Prepare FastText input format

```
"But I have put a call saying I'm not hearing it." __label__1
"Excellent product quality, use makeup remover cotton soft, clean skin" __label__5
```

In [8]:
def prepare_fasttext(input, label): return '__label__' + label + ' ' + input

train_fasttext = prepare_fasttext(train['review_clean'], train['rating'].astype('str'))
val_fasttext = prepare_fasttext(val['review_clean'], val['rating'].astype('str'))

In [9]:
train_fasttext = train_fasttext.sample(len(train_fasttext))
train_fasttext

49180    __label__3 rumayannnnnnnnnnnnn goods quick pen...
85428    __label__5 speed ​ ​ of delivery good. good se...
93558    __label__5 thank much. already package safely . ♡
29422    __label__3 hemisphere seem jd hrs wear straple...
3768      __label__1 gasesuai photographed. heat materials
                               ...                        
51487    __label__4 huhuhuhuhu mayan prophecies seginda...
26586    __label__3 banget. lumayan beautiful color dur...
14089    __label__2 send reply. bought another serum ga...
24773    __label__3 sorry love 4. nice. fingernails big...
14730    __label__2 issue mask quantity previously solv...
Length: 111878, dtype: object

Save to file for use with FastText in IPython shell

In [10]:
train_fasttext.to_csv('train_fasttext.csv', index=False, header=False)
val_fasttext.to_csv('val_fasttext.csv', index=False, header=False)

## Load saved FastText model for interference

In [11]:
import fasttext

In [12]:
model = fasttext.load_model('fasttext.bin')



In [13]:
model.test('val_fasttext.csv')

(4993, 0.4302022831964751, 0.4302022831964751)

Perform interference on test set

In [14]:
test = pd.read_csv('test_clean.csv')
test['review_clean'] = test['review_clean'].apply(mtk.tokenize).str.join(' ')
test

Unnamed: 0,review_id,review,review_clean
0,1,"Great danger, cool, motif and cantik2 jg model...",great danger. cool. motif cantik2 jg models. d...
1,2,One of the shades don't fit well,one shades fit well
2,3,Very comfortable,comfortable
3,4,Fast delivery. Product expiry is on Dec 2022. ...,fast delivery. product expiry dec 2022. produc...
4,5,it's sooooo cute! i like playing with the glit...,sooooo cute. like playing glitters better brow...
...,...,...,...
60422,60423,Product has been succesfully ordered and shipp...,product succesfully ordered shipped quickly. g...
60423,60424,Opening time a little scared. Fear dalemnya de...,opening time little scared. fear dalemnya dest...
60424,60425,The product quality is excellent. The origina...,product quality excellent. original product. p...
60425,60426,They 're holding up REALLY well also .,&apos;re holding really well also .


In [15]:
test['fasttext'] = test['review_clean'].apply(model.predict)
test

Unnamed: 0,review_id,review,review_clean,fasttext
0,1,"Great danger, cool, motif and cantik2 jg model...",great danger. cool. motif cantik2 jg models. d...,"((__label__3,), [0.5670856237411499])"
1,2,One of the shades don't fit well,one shades fit well,"((__label__5,), [0.46503695845603943])"
2,3,Very comfortable,comfortable,"((__label__5,), [0.3944011628627777])"
3,4,Fast delivery. Product expiry is on Dec 2022. ...,fast delivery. product expiry dec 2022. produc...,"((__label__4,), [0.36109068989753723])"
4,5,it's sooooo cute! i like playing with the glit...,sooooo cute. like playing glitters better brow...,"((__label__4,), [0.40116262435913086])"
...,...,...,...,...
60422,60423,Product has been succesfully ordered and shipp...,product succesfully ordered shipped quickly. g...,"((__label__5,), [0.36166679859161377])"
60423,60424,Opening time a little scared. Fear dalemnya de...,opening time little scared. fear dalemnya dest...,"((__label__3,), [0.5584401488304138])"
60424,60425,The product quality is excellent. The origina...,product quality excellent. original product. p...,"((__label__4,), [0.5016987323760986])"
60425,60426,They 're holding up REALLY well also .,&apos;re holding really well also .,"((__label__5,), [0.43777817487716675])"


In [16]:
test['rating'] = test['fasttext'].str[0].str[0].str[-1]
test

Unnamed: 0,review_id,review,review_clean,fasttext,rating
0,1,"Great danger, cool, motif and cantik2 jg model...",great danger. cool. motif cantik2 jg models. d...,"((__label__3,), [0.5670856237411499])",3
1,2,One of the shades don't fit well,one shades fit well,"((__label__5,), [0.46503695845603943])",5
2,3,Very comfortable,comfortable,"((__label__5,), [0.3944011628627777])",5
3,4,Fast delivery. Product expiry is on Dec 2022. ...,fast delivery. product expiry dec 2022. produc...,"((__label__4,), [0.36109068989753723])",4
4,5,it's sooooo cute! i like playing with the glit...,sooooo cute. like playing glitters better brow...,"((__label__4,), [0.40116262435913086])",4
...,...,...,...,...,...
60422,60423,Product has been succesfully ordered and shipp...,product succesfully ordered shipped quickly. g...,"((__label__5,), [0.36166679859161377])",5
60423,60424,Opening time a little scared. Fear dalemnya de...,opening time little scared. fear dalemnya dest...,"((__label__3,), [0.5584401488304138])",3
60424,60425,The product quality is excellent. The origina...,product quality excellent. original product. p...,"((__label__4,), [0.5016987323760986])",4
60425,60426,They 're holding up REALLY well also .,&apos;re holding really well also .,"((__label__5,), [0.43777817487716675])",5


In [17]:
test.groupby('rating').count()

Unnamed: 0_level_0,review_id,review,review_clean,fasttext
rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5991,5991,5991,5991
2,2507,2507,2507,2507
3,14596,14596,14596,14596
4,20654,20654,20654,20654
5,16679,16679,16679,16679


In [18]:
test.loc[:, ['review_id', 'rating']].to_csv('fasttext_submission.csv', index=False)