In [1]:
import pandas as pd
from pathlib import Path
import sentencepiece as spm

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support

import seaborn as sns

## Data 

The data comes from Eesti Keeleressursside Keskus (the Center of Estonian Language Resources): http://peeter.eki.ee:5000/valence/paragraphsquery/

In [2]:
# Read the raw export, one paragraph record per line.
# The encoding is given explicitly because the paragraphs are Estonian text
# (ä, ü, õ ...) and the platform default encoding is not always UTF-8.
# NOTE(review): assumes the export file is UTF-8 encoded — confirm.
rows=Path('data/raw/exportparagraphs').read_text(encoding='utf-8').split('\n')
len(rows)

4090

In [3]:
def process_row(row):
    """Split one raw export line into its five named fields.

    The line is cut on the first four commas only, so commas inside the
    trailing text field are preserved. A line with fewer than five fields
    yields a record whose values are all empty strings.
    """
    columns = ('type', 'url', 'number', 'sentiment', 'text')
    fields = row.split(',', 4)
    if len(fields) < 5:
        # Malformed / blank line: keep a placeholder row with empty values.
        return dict.fromkeys(columns, '')
    return dict(zip(columns, fields))

def process_rows(rows):
    """Parse every raw line with ``process_row`` and collect the records in a DataFrame."""
    records = list(map(process_row, rows))
    return pd.DataFrame(records)

In [4]:
# Parse the raw lines into a DataFrame (one row per input line) and check its shape.
df=process_rows(rows)
df.shape

(4090, 5)

In [5]:
# Preview the first parsed rows.
df.head()

Unnamed: 0,number,sentiment,text,type,url
0,1,negatiivne,"""Enam kui kümme aastat tagasi tegutses huumori...",ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samo...
1,2,vastuoluline,"""Neid ridu kirjutades tundub isegi ebaviisakas...",ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samo...
2,3,positiivne,"""Isiklikult kohtasin natukegi Kukekese moodi p...",ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samo...
3,4,vastuoluline,"""Olen näinud ka, kuidas patrull korrarikkujat ...",ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samo...
4,5,negatiivne,"""Kummaline on nüüd äkki lugeda politsei ja sis...",ARVAMUS,http://arvamus.postimees.ee/1001520/anvar-samo...


In [6]:
# Label distribution; the blank label comes from lines that process_row
# could not split into five fields.
df.sentiment.value_counts()

negatiivne      1927
positiivne       882
neutraalne       727
vastuoluline     552
                   2
Name: sentiment, dtype: int64

## Make a second dataset that keeps only two categories (negative and positive)

In [7]:
# Two-class subset: keep only the negative and positive paragraphs,
# renumbering the rows from 0.
is_binary_label = df['sentiment'].isin(['negatiivne', 'positiivne'])
df_simple = df[is_binary_label].reset_index(drop=True)
df_simple.shape

(2809, 5)

## Remove empty lines

In [8]:
# Drop the placeholder rows produced for unparseable / blank lines.
# The index is reset (as is done for df_simple) so that row labels equal row
# positions; the cross-validation code indexes the text/label Series with
# positional fold indices, which would misalign on a gapped index.
df=df[df.text!=''].reset_index(drop=True)
df.shape

(4088, 5)

## Tokenizer model files

In [9]:
# Collect the SentencePiece model files. glob() yields entries in an
# arbitrary, filesystem-dependent order, so sort them to make the order in
# which the experiments run reproducible.
tokenizers_files=sorted(Path('tokenizers/').glob('*.model'))
len(tokenizers_files)

40

## Classifier

In [10]:
def make_pipe(tokenizer=None):
    """Build the bag-of-words -> LinearSVC classification pipeline.

    When ``tokenizer`` is given it is handed to ``CountVectorizer``;
    otherwise the vectorizer is created with its default settings.
    """
    vect_kwargs = {} if tokenizer is None else {'tokenizer': tokenizer}
    steps = [
        ('vect', CountVectorizer(**vect_kwargs)),
        ('clf', LinearSVC()),
    ]
    return Pipeline(steps)

def cv_model(X, y, tokenizer, n_splits=10):
    """Cross-validate the ``make_pipe`` pipeline for one tokenizer.

    Parameters
    ----------
    X : sequence of str (e.g. a pandas Series)
        Texts to classify.
    y : sequence (e.g. a pandas Series)
        Class labels aligned with ``X``.
    tokenizer : callable or None
        Forwarded to ``make_pipe``; ``None`` keeps the vectorizer defaults.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    pandas.DataFrame
        One row per fold with the columns ``precision``, ``recall``, ``f1``
        (weighted averages over classes) and ``cv_i`` (fold number from 0).
    """
    # Honour the requested number of folds (previously hard-coded to 10).
    skf = StratifiedKFold(n_splits=n_splits)

    # The splitter yields *positional* indices. Renumber the inputs from 0 and
    # select with .iloc so a Series carrying a gapped or otherwise non-default
    # index is not looked up by label.
    X = pd.Series(X).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)

    fold_rows = []
    for i, (train, test) in enumerate(skf.split(X, y)):
        # A fresh pipeline per fold so nothing fitted leaks between folds.
        text_clf = make_pipe(tokenizer)
        text_clf.fit(X.iloc[train], y.iloc[train])
        pred = text_clf.predict(X.iloc[test])
        prec, recall, f1, _ = precision_recall_fscore_support(
            y.iloc[test], pred, average='weighted')
        fold_rows.append({'precision': prec, 'recall': recall, 'f1': f1, 'cv_i': i})

    return pd.DataFrame(fold_rows, columns=['precision', 'recall', 'f1', 'cv_i'])

def tokenizer_metrics(tokenizer_file, X, y):
    """Cross-validate one tokenizer and tag the per-fold metrics with its name.

    ``tokenizer_file`` is the path of a SentencePiece model to load, or
    ``None`` to run the baseline with the vectorizer's default tokenization.
    The returned frame is the ``cv_model`` result plus a ``tokenizer`` column.
    """
    if tokenizer_file is None:
        tokenize = None
        label = 'default_sklearn_tokenizer'
    else:
        processor = spm.SentencePieceProcessor()
        processor.Load(str(tokenizer_file))
        tokenize = processor.EncodeAsPieces
        label = tokenizer_file

    fold_metrics = cv_model(X, y, tokenize)
    fold_metrics['tokenizer'] = label
    return fold_metrics

def tokenizers_metrics(tokenizer_files, X, y):
    """Run ``tokenizer_metrics`` for every tokenizer and stack the results.

    Parameters
    ----------
    tokenizer_files : iterable
        SentencePiece model paths; a ``None`` entry selects the default
        sklearn tokenization.
    X, y
        Texts and labels, forwarded unchanged to ``tokenizer_metrics``.

    Returns
    -------
    pandas.DataFrame
        The per-fold metrics of all tokenizers, re-indexed from 0. An empty
        frame is returned when ``tokenizer_files`` is empty.
    """
    frames = []
    for file in tokenizer_files:
        print(f'working on tokenizer {file}')
        frames.append(tokenizer_metrics(file, X, y))

    # pd.concat raises on an empty list; keep the old "empty frame" result.
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of DataFrame.append in the loop: append was
    # removed in pandas 2.0 and copied the accumulated frame on every call.
    return pd.concat(frames, ignore_index=True)

## Run the experiment on the original dataset (all categories)

In [11]:
# A leading None selects the default sklearn tokenizer, which is our baseline.
tokenizers_files_with_default = [None, *tokenizers_files]

In [12]:
# Cross-validate every tokenizer (baseline first) on the full dataset with all sentiment labels.
df_metrics_all = tokenizers_metrics(tokenizers_files_with_default, df.text, df.sentiment)

working on tokenizer None




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_identity.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_identity.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_identity.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_identity.model




In [13]:
# One row per (tokenizer, fold) pair.
df_metrics_all.shape

(410, 5)

In [14]:
# Preview the per-fold metrics (the first rows belong to the baseline tokenizer).
df_metrics_all.head()

Unnamed: 0,precision,recall,f1,cv_i,tokenizer
0,0.369497,0.374083,0.368613,0,default_sklearn_tokenizer
1,0.419964,0.391198,0.401656,1,default_sklearn_tokenizer
2,0.411638,0.422983,0.410133,2,default_sklearn_tokenizer
3,0.405747,0.430318,0.415472,3,default_sklearn_tokenizer
4,0.40668,0.452323,0.418062,4,default_sklearn_tokenizer


## Run the experiment on the simplified (two-category) dataset

In [15]:
# Same experiment on the two-class subset (negatiivne / positiivne only).
df_metrics_all_simple = tokenizers_metrics(tokenizers_files_with_default, df_simple.text, df_simple.sentiment)

working on tokenizer None
working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_1000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_1000_norm_identity.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_5000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_5000_norm_identity.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_10000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_10000_norm_identity.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nmt_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nfkc.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_nfkc_cf.model




working on tokenizer tokenizers/unigram_vocab_size_20000_norm_identity.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nmt_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nfkc.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nmt_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_nfkc_cf.model




working on tokenizer tokenizers/bpe_vocab_size_20000_norm_identity.model




## Visualize results

In [16]:
# Ten best tokenizers on the full dataset, ranked by median F1 across folds.
(
    df_metrics_all
    .groupby('tokenizer')['f1']
    .median()
    .to_frame()
    .sort_values(by='f1', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,f1
tokenizer,Unnamed: 1_level_1
tokenizers/unigram_vocab_size_1000_norm_nfkc.model,0.421593
tokenizers/bpe_vocab_size_10000_norm_identity.model,0.419265
tokenizers/unigram_vocab_size_1000_norm_nmt_nfkc.model,0.418405
tokenizers/bpe_vocab_size_5000_norm_nfkc.model,0.41793
tokenizers/bpe_vocab_size_5000_norm_nmt_nfkc.model,0.417743
default_sklearn_tokenizer,0.416767
tokenizers/bpe_vocab_size_5000_norm_identity.model,0.41519
tokenizers/bpe_vocab_size_1000_norm_identity.model,0.414394
tokenizers/bpe_vocab_size_1000_norm_nfkc.model,0.413803
tokenizers/unigram_vocab_size_1000_norm_identity.model,0.413796


In [17]:
# Ten best tokenizers on the two-class dataset, ranked by median F1 across folds.
(
    df_metrics_all_simple
    .groupby('tokenizer')['f1']
    .median()
    .to_frame()
    .sort_values(by='f1', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,f1
tokenizer,Unnamed: 1_level_1
default_sklearn_tokenizer,0.703863
tokenizers/unigram_vocab_size_20000_norm_identity.model,0.698125
tokenizers/unigram_vocab_size_20000_norm_nmt_nfkc_cf.model,0.696095
tokenizers/unigram_vocab_size_20000_norm_nmt_nfkc.model,0.695539
tokenizers/unigram_vocab_size_20000_norm_nfkc.model,0.695539
tokenizers/unigram_vocab_size_20000_norm_nfkc_cf.model,0.69416
tokenizers/bpe_vocab_size_10000_norm_nfkc_cf.model,0.681869
tokenizers/bpe_vocab_size_10000_norm_nmt_nfkc_cf.model,0.681869
tokenizers/unigram_vocab_size_10000_norm_nfkc.model,0.679319
tokenizers/unigram_vocab_size_10000_norm_nmt_nfkc.model,0.679319
