In [4]:
import numpy as np
import pandas as pd
import os
import itertools
from collections import Counter
from transformers import BertTokenizer

import sys
sys.path.append("..")
import utils.utils as utils
from data_preparation.data_preparation_pos import read_conll

In [5]:
import logging
# Only let ERROR-level (and more severe) records through for the transformers
# tokenizer base module's logger, keeping its warnings out of the cell outputs.
logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)

### PoS

In [6]:
# Checkpoint whose vocabulary is used to compute the PoS statistics.
model_name = "bert-base-multilingual-cased"
# NOTE(review): pos_stats below calls tokenizer.subword_tokenize(...), which the stock
# transformers BertTokenizer does not appear to provide — confirm which tokenizer class
# was actually bound to `tokenizer` when the saved outputs were produced.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [7]:
def pos_stats(info, table, tokenizer):
    """Compute basic corpus statistics for one PoS data split and store them in `table`.

    Args:
        info: Mapping with the keys "file_path" (file handed to `read_conll`),
            "lang_name" (value written to the "language" column) and "dataset"
            (prefix of the statistic columns, e.g. "train").
        table: DataFrame with a "language" column and "<dataset>_*" statistic
            columns. It is modified in place through `DataFrame.update`.
        tokenizer: Object exposing `subword_tokenize(tokens, tags)`; the first
            element of its return value is taken as the sentence's subword tokens.

    Returns:
        The same `table` object, with the row belonging to `lang_name` updated.

    Raises:
        IndexError: If the language is not in the table yet and no empty row is left.
    """
    file_path = info["file_path"]
    lang_name = info["lang_name"]
    dataset = info["dataset"]
    d = {}

    # NOTE(review): read_conll presumably returns (sentence ids, token lists, tag lists)
    # in positions 0-2 — confirm against data_preparation_pos.
    conllu_data = read_conll(file_path)
    examples = [
        {"id": sent_id, "tokens": tokens, "tags": tags}
        for sent_id, tokens, tags in zip(conllu_data[0], conllu_data[1], conllu_data[2])
    ]

    # Locate the row of this language. The all-NaN check comes first so the
    # (initially float) column is never compared against a string.
    language_col = table["language"]
    if language_col.isna().all() or lang_name not in language_col.values:
        empty_rows = table.index[language_col.isna()]
        if len(empty_rows) == 0:
            raise IndexError(
                "no empty row left in the table for language {!r}".format(lang_name)
            )
        d["language"] = lang_name
        index = empty_rows[0]
    else:
        index = table.index[language_col == lang_name][0]

    # Number of examples in dataset
    d[dataset + "_examples"] = [len(examples)]

    # Avg tokens (subword tokens per example)
    tokens, lengths = [], []
    for example in examples:
        subwords = tokenizer.subword_tokenize(example["tokens"], example["tags"])[0]
        tokens.extend(subwords)
        lengths.append(len(subwords))
    n_tokens = len(tokens)
    d[dataset + "_avg_tokens"] = [np.array(lengths).mean() if lengths else np.nan]

    # Hapaxes: subword types occurring exactly once. Counting straight from the
    # Counter avoids routing the integer counts through a string-typed numpy array.
    n_hapaxes = sum(1 for count in Counter(tokens).values() if count == 1)
    d[dataset + "_hapaxes"] = [n_hapaxes]
    # Percentage is relative to the total number of subword tokens; NaN (which
    # DataFrame.update skips) when the split produced no tokens at all.
    d[dataset + "_hapaxes(%)"] = [n_hapaxes / n_tokens * 100 if n_tokens else np.nan]

    # Unknown
    unk = tokens.count("[UNK]")
    d[dataset + "_unknown"] = [unk]
    d[dataset + "_unknown(%)"] = [unk / n_tokens * 100 if n_tokens else np.nan]

    table.update(pd.DataFrame(d, index=[index]))
    return table

In [11]:
# Column layout of the statistics table: "language", then per-split example counts,
# average token counts, hapax count/percentage pairs and unknown count/percentage pairs.
names = ["train", "dev", "test"]
names_examples = [split + "_examples" for split in names]
names_avg = [split + "_avg_tokens" for split in names]
names_hapaxes = [
    split + suffix
    for split, suffix in itertools.product(names, ["_hapaxes", "_hapaxes(%)"])
]
names_unknown = [
    split + suffix
    for split, suffix in itertools.product(names, ["_unknown", "_unknown(%)"])
]
colnames = ["language"] + names_examples + names_avg + names_hapaxes + names_unknown

# One all-NaN row per entry of the UD data directory; the rows are filled by pos_stats,
# which is handed to run_through_data together with the empty table.
values = np.full((len(os.listdir("../data/ud/")), len(colnames)), np.nan)

pos_table = utils.run_through_data(
    "../data/ud/",
    pos_stats,
    pd.DataFrame(values, columns=colnames),
    tokenizer=tokenizer,
)

HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [12]:
pos_table = utils.order_table(pos_table)
# Cast every count column (everything after "language" that is neither a percentage
# nor an average) to pandas' nullable integer dtype, so missing splits stay missing.
pos_table = pos_table.astype({
    col: pd.Int64Dtype()
    for col in pos_table.columns[1:]
    if not ("%" in col or "avg" in col)
})
pos_table

Unnamed: 0,language,train_examples,dev_examples,test_examples,train_avg_tokens,dev_avg_tokens,test_avg_tokens,train_hapaxes,train_hapaxes(%),dev_hapaxes,dev_hapaxes(%),test_hapaxes,test_hapaxes(%),train_unknown,train_unknown(%),dev_unknown,dev_unknown(%),test_unknown,test_unknown(%)
0,Bulgarian,8907.0,1115.0,1116,25.908387,26.656502,26.198029,766.0,0.331938,1014.0,3.411614,928,3.17406,0.0,0.0,0.0,0.0,0,0.0
1,English,4287.0,784.0,1000,22.703056,23.579082,24.394,4158.0,4.272152,2156.0,11.66288,3439,14.097729,800.0,0.821963,117.0,0.632911,116,0.475527
2,Russian,3850.0,579.0,1000,36.337662,38.015544,33.806,2311.0,1.651894,2335.0,10.608332,1965,5.812578,853.0,0.609721,149.0,0.676934,91,0.269183
3,Slovak,8483.0,1060.0,1061,18.328422,23.462264,24.967955,1805.0,1.160921,1863.0,7.490953,1641,6.194557,2106.0,1.354515,59.0,0.237234,37,0.13967
4,Croatian,6914.0,960.0,1136,39.301417,40.905208,37.956866,2646.0,0.973761,1769.0,4.504826,1790,4.151302,21.0,0.007728,1.0,0.002547,0,0.0
5,Chinese,3997.0,500.0,1000,37.077058,38.18,33.63,2260.0,1.524997,1202.0,6.29649,1366,4.06185,126.0,0.085022,14.0,0.073337,195,0.579839
6,Vietnamese,1400.0,800.0,800,20.641429,19.9725,20.7825,344.0,1.190394,372.0,2.328201,390,2.345724,15.0,0.051907,1.0,0.006259,1,0.006015
7,Thai,,,1000,,,69.424,,,,,184,0.265038,,,,,127,0.182934
8,Finnish,12217.0,1364.0,1000,30.006548,29.886364,33.789,3119.0,0.850814,1577.0,3.868515,2033,6.016751,840.0,0.229139,80.0,0.196247,195,0.577111
9,Basque,5396.0,1798.0,1799,25.981282,25.706897,26.115064,2169.0,1.547131,1750.0,3.786158,1767,3.761095,14.0,0.009986,3.0,0.006491,3,0.006386


Export excel

In [24]:
# Write the table to an Excel file in the working directory, without the row index.
# Run this before the LaTeX formatting cell below, which turns the float columns into strings.
pos_table.to_excel("pos_basic_stats.xlsx", index=False)

Export latex

In [13]:
# Split the columns by how they are rendered: percentages with two decimals,
# averages with one decimal; count columns are left untouched.
colnames_percentage = [col for col in pos_table.columns if "%" in col]
colnames_avg = [col for col in pos_table.columns if "%" not in col and "avg" in col]

# NOTE: this cell rebinds pos_table to a string-formatted version, so it is not
# idempotent — running it a second time applies a float format to strings and fails.
pos_table[colnames_percentage] = pos_table[colnames_percentage].applymap("{:.2f}".format)
pos_table[colnames_avg] = pos_table[colnames_avg].applymap("{:.1f}".format)
# Missing values become "-": real NaN/NA in the count columns, the literal "nan"
# produced by formatting NaN in the float columns.
pos_table = pos_table.replace(np.nan, "-").replace("nan", "-")
pos_table

Unnamed: 0,language,train_examples,dev_examples,test_examples,train_avg_tokens,dev_avg_tokens,test_avg_tokens,train_hapaxes,train_hapaxes(%),dev_hapaxes,dev_hapaxes(%),test_hapaxes,test_hapaxes(%),train_unknown,train_unknown(%),dev_unknown,dev_unknown(%),test_unknown,test_unknown(%)
0,Bulgarian,8907,1115,1116,25.9,26.7,26.2,766,0.33,1014,3.41,928,3.17,0,0.00,0,0.00,0,0.0
1,English,4287,784,1000,22.7,23.6,24.4,4158,4.27,2156,11.66,3439,14.1,800,0.82,117,0.63,116,0.48
2,Russian,3850,579,1000,36.3,38.0,33.8,2311,1.65,2335,10.61,1965,5.81,853,0.61,149,0.68,91,0.27
3,Slovak,8483,1060,1061,18.3,23.5,25.0,1805,1.16,1863,7.49,1641,6.19,2106,1.35,59,0.24,37,0.14
4,Croatian,6914,960,1136,39.3,40.9,38.0,2646,0.97,1769,4.50,1790,4.15,21,0.01,1,0.00,0,0.0
5,Chinese,3997,500,1000,37.1,38.2,33.6,2260,1.52,1202,6.30,1366,4.06,126,0.09,14,0.07,195,0.58
6,Vietnamese,1400,800,800,20.6,20.0,20.8,344,1.19,372,2.33,390,2.35,15,0.05,1,0.01,1,0.01
7,Thai,-,-,1000,-,-,69.4,-,-,-,-,184,0.27,-,-,-,-,127,0.18
8,Finnish,12217,1364,1000,30.0,29.9,33.8,3119,0.85,1577,3.87,2033,6.02,840,0.23,80,0.20,195,0.58
9,Basque,5396,1798,1799,26.0,25.7,26.1,2169,1.55,1750,3.79,1767,3.76,14,0.01,3,0.01,3,0.01


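Since the table is too wide to fit on one page, we split it into three LaTeX tables: example counts and average tokens, hapaxes, and unknown tokens.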

In [14]:
# Part 1: language plus the example-count and average-token columns (first 7 columns).
utils.convert_table_to_latex(pos_table.iloc[:,:7])

\fusional{Fusional} & Bulgarian & 8907 & 1115 & 1116 & 25.9 & 26.7 & 26.2\\
\fusional{Fusional} & English & 4287 & 784 & 1000 & 22.7 & 23.6 & 24.4\\
\fusional{Fusional} & Russian & 3850 & 579 & 1000 & 36.3 & 38.0 & 33.8\\
\fusional{Fusional} & Slovak & 8483 & 1060 & 1061 & 18.3 & 23.5 & 25.0\\
\fusional{Fusional} & Croatian & 6914 & 960 & 1136 & 39.3 & 40.9 & 38.0\\
\isolating{Isolating} & Chinese & 3997 & 500 & 1000 & 37.1 & 38.2 & 33.6\\
\isolating{Isolating} & Vietnamese & 1400 & 800 & 800 & 20.6 & 20.0 & 20.8\\
\isolating{Isolating} & Thai & - & - & 1000 & - & - & 69.4\\
\agglutinative{Agglutinative} & Finnish & 12217 & 1364 & 1000 & 30.0 & 29.9 & 33.8\\
\agglutinative{Agglutinative} & Basque & 5396 & 1798 & 1799 & 26.0 & 25.7 & 26.1\\
\agglutinative{Agglutinative} & Japanese & 7027 & 501 & 1000 & 35.0 & 35.5 & 41.7\\
\agglutinative{Agglutinative} & Korean & 23010 & 2066 & 1000 & 29.2 & 29.1 & 33.4\\
\agglutinative{Agglutinative} & Turkish & 3664 & 988 & 1000 & 21.8 & 21.4 & 33.8\\

Unnamed: 0,group,language,train_examples,dev_examples,test_examples,train_avg_tokens,dev_avg_tokens,test_avg_tokens
0,\fusional{Fusional},Bulgarian,8907,1115,1116,25.9,26.7,26.2
1,\fusional{Fusional},English,4287,784,1000,22.7,23.6,24.4
2,\fusional{Fusional},Russian,3850,579,1000,36.3,38.0,33.8
3,\fusional{Fusional},Slovak,8483,1060,1061,18.3,23.5,25.0
4,\fusional{Fusional},Croatian,6914,960,1136,39.3,40.9,38.0
5,\isolating{Isolating},Chinese,3997,500,1000,37.1,38.2,33.6
6,\isolating{Isolating},Vietnamese,1400,800,800,20.6,20.0,20.8
7,\isolating{Isolating},Thai,-,-,1000,-,-,69.4
8,\agglutinative{Agglutinative},Finnish,12217,1364,1000,30.0,29.9,33.8
9,\agglutinative{Agglutinative},Basque,5396,1798,1799,26.0,25.7,26.1


In [35]:
# Part 2: language plus the hapax count/percentage columns (columns 7-12).
utils.convert_table_to_latex(pos_table[["language"] + pos_table.columns[7:13].tolist()])

\fusional{Fusional} & Bulgarian & 766 & 0.33 & 1014 & 3.41 & 928 & 3.17\\
\fusional{Fusional} & English & 4158 & 4.27 & 2156 & 11.66 & 3439 & 14.10\\
\fusional{Fusional} & Russian & 2311 & 1.65 & 2335 & 10.61 & 1965 & 5.81\\
\fusional{Fusional} & Slovak & 1805 & 1.16 & 1863 & 7.49 & 1641 & 6.19\\
\fusional{Fusional} & Croatian & 2646 & 0.97 & 1769 & 4.50 & 1790 & 4.15\\
\isolating{Isolating} & Chinese & 2260 & 1.52 & 1202 & 6.30 & 1366 & 4.06\\
\isolating{Isolating} & Vietnamese & 344 & 1.19 & 372 & 2.33 & 390 & 2.35\\
\isolating{Isolating} & Thai & - & - & - & - & 184 & 0.27\\
\agglutinative{Agglutinative} & Finnish & 3119 & 0.85 & 1577 & 3.87 & 2033 & 6.02\\
\agglutinative{Agglutinative} & Basque & 2169 & 1.55 & 1750 & 3.79 & 1767 & 3.76\\
\agglutinative{Agglutinative} & Japanese & 1668 & 0.68 & 1021 & 5.73 & 1027 & 2.47\\
\agglutinative{Agglutinative} & Korean & 1432 & 0.21 & 439 & 0.73 & 721 & 2.16\\
\agglutinative{Agglutinative} & Turkish & 1138 & 1.42 & 1042 & 4.92 & 1730 & 5.13\

Unnamed: 0,group,language,train_hapaxes,train_hapaxes(%),dev_hapaxes,dev_hapaxes(%),test_hapaxes,test_hapaxes(%)
0,\fusional{Fusional},Bulgarian,766,0.33,1014,3.41,928,3.17
1,\fusional{Fusional},English,4158,4.27,2156,11.66,3439,14.1
2,\fusional{Fusional},Russian,2311,1.65,2335,10.61,1965,5.81
3,\fusional{Fusional},Slovak,1805,1.16,1863,7.49,1641,6.19
4,\fusional{Fusional},Croatian,2646,0.97,1769,4.50,1790,4.15
5,\isolating{Isolating},Chinese,2260,1.52,1202,6.30,1366,4.06
6,\isolating{Isolating},Vietnamese,344,1.19,372,2.33,390,2.35
7,\isolating{Isolating},Thai,-,-,-,-,184,0.27
8,\agglutinative{Agglutinative},Finnish,3119,0.85,1577,3.87,2033,6.02
9,\agglutinative{Agglutinative},Basque,2169,1.55,1750,3.79,1767,3.76


In [36]:
# Part 3: language plus the unknown-token count/percentage columns (column 13 onwards).
utils.convert_table_to_latex(pos_table[["language"] + pos_table.columns[13:].tolist()])

\fusional{Fusional} & Bulgarian & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\fusional{Fusional} & English & 800 & 0.82 & 117 & 0.63 & 116 & 0.48\\
\fusional{Fusional} & Russian & 853 & 0.61 & 149 & 0.68 & 91 & 0.27\\
\fusional{Fusional} & Slovak & 2106 & 1.35 & 59 & 0.24 & 37 & 0.14\\
\fusional{Fusional} & Croatian & 21 & 0.01 & 1 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Chinese & 126 & 0.09 & 14 & 0.07 & 195 & 0.58\\
\isolating{Isolating} & Vietnamese & 15 & 0.05 & 1 & 0.01 & 1 & 0.01\\
\isolating{Isolating} & Thai & - & - & - & - & 127 & 0.18\\
\agglutinative{Agglutinative} & Finnish & 840 & 0.23 & 80 & 0.20 & 195 & 0.58\\
\agglutinative{Agglutinative} & Basque & 14 & 0.01 & 3 & 0.01 & 3 & 0.01\\
\agglutinative{Agglutinative} & Japanese & 548 & 0.22 & 42 & 0.24 & 4 & 0.01\\
\agglutinative{Agglutinative} & Korean & 1552 & 0.23 & 77 & 0.13 & 92 & 0.28\\
\agglutinative{Agglutinative} & Turkish & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\introflexive{Introflexive} & Arabic & 0 & 0.00 & 0 & 0.00 & 0 & 

Unnamed: 0,group,language,train_unknown,train_unknown(%),dev_unknown,dev_unknown(%),test_unknown,test_unknown(%)
0,\fusional{Fusional},Bulgarian,0,0.00,0,0.00,0,0.0
1,\fusional{Fusional},English,800,0.82,117,0.63,116,0.48
2,\fusional{Fusional},Russian,853,0.61,149,0.68,91,0.27
3,\fusional{Fusional},Slovak,2106,1.35,59,0.24,37,0.14
4,\fusional{Fusional},Croatian,21,0.01,1,0.00,0,0.0
5,\isolating{Isolating},Chinese,126,0.09,14,0.07,195,0.58
6,\isolating{Isolating},Vietnamese,15,0.05,1,0.01,1,0.01
7,\isolating{Isolating},Thai,-,-,-,-,127,0.18
8,\agglutinative{Agglutinative},Finnish,840,0.23,80,0.20,195,0.58
9,\agglutinative{Agglutinative},Basque,14,0.01,3,0.01,3,0.01


### Sentiment

In [15]:
# Same checkpoint as the PoS section; `tokenizer` is rebound here for the sentiment statistics.
model_name = "bert-base-multilingual-cased"
# NOTE(review): ABSATokenizer is not imported in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel — add the import from the project module that
# defines it to the import cell at the top.
tokenizer = ABSATokenizer.from_pretrained(model_name)

In [16]:
def sentiment_stats(info, table, tokenizer):
    """Compute basic corpus statistics for one sentiment data split and store them in `table`.

    Args:
        info: Mapping with the keys "file_path" (header-less two-column CSV read
            as sentiment, review), "lang_name" (value written to the "language"
            column) and "dataset" (prefix of the statistic columns, e.g. "train").
        table: DataFrame with a "language" column and "<dataset>_*" statistic
            columns. It is modified in place through `DataFrame.update`.
        tokenizer: Object exposing `encode(text)` that returns a sequence of token
            ids. If it has a non-None `unk_token_id` attribute, that id is counted
            as the unknown token; otherwise id 100 is used.

    Returns:
        The same `table` object, with the row belonging to `lang_name` updated.

    Raises:
        IndexError: If the language is not in the table yet and no empty row is left.
    """
    file_path = info["file_path"]
    lang_name = info["lang_name"]
    dataset = info["dataset"]
    d = {}

    data = pd.read_csv(file_path, header=None)
    data.columns = ["sentiment", "review"]

    # Locate the row of this language. The all-NaN check comes first so the
    # (initially float) column is never compared against a string.
    language_col = table["language"]
    if language_col.isna().all() or lang_name not in language_col.values:
        empty_rows = table.index[language_col.isna()]
        if len(empty_rows) == 0:
            raise IndexError(
                "no empty row left in the table for language {!r}".format(lang_name)
            )
        d["language"] = lang_name
        index = empty_rows[0]
    else:
        index = table.index[language_col == lang_name][0]

    # Number of examples in dataset
    d[dataset + "_examples"] = [data.shape[0]]

    # Avg tokens (token ids per review)
    # NOTE(review): if tokenizer.encode adds special tokens, they are included in
    # every statistic below — confirm this is intended.
    tokens, lengths = [], []
    for review in data["review"]:
        token_ids = tokenizer.encode(review)
        tokens.extend(token_ids)
        lengths.append(len(token_ids))
    n_tokens = len(tokens)
    d[dataset + "_avg_tokens"] = [np.array(lengths).mean() if lengths else np.nan]

    # Hapaxes: token ids occurring exactly once, counted straight from the Counter
    # so that an empty token list is handled as well.
    n_hapaxes = sum(1 for count in Counter(tokens).values() if count == 1)
    d[dataset + "_hapaxes"] = [n_hapaxes]
    # Percentage is relative to the total number of tokens; NaN (which
    # DataFrame.update skips) when the split produced no tokens at all.
    d[dataset + "_hapaxes(%)"] = [n_hapaxes / n_tokens * 100 if n_tokens else np.nan]

    # Unknown: prefer the tokenizer's own [UNK] id over the hard-coded 100.
    unk_id = getattr(tokenizer, "unk_token_id", None)
    if unk_id is None:
        unk_id = 100
    unk = tokens.count(unk_id)
    d[dataset + "_unknown"] = [unk]
    d[dataset + "_unknown(%)"] = [unk / n_tokens * 100 if n_tokens else np.nan]

    table.update(pd.DataFrame(d, index=[index]))
    return table

In [17]:
# Column layout of the statistics table: "language", then per-split example counts,
# average token counts, hapax count/percentage pairs and unknown count/percentage pairs.
names = ["train", "dev", "test"]
names_examples = [split + "_examples" for split in names]
names_avg = [split + "_avg_tokens" for split in names]
names_hapaxes = [
    split + suffix
    for split, suffix in itertools.product(names, ["_hapaxes", "_hapaxes(%)"])
]
names_unknown = [
    split + suffix
    for split, suffix in itertools.product(names, ["_unknown", "_unknown(%)"])
]
colnames = ["language"] + names_examples + names_avg + names_hapaxes + names_unknown

# One all-NaN row per entry of the *sentiment* data directory. The row count was
# previously taken from "../data/ud/", which only matches by coincidence and makes
# sentiment_stats run out of empty rows if the sentiment data covers more languages.
values = np.full((len(os.listdir("../data/sentiment/")), len(colnames)), np.nan)

sentiment_table = utils.run_through_data(
    "../data/sentiment/",
    sentiment_stats,
    pd.DataFrame(values, columns=colnames),
    tokenizer=tokenizer,
)

HBox(children=(FloatProgress(value=0.0, max=46.0), HTML(value='')))

../data/sentiment\ko\kosac-corpus-130808.csv is not a valid data path, skipping



In [18]:
sentiment_table = utils.order_table(sentiment_table)
# Cast every count column (everything after "language" that is neither a percentage
# nor an average) to pandas' nullable integer dtype, so missing splits stay missing.
sentiment_table = sentiment_table.astype({
    col: pd.Int64Dtype()
    for col in sentiment_table.columns[1:]
    if not ("%" in col or "avg" in col)
})
sentiment_table

Unnamed: 0,language,train_examples,dev_examples,test_examples,train_avg_tokens,dev_avg_tokens,test_avg_tokens,train_hapaxes,train_hapaxes(%),dev_hapaxes,dev_hapaxes(%),test_hapaxes,test_hapaxes(%),train_unknown,train_unknown(%),dev_unknown,dev_unknown(%),test_unknown,test_unknown(%)
0,Bulgarian,5412,838,1673,19.942905,18.220764,19.430962,599,0.554984,471,3.084681,556,1.710348,29,0.026869,3,0.019648,10,0.030762
1,English,6920,872,1821,27.659682,27.959862,27.584843,3257,1.701627,2416,9.909356,2876,5.725434,626,0.327055,52,0.213281,99,0.197086
2,Russian,2938,424,867,1120.134786,1122.966981,941.967705,2005,0.060925,1838,0.386023,1879,0.230076,12547,0.381257,1321,0.277441,3288,0.402603
3,Slovak,3724,532,1064,32.97261,32.325188,34.880639,1460,1.189022,990,5.756818,1143,3.079783,142,0.115645,28,0.162819,51,0.137418
4,Croatian,1507,214,437,55.202389,56.742991,51.778032,1409,1.693713,946,7.790497,1055,4.662571,2,0.002404,0,0.0,0,0.0
5,Chinese,19292,2756,5513,80.298258,79.469521,80.164339,606,0.039119,856,0.390835,935,0.211564,6772,0.437153,1133,0.517309,1734,0.392356
6,Vietnamese,2384,331,685,35.752936,40.951662,37.435036,1026,1.203731,732,5.400221,797,3.108061,384,0.450519,51,0.376245,60,0.233982
7,Thai,8103,1153,2344,52.782179,56.319167,56.366894,1445,0.337858,597,0.919367,894,0.676637,1384,0.323596,193,0.297216,390,0.295177
8,Finnish,1355,199,397,93.261255,95.316583,92.488665,1704,1.348432,1215,6.405525,1320,3.594967,15,0.01187,2,0.010544,2,0.005447
9,Basque,789,113,227,21.449937,20.946903,20.682819,851,5.028362,461,19.47613,607,12.928647,0,0.0,0,0.0,0,0.0


Export excel

In [14]:
# Write the table to an Excel file in the working directory, without the row index.
# Run this before the LaTeX formatting cell below, which turns the float columns into strings.
sentiment_table.to_excel("sentiment_basic_stats.xlsx", index=False)

Export latex (split into several tables)

In [19]:
# Split the columns by how they are rendered: percentages with two decimals,
# averages with one decimal; count columns are left untouched.
colnames_percentage = [col for col in sentiment_table.columns if "%" in col]
colnames_avg = [col for col in sentiment_table.columns if "%" not in col and "avg" in col]

# NOTE: this cell rebinds sentiment_table to a string-formatted version, so it is not
# idempotent — running it a second time applies a float format to strings and fails.
sentiment_table[colnames_percentage] = sentiment_table[colnames_percentage].applymap("{:.2f}".format)
sentiment_table[colnames_avg] = sentiment_table[colnames_avg].applymap("{:.1f}".format)
# Missing values become "-": real NaN/NA in the count columns, the literal "nan"
# produced by formatting NaN in the float columns.
sentiment_table = sentiment_table.replace(np.nan, "-").replace("nan", "-")
sentiment_table

Unnamed: 0,language,train_examples,dev_examples,test_examples,train_avg_tokens,dev_avg_tokens,test_avg_tokens,train_hapaxes,train_hapaxes(%),dev_hapaxes,dev_hapaxes(%),test_hapaxes,test_hapaxes(%),train_unknown,train_unknown(%),dev_unknown,dev_unknown(%),test_unknown,test_unknown(%)
0,Bulgarian,5412,838,1673,19.9,18.2,19.4,599,0.55,471,3.08,556,1.71,29,0.03,3,0.02,10,0.03
1,English,6920,872,1821,27.7,28.0,27.6,3257,1.7,2416,9.91,2876,5.73,626,0.33,52,0.21,99,0.2
2,Russian,2938,424,867,1120.1,1123.0,942.0,2005,0.06,1838,0.39,1879,0.23,12547,0.38,1321,0.28,3288,0.4
3,Slovak,3724,532,1064,33.0,32.3,34.9,1460,1.19,990,5.76,1143,3.08,142,0.12,28,0.16,51,0.14
4,Croatian,1507,214,437,55.2,56.7,51.8,1409,1.69,946,7.79,1055,4.66,2,0.0,0,0.0,0,0.0
5,Chinese,19292,2756,5513,80.3,79.5,80.2,606,0.04,856,0.39,935,0.21,6772,0.44,1133,0.52,1734,0.39
6,Vietnamese,2384,331,685,35.8,41.0,37.4,1026,1.2,732,5.4,797,3.11,384,0.45,51,0.38,60,0.23
7,Thai,8103,1153,2344,52.8,56.3,56.4,1445,0.34,597,0.92,894,0.68,1384,0.32,193,0.3,390,0.3
8,Finnish,1355,199,397,93.3,95.3,92.5,1704,1.35,1215,6.41,1320,3.59,15,0.01,2,0.01,2,0.01
9,Basque,789,113,227,21.4,20.9,20.7,851,5.03,461,19.48,607,12.93,0,0.0,0,0.0,0,0.0


In [26]:
# Part 1: language plus the example-count and average-token columns (first 7 columns).
utils.convert_table_to_latex(sentiment_table.iloc[:,:7])

\fusional{Fusional} & Bulgarian & 5412 & 838 & 1673 & 19.9 & 18.2 & 19.4\\
\fusional{Fusional} & English & 6920 & 872 & 1821 & 27.7 & 28.0 & 27.6\\
\fusional{Fusional} & Russian & 2938 & 424 & 867 & 1120.1 & 1123.0 & 942.0\\
\fusional{Fusional} & Slovak & 3724 & 532 & 1064 & 33.0 & 32.3 & 34.9\\
\fusional{Fusional} & Croatian & 1507 & 214 & 437 & 55.2 & 56.7 & 51.8\\
\isolating{Isolating} & Chinese & 19292 & 2756 & 5513 & 80.3 & 79.5 & 80.2\\
\isolating{Isolating} & Vietnamese & 2384 & 331 & 685 & 35.8 & 41.0 & 37.4\\
\isolating{Isolating} & Thai & 8103 & 1153 & 2344 & 52.8 & 56.3 & 56.4\\
\agglutinative{Agglutinative} & Finnish & 1355 & 199 & 397 & 93.3 & 95.3 & 92.5\\
\agglutinative{Agglutinative} & Basque & 789 & 113 & 227 & 21.4 & 20.9 & 20.7\\
\agglutinative{Agglutinative} & Japanese & 10000 & 1000 & 2000 & 532.8 & 541.2 & 526.8\\
\agglutinative{Agglutinative} & Korean & 3237 & 463 & 926 & 44.2 & 39.2 & 43.0\\
\agglutinative{Agglutinative} & Turkish & 660 & 91 & 184 & 1217.5 & 128

Unnamed: 0,group,language,train_examples,dev_examples,test_examples,train_avg_tokens,dev_avg_tokens,test_avg_tokens
0,\fusional{Fusional},Bulgarian,5412,838,1673,19.9,18.2,19.4
1,\fusional{Fusional},English,6920,872,1821,27.7,28.0,27.6
2,\fusional{Fusional},Russian,2938,424,867,1120.1,1123.0,942.0
3,\fusional{Fusional},Slovak,3724,532,1064,33.0,32.3,34.9
4,\fusional{Fusional},Croatian,1507,214,437,55.2,56.7,51.8
5,\isolating{Isolating},Chinese,19292,2756,5513,80.3,79.5,80.2
6,\isolating{Isolating},Vietnamese,2384,331,685,35.8,41.0,37.4
7,\isolating{Isolating},Thai,8103,1153,2344,52.8,56.3,56.4
8,\agglutinative{Agglutinative},Finnish,1355,199,397,93.3,95.3,92.5
9,\agglutinative{Agglutinative},Basque,789,113,227,21.4,20.9,20.7


In [20]:
# Part 2: language plus the hapax count/percentage columns (columns 7-12).
utils.convert_table_to_latex(sentiment_table[["language"] + sentiment_table.columns[7:13].tolist()])

\fusional{Fusional} & Bulgarian & 599 & 0.55 & 471 & 3.08 & 556 & 1.71\\
\fusional{Fusional} & English & 3257 & 1.70 & 2416 & 9.91 & 2876 & 5.73\\
\fusional{Fusional} & Russian & 2005 & 0.06 & 1838 & 0.39 & 1879 & 0.23\\
\fusional{Fusional} & Slovak & 1460 & 1.19 & 990 & 5.76 & 1143 & 3.08\\
\fusional{Fusional} & Croatian & 1409 & 1.69 & 946 & 7.79 & 1055 & 4.66\\
\isolating{Isolating} & Chinese & 606 & 0.04 & 856 & 0.39 & 935 & 0.21\\
\isolating{Isolating} & Vietnamese & 1026 & 1.20 & 732 & 5.40 & 797 & 3.11\\
\isolating{Isolating} & Thai & 1445 & 0.34 & 597 & 0.92 & 894 & 0.68\\
\agglutinative{Agglutinative} & Finnish & 1704 & 1.35 & 1215 & 6.41 & 1320 & 3.59\\
\agglutinative{Agglutinative} & Basque & 851 & 5.03 & 461 & 19.48 & 607 & 12.93\\
\agglutinative{Agglutinative} & Japanese & 3413 & 0.06 & 1928 & 0.36 & 2288 & 0.22\\
\agglutinative{Agglutinative} & Korean & 445 & 0.31 & 507 & 2.79 & 448 & 1.12\\
\agglutinative{Agglutinative} & Turkish & 2851 & 0.35 & 1894 & 1.62 & 2049 & 0.90

Unnamed: 0,group,language,train_hapaxes,train_hapaxes(%),dev_hapaxes,dev_hapaxes(%),test_hapaxes,test_hapaxes(%)
0,\fusional{Fusional},Bulgarian,599,0.55,471,3.08,556,1.71
1,\fusional{Fusional},English,3257,1.7,2416,9.91,2876,5.73
2,\fusional{Fusional},Russian,2005,0.06,1838,0.39,1879,0.23
3,\fusional{Fusional},Slovak,1460,1.19,990,5.76,1143,3.08
4,\fusional{Fusional},Croatian,1409,1.69,946,7.79,1055,4.66
5,\isolating{Isolating},Chinese,606,0.04,856,0.39,935,0.21
6,\isolating{Isolating},Vietnamese,1026,1.2,732,5.4,797,3.11
7,\isolating{Isolating},Thai,1445,0.34,597,0.92,894,0.68
8,\agglutinative{Agglutinative},Finnish,1704,1.35,1215,6.41,1320,3.59
9,\agglutinative{Agglutinative},Basque,851,5.03,461,19.48,607,12.93


In [21]:
# Part 3: language plus the unknown-token count/percentage columns (column 13 onwards).
utils.convert_table_to_latex(sentiment_table[["language"] + sentiment_table.columns[13:].tolist()])

\fusional{Fusional} & Bulgarian & 29 & 0.03 & 3 & 0.02 & 10 & 0.03\\
\fusional{Fusional} & English & 626 & 0.33 & 52 & 0.21 & 99 & 0.20\\
\fusional{Fusional} & Russian & 12547 & 0.38 & 1321 & 0.28 & 3288 & 0.40\\
\fusional{Fusional} & Slovak & 142 & 0.12 & 28 & 0.16 & 51 & 0.14\\
\fusional{Fusional} & Croatian & 2 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\isolating{Isolating} & Chinese & 6772 & 0.44 & 1133 & 0.52 & 1734 & 0.39\\
\isolating{Isolating} & Vietnamese & 384 & 0.45 & 51 & 0.38 & 60 & 0.23\\
\isolating{Isolating} & Thai & 1384 & 0.32 & 193 & 0.30 & 390 & 0.30\\
\agglutinative{Agglutinative} & Finnish & 15 & 0.01 & 2 & 0.01 & 2 & 0.01\\
\agglutinative{Agglutinative} & Basque & 0 & 0.00 & 0 & 0.00 & 0 & 0.00\\
\agglutinative{Agglutinative} & Japanese & 16407 & 0.31 & 1539 & 0.28 & 2825 & 0.27\\
\agglutinative{Agglutinative} & Korean & 608 & 0.43 & 42 & 0.23 & 184 & 0.46\\
\agglutinative{Agglutinative} & Turkish & 2272 & 0.28 & 217 & 0.19 & 695 & 0.31\\
\introflexive{Introflexive} & Arabi

Unnamed: 0,group,language,train_unknown,train_unknown(%),dev_unknown,dev_unknown(%),test_unknown,test_unknown(%)
0,\fusional{Fusional},Bulgarian,29,0.03,3,0.02,10,0.03
1,\fusional{Fusional},English,626,0.33,52,0.21,99,0.2
2,\fusional{Fusional},Russian,12547,0.38,1321,0.28,3288,0.4
3,\fusional{Fusional},Slovak,142,0.12,28,0.16,51,0.14
4,\fusional{Fusional},Croatian,2,0.0,0,0.0,0,0.0
5,\isolating{Isolating},Chinese,6772,0.44,1133,0.52,1734,0.39
6,\isolating{Isolating},Vietnamese,384,0.45,51,0.38,60,0.23
7,\isolating{Isolating},Thai,1384,0.32,193,0.3,390,0.3
8,\agglutinative{Agglutinative},Finnish,15,0.01,2,0.01,2,0.01
9,\agglutinative{Agglutinative},Basque,0,0.0,0,0.0,0,0.0
