# Generate frequency statistics for training datasets, transform to Latex for inclusion in report

In [5]:
import numpy as np
## Statistics on files
def line_count(file_name):
    input_freqs = []
    input_words = []
    i = 0
    with open(file_name, 'r') as f:
        for input_sentence in f:
            input_freqs.append(len(input_sentence))
            input_words.append(len(input_sentence.split(" ")))
    print("Done reading file {}".format(file_name))
    print("{} total lengths: avg: {} max: {} min: {} stddev: {}".format(file_name, np.mean(input_freqs), 
                                                                           np.max(input_freqs), 
                                                                           np.min(input_freqs),
                                                                           np.std(input_freqs)
                                                                          ))
    print("{} total words: avg: {} max: {} min: {} stddev: {}".format(file_name, np.mean(input_words), 
                                                                           np.max(input_words), 
                                                                           np.min(input_words),
                                                                           np.std(input_words)
                                                                          ))
    return np.array([np.mean(input_freqs), np.max(input_freqs), np.min(input_freqs),np.std(input_freqs),np.mean(input_words), np.max(input_words),np.min(input_words),np.std(input_words)])



line_count("train.fr")

Done reading file train.fr
train.fr total lengths: avg: 109.56542814548213 max: 3800 min: 4 stddev: 82.87469182093615
train.fr total words: avg: 21.349564665096462 max: 921 min: 3 stddev: 15.81507534011397


array([  1.09565428e+02,   3.80000000e+03,   4.00000000e+00,
         8.28746918e+01,   2.13495647e+01,   9.21000000e+02,
         3.00000000e+00,   1.58150753e+01])

In [19]:
import pandas as pd
overall_results = []

f_list = ["train.fr", 
          "train.en", 
          "dev.fr", 
          "dev.en", 
          "tst2012.fr", 
          "tst2012.en",  
          "valid.fr", 
          "valid.en",
          "tst2013.fr", 
          "tst2013.en",  
          "tst2014.fr", 
          "tst2014.en",  
          "train.fr.tok.bpe", 
          "train.en.tok.bpe", 
          "dev.fr.tok.bpe", 
          "dev.en.tok.bpe", 
          "tst2012.fr.tok.bpe", 
          "tst2012.en.tok.bpe",  
          "valid.fr.tok.bpe", 
          "valid.en.tok.bpe",
          "tst2013.fr.tok.bpe", 
          "tst2013.en.tok.bpe",  
          "tst2014.fr.tok.bpe", 
          "tst2014.en.tok.bpe",  
          "train.fr.norm.char", 
          "dev.fr.norm.char", 
          "tst2012.fr.norm.char", 
          "valid.fr.norm.char", 
          "tst2013.fr.norm.char", 
          "tst2014.fr.norm.char" 
         ]
for f in f_list:
    overall_results.append(line_count(f))
    
total_results = pd.DataFrame(overall_results, 
                             columns=["avg_char_length", 
                                      "max_char_length", 
                                      "min_char_length", 
                                      "std_char_length", "avg_word_length", 
                                      "max_word_length", "min_word_length", "std_word_length"])
total_results.index = f_list
total_results

Done reading file train.fr
train.fr total lengths: avg: 109.56542814548213 max: 3800 min: 4 stddev: 82.87469182093615
train.fr total words: avg: 21.349564665096462 max: 921 min: 3 stddev: 15.81507534011397
Done reading file train.en
train.en total lengths: avg: 95.75961830088853 max: 2688 min: 2 stddev: 71.76928737732645
train.en total words: avg: 17.35814901766959 max: 624 min: 1 stddev: 13.25205952317686
Done reading file dev.fr
dev.fr total lengths: avg: 114.47576099210823 max: 868 min: 6 stddev: 87.31308739148095
dev.fr total words: avg: 19.143179255918827 max: 153 min: 1 stddev: 14.503695296906226
Done reading file dev.en
dev.en total lengths: avg: 108.76888387824127 max: 874 min: 5 stddev: 82.09210339554114
dev.en total words: avg: 19.677564825253665 max: 158 min: 1 stddev: 14.72099726491149
Done reading file tst2012.fr
tst2012.fr total lengths: avg: 106.83096085409252 max: 511 min: 5 stddev: 71.6747912185992
tst2012.fr total words: avg: 18.846975088967973 max: 92 min: 2 stddev: 

Unnamed: 0,avg_char_length,max_char_length,min_char_length,std_char_length,avg_word_length,max_word_length,min_word_length,std_word_length
train.fr,109.565428,3800.0,4.0,82.874692,21.349565,921.0,3.0,15.815075
train.en,95.759618,2688.0,2.0,71.769287,17.358149,624.0,1.0,13.25206
dev.fr,114.475761,868.0,6.0,87.313087,19.143179,153.0,1.0,14.503695
dev.en,108.768884,874.0,5.0,82.092103,19.677565,158.0,1.0,14.720997
tst2012.fr,106.830961,511.0,5.0,71.674791,18.846975,92.0,2.0,12.157174
tst2012.en,92.457295,448.0,5.0,60.573335,17.55694,85.0,2.0,10.852266
valid.fr,108.532889,2199.0,4.0,82.515385,18.373518,473.0,1.0,14.449142
valid.en,97.35947,1894.0,4.0,73.979112,17.595764,439.0,1.0,13.77384
tst2013.fr,111.419103,680.0,4.0,80.858381,19.053606,110.0,1.0,13.645185
tst2013.en,99.222222,593.0,8.0,71.918864,18.04191,100.0,1.0,12.834727


## LaTeX Transformation

In [20]:
print(total_results.to_latex())

\begin{tabular}{lrrrrrrrr}
\toprule
{} &  avg\_char\_length &  max\_char\_length &  min\_char\_length &  std\_char\_length &  avg\_word\_length &  max\_word\_length &  min\_word\_length &  std\_word\_length \\
\midrule
train.fr             &       109.565428 &           3800.0 &              4.0 &        82.874692 &        21.349565 &            921.0 &              3.0 &        15.815075 \\
train.en             &        95.759618 &           2688.0 &              2.0 &        71.769287 &        17.358149 &            624.0 &              1.0 &        13.252060 \\
dev.fr               &       114.475761 &            868.0 &              6.0 &        87.313087 &        19.143179 &            153.0 &              1.0 &        14.503695 \\
dev.en               &       108.768884 &            874.0 &              5.0 &        82.092103 &        19.677565 &            158.0 &              1.0 &        14.720997 \\
tst2012.fr           &       106.830961 &            511.0 &              5.0

## Most frequent word examples

In [26]:
# most frequent words: BPE training, char training
from collections import Counter
def word_freq(file_name):
    input_words = Counter()
    i = 0
    with open(file_name, 'r') as f:
        for input_sentence in f:
            input_words.update(input_sentence.split(" "))
    print("Done reading file {}".format(file_name))
    print("Top 10 most common words in {} : {} ".format(file_name, " ".join(["{}: {}".format(x[0], x[1]) for x in input_words.most_common(10)])))
    return 

for f in f_list:
    word_freq(f)

Done reading file train.fr
Top 10 most common words in train.fr : : 528234 
: 226722 de: 165628 la: 86088 et: 83260 que: 73674 le: 69528 les: 69280 à: 68440 des: 56601 
Done reading file train.en
Top 10 most common words in train.en : the: 138141 to: 87325 of: 80031 and: 74748 a: 72975 that: 53779 in: 51261 I: 46685 is: 41978 you: 36267 
Done reading file dev.fr
Top 10 most common words in dev.fr : de: 786 la: 403 et: 344 le: 326 que: 313 à: 301 les: 275 des: 231 un: 218 en: 176 
Done reading file dev.en
Top 10 most common words in dev.en : the: 894 of: 611 to: 510 that: 379 a: 369 and: 367 is: 307 in: 252 you: 218 I: 197 
Done reading file tst2012.fr
Top 10 most common words in tst2012.fr : 
: 1124 de: 718 la: 413 que: 411 et: 396 les: 367 à: 362 nous: 352 le: 317 des: 304 
Done reading file tst2012.en
Top 10 most common words in tst2012.en : 
: 1124 the: 723 to: 542 a: 439 and: 420 of: 378 that: 356 we: 266 in: 262 you: 240 
Done reading file valid.fr
Top 10 most common words in vali

In [None]:
# from SLURM output files
# Read 178046 sentence pairs
# Filtered to 104875 pairs
# Reading lines...
# Read 887 sentence pairs
# Filtered to 494 pairs
# Trimmed from 104875 pairs to 93022, 0.8870 of total
\bf Dataset \bf Original Pairs & \bf Trimmed Pairs &\bf \% of Total \\
\hline 
Training & 178,046 & 104,875 & 58.9\% \\
Validation & 887 & 494 & 55.7\% \\
\hline