## Things Related to Vocabulary and Corpus Frequency

In [1]:
""" imports """
import utils
import default

import os
import GWOT
import csv
from pprint import pprint
from pandas import DataFrame

from scipy.stats import spearmanr
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
freqsfile = "data/decFreq1929.tsv"

rowlists = []

with open(freqsfile, newline='') as f:
    _ = f.readline()  # the header is useless
    tsvin = csv.reader(f, delimiter='\t')
    for row in tsvin:
        rowlists.append([float(v) for v in row[1:-1]])

In [4]:
columns = [sum(t) for t in list(zip(*rowlists))]
pprint(list(zip(default.RSC_YEARS, columns)))


[(1660, 455351.0),
 (1670, 831299.0),
 (1680, 573090.0),
 (1690, 723484.0),
 (1700, 780813.0),
 (1710, 489909.0),
 (1720, 538208.0),
 (1730, 600083.0),
 (1740, 1006220.0),
 (1750, 1179116.0),
 (1760, 972747.0),
 (1770, 1501489.0),
 (1780, 1354209.0),
 (1790, 1332896.0),
 (1800, 1613384.0),
 (1810, 1444050.0),
 (1820, 1405686.0),
 (1830, 2608192.0),
 (1840, 2023599.0),
 (1850, 4599892.0),
 (1860, 5872125.0),
 (1870, 7615360.0),
 (1880, 8626332.0),
 (1890, 10196869.0),
 (1900, 10334627.0),
 (1910, 8891529.0),
 (1920, 13583475.0)]


#### investigating how much vocabulary overlap there is in the incremental spaces (12.8.20)

In [6]:
df = DataFrame({"year1":{},
                "year2":{},
                "freq1":{},
                "freq2":{},
                "overlap":{}})

min_count = 1

for year1, year2 in default.SPACE_PAIR_SELECTION:
    c1 = utils.rsc_freqfile_column(year1)
    c2 = utils.rsc_freqfile_column(year2)
    
    freq1, freq2 = utils.get_freqdists_from_file(freqsfile, c1, c2)
    
    realfreq1 = {k:v for k,v in freq1.items() if v>=min_count}
    realfreq2 = {k:v for k,v in freq2.items() if v>=min_count}
    
    df = df.append({"year1":year1, 
               "year2":year2, 
               "freq1":len(realfreq1), 
               "freq2":len(realfreq2),
               "overlap":len(set(realfreq1).intersection(set(realfreq2)))}, 
              ignore_index=True)

df

reading frequencies: 462166it [00:00, 484481.25it/s]
reading frequencies: 462166it [00:00, 503577.24it/s]
reading frequencies: 462166it [00:00, 521022.77it/s]
reading frequencies: 462166it [00:00, 504145.91it/s]
reading frequencies: 462166it [00:00, 524635.50it/s]
reading frequencies: 462166it [00:00, 524736.34it/s]
reading frequencies: 462166it [00:00, 526645.09it/s]
reading frequencies: 462166it [00:00, 529622.00it/s]
reading frequencies: 462166it [00:00, 526345.93it/s]
reading frequencies: 462166it [00:00, 527568.14it/s]
reading frequencies: 462166it [00:00, 487801.77it/s]
reading frequencies: 462166it [00:00, 474993.98it/s]


Unnamed: 0,year1,year2,freq1,freq2,overlap
0,1740.0,1750.0,27243.0,29166.0,17587.0
1,1750.0,1760.0,29166.0,25237.0,16499.0
2,1680.0,1710.0,21440.0,21276.0,12032.0
3,1710.0,1740.0,21276.0,27243.0,13527.0
4,1740.0,1770.0,27243.0,32163.0,17233.0
5,1770.0,1800.0,32163.0,30795.0,15789.0
6,1800.0,1830.0,30795.0,47253.0,19580.0
7,1830.0,1860.0,47253.0,79880.0,32705.0
8,1860.0,1890.0,79880.0,113550.0,48228.0
9,1700.0,1800.0,24317.0,30795.0,11843.0


In [11]:
# how much do pairwise frequencies differ?
from tqdm import tqdm

freqfile = "data/decFreq1929.tsv"
min_count = 5
year1 = 1740
year2 = 1770

c1 = utils.rsc_freqfile_column(year1)
c2 = utils.rsc_freqfile_column(year2)

freq1, freq2 = utils.get_freqdists_from_file(freqfile, c1, c2)

realfreq1 = {k:v for k,v in freq1.items() if v>=min_count}
realfreq2 = {k:v for k,v in freq2.items() if v>=min_count}

diff = {w:realfreq2[w]-realfreq1[w] for w in set(realfreq1.keys()).intersection(set(realfreq2.keys()))}

reading frequencies: 462166it [00:00, 508752.94it/s]


In [12]:
increased = sorted(diff, key=diff.get, reverse=True)[:50]
decreased = sorted(diff, key=diff.get, reverse=False)[:50]

for ki,kd in zip(increased, decreased):
    print(f"{diff[ki]:>4.1f}  {ki:<20}     {diff[kd]:>4.1f}  {kd:<20}")

38832.0  the                      -893.0  he                  
31790.0  ,                        -620.0  his                 
20900.0  of                       -456.0  tho                 
15793.0  <\s>                     -394.0  :                   
10200.0  in                       -391.0  stone               
9383.0  to                       -377.0  him                 
7719.0  and                      -377.0  thro                
7382.0  OTHER                    -377.0  her                 
6716.0  a                        -317.0  `                   
4969.0  is                       -308.0  author              
4905.0  it                       -274.0  felt                
4605.0  be                       -269.0  fibres              
4028.0  that                     -260.0  she                 
3620.0  2                        -245.0  muscle              
3439.0  by                       -245.0  read                
3347.0  which                    -236.0  like                
331