In [1]:
import os
import pandas as pd
import numpy as np
import sklearn.linear_model
import sklearn.model_selection
import sklearn.preprocessing
import sklearn.utils

In [2]:
basedir = os.path.join('Y:\\', 'Dissertation')
countdir = "frequencies"
modeldir = 'models'
evaldir = 'count_evals'
scoredir = 'count_scores'

scorepath = os.path.join(basedir, scoredir)
if not os.path.exists(scorepath):
    os.makedirs(scorepath)
    
evalpath = os.path.join(basedir, evaldir)
if not os.path.exists(evalpath):
    os.makedirs(evalpath)

In [3]:
lang = 'en'
dim = 50
win = 1
alg = 'sg'
# en_50_1_sg_wxd

base_file_name = f'{lang}_{str(dim)}_{str(win)}_{alg}'
input_path = os.path.join(basedir, modeldir, f'{base_file_name}_wxd.csv')
print("Loading model " + base_file_name + " from " + input_path)

with open(input_path, 'r', encoding='utf-8') as vecfile:
    # skip header
    next(vecfile)
    # initialize arrays
    vectors = np.zeros((10000000, dim))
    words = np.empty(10000000, dtype=object) # fill arrays
    for i, line in enumerate(vecfile):
        # Limit to 10 million, although it looks like 7.5 million is the largest
        if i >= 10000000:
            break
        #print(line)
        rowentries = line.rstrip('\n').split(',')
        words[i] = rowentries[0].casefold()
        vectors[i] = rowentries[1:dim+1]

    # truncate empty part of arrays, if necessary
    vectors = vectors[:i]
    words = words[:i] 

    # normalize by L1 norm
    vectors = vectors / np.linalg.norm(vectors, axis=1).reshape(-1, 1)

    wordsXdims = pd.DataFrame(vectors)
    wordsXdims.set_index(words, inplace=True) 

Loading model en_50_1_sg from Y:\Dissertation\models\en_50_1_sg_wxd.csv


In [4]:
langfile=f'dedup.{lang}.words.unigrams.tsv'
datasetspath = os.path.join(basedir,countdir)

print('Loading ' + langfile)
datapath = os.path.join(datasetspath,langfile)
freqs = pd.read_csv(datapath,sep='\t', comment='#', na_values=['-','–'])
freqs.set_index('unigram', inplace=True)

Loading dedup.en.words.unigrams.tsv


In [5]:
freqs_index = list(freqs.index.values)
wxd_index = list(wordsXdims.index.values)

missing = []
n = 0
for f in freqs_index:
    if f not in wxd_index: 
        missing.append(f)
        n += 1
        if (n >= 25): break

In [6]:
print(missing)

[nan, nan, 'ß', nan, 'ﬂoor', 'ﬁrst', 'ﬂy', 'ﬁnd', 'okay\xa0', 'µ', 'ﬂ', 'ﬂight', 'ﬂying', 'ﬂowers', 'huh\xa0', 'what\xa0', 'ﬁne', 'ﬂat', 'ﬁre', 'ﬂower', 'ﬂesh', 'ﬂag', 'ﬁght', 'ﬁve', 'riﬂe']


In [10]:
import unicodedata

i = 0
while i < len(missing):
    if (isinstance(missing[i], str)):
        missing[i] = unicodedata.normalize("NFKD", missing[i])
    i += 1
    
print(missing)


[nan, nan, 'ß', nan, 'floor', 'first', 'fly', 'find', 'okay ', 'μ', 'fl', 'flight', 'flying', 'flowers', 'huh ', 'what ', 'fine', 'flat', 'fire', 'flower', 'flesh', 'flag', 'fight', 'five', 'rifle']


In [9]:
type(missing[0])

float

In [11]:
i = 0
while i < len(freqs_index):
    if (isinstance(freqs_index[i], str)):
        freqs_index[i] = unicodedata.normalize("NFKD", freqs_index[i])
    i += 1

In [13]:
freqs.index = freqs_index

df = freqs.join(wordsXdims, how='inner')
    
# compensate for missing ys somehow
total = len(freqs)
missing = len(freqs) - len(df)
penalty = (total - missing) / total
print(f'vectors: {len(wordsXdims)}  freqs: {total}  matches: {len(df)}')
print(f'missing vectors for {missing} out of {total} words')

vectors: 7577800  freqs: 2397981  matches: 1360559
missing vectors for 1037422 out of 2397981 words


In [14]:
freqs_index = list(freqs.index.values)

missing = []
n = 0
for f in freqs_index:
    if f not in wxd_index: 
        missing.append(f)
        n += 1
        if (n >= 25): break

In [15]:
print(missing)

[nan, 'yöu', 'âa', 'yöur', 'ü', 'fiancée', 'fiancé', 'café', 'señor', 'é', nan, 'ó', 'josé', 'à', 'â', 'führer', 'françois', 'andré', 'cliché', 'não', 'résumé', 'rené', 'sátur', 'maría', 'qué']


In [16]:
from unidecode import unidecode

i = 0
while i < len(freqs_index):
    if (isinstance(freqs_index[i], str)):
        freqs_index[i] = unidecode(freqs_index[i])
    i += 1

In [17]:
freqs.index = freqs_index

df = freqs.join(wordsXdims, how='inner')
    
# compensate for missing ys somehow
total = len(freqs)
missing = len(freqs) - len(df)
penalty = (total - missing) / total
print(f'vectors: {len(wordsXdims)}  freqs: {total}  matches: {len(df)}')
print(f'missing vectors for {missing} out of {total} words')

vectors: 7577800  freqs: 2397981  matches: 1476478
missing vectors for 921503 out of 2397981 words


In [18]:
freqs_index = list(freqs.index.values)

missing = []
n = 0
for f in freqs_index:
    if f not in wxd_index: 
        missing.append(f)
        n += 1
        if (n >= 25): break

In [19]:
print(missing)


[nan, nan, nan, 'you ', 'right ', 'atrademarka', 'yyand', 'it ', '1/2', 'that ', 'aeaeaaduoae', 'me ', 'yythe', 'okay ', 'here ', 'yyit', 'yyyyyyi', 'yyto', 'yyyi', 'yyyyi', 'yywe', 'i1/2', 'varnaes', 'this ', 'enerything']


In [20]:
i = 0
while i < len(freqs_index):
    if (isinstance(freqs_index[i], str)):
        freqs_index[i] = freqs_index[i].strip()
    i += 1

In [21]:
freqs.index = freqs_index

df = freqs.join(wordsXdims, how='inner')
    
# compensate for missing ys somehow
total = len(freqs)
missing = len(freqs) - len(df)
penalty = (total - missing) / total
print(f'vectors: {len(wordsXdims)}  freqs: {total}  matches: {len(df)}')
print(f'missing vectors for {missing} out of {total} words')

vectors: 7577800  freqs: 2397981  matches: 1490117
missing vectors for 907864 out of 2397981 words


In [22]:
freqs_index = list(freqs.index.values)

missing = []
n = 0
for f in freqs_index:
    if f not in wxd_index: 
        missing.append(f)
        n += 1
        if (n >= 25): break

In [23]:
print(missing)

[nan, nan, nan, 'atrademarka', 'yyand', '1/2', 'aeaeaaduoae', 'yythe', 'yyit', 'yyyyyyi', 'yyto', 'yyyi', 'yyyyi', 'yywe', 'i1/2', 'varnaes', 'enerything', 'woulive', 'seoor', 'yythat', 'korsbaek', 'yybut', 'coulive', 'shoulive', 'yywhat']


In [24]:
freqs = freqs[freqs.unigram_freq >= 5]
df = freqs.join(wordsXdims, how='inner')
    
# compensate for missing ys somehow
total = len(freqs)
missing = len(freqs) - len(df)
penalty = (total - missing) / total
print(f'vectors: {len(wordsXdims)}  freqs: {total}  matches: {len(df)}')
print(f'missing vectors for {missing} out of {total} words')

vectors: 7577800  freqs: 406868  matches: 761046
missing vectors for -354178 out of 406868 words
