In [4]:
import glob
import matplotlib.pyplot as plt
import matplotlib
import matplotlib as mpl
import numpy as np
import os
import pandas as pd
import sys
from IPython.display import display

# Root folder that holds the dissertation data and code. It defaults to the
# original Windows share; set the DISSERTATION_BASEDIR environment variable to
# run the notebook against a copy stored somewhere else.
basedir = os.environ.get('DISSERTATION_BASEDIR', os.path.join('Y:\\', 'Dissertation'))

# Reference and appendices for datasets
# - vpdatadir:    norms folder inside the subs2vec checkout (*.tsv files)
# - countdatadir: frequency files (*.tsv files)
# - datasetsdir:  additional datasets (*.csv files)
vpdatadir = os.path.join(basedir, 'workspace', 'subs2vec', 'subs2vec', 'datasets', 'norms')
countdatadir = os.path.join(basedir, 'frequencies')
datasetsdir = os.path.join(basedir, 'datasets')

# Make the local word2manylanguages code importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
codepath = os.path.join(basedir, 'workspace', 'word2manylanguages', 'python')
if codepath not in sys.path:
    sys.path.append(codepath)
import word2manylanguages as wm



In [9]:
from IPython.display import HTML

# van Paridon and Thompson datasets
# Build one summary row per norms file in `vpdatadir` and render the result as
# an HTML table: dataset label, language, norm column names and number of rows.
files = glob.glob(os.path.join(vpdatadir, "*.tsv"))
rows = []

for file in files:
    # The file name is parsed as <language code>-<name part(s)>-<year>.tsv.
    # os.path handles the separators, so this no longer depends on Windows
    # backslashes, and the extension is actually stripped here.
    filename = os.path.splitext(os.path.basename(file))[0]
    parts = filename.split('-')
    language = parts[0]
    # Keep only the text before the first dot of the last part.
    year = parts[-1].split('.')[0]
    # Join the middle parts with single spaces (no trailing space before the year).
    name = ' '.join(parts[1:-1])

    norms = pd.read_csv(file, sep='\t', comment='#', na_values=['-', '–'])
    # Get the columns that hold the norms: everything except the 'word' column.
    norm_columns = [col for col in norms.columns if col != 'word']

    rows.append({
        "Dataset": f'{name} ({year})',
        "Language": wm.code2lang[language],
        "Norms": ', '.join(norm_columns),
        "Words": len(norms),
    })

# Naming the columns explicitly keeps sort_values from raising KeyError when no
# files were found.
dataframe = pd.DataFrame(rows, columns=["Dataset", "Language", "Norms", "Words"])
dataframe = dataframe.sort_values(by=['Dataset'], ascending=[True])
display(HTML(dataframe.to_html(index=False, justify='left')))


Dataset,Language,Norms,Words
bakhtiar (2015),farsi,"age of acquisition, familiarity, imageability",871
binder (2016),english,"mean r, vision, bright, dark, color, pattern, large, small, motion, biomotion, fast, slow, shape, complexity, face, body, touch, temperature, texture, weight, pain, audition, loud, low, high, sound, music, speech, taste, smell, head, upperlimb, lowerlimb, practice, landmark, path, scene, near, toward, away, number, time, duration, long, short, caused, consequential, social, human, communication, self, cognition, benefit, harm, pleasant, unpleasant, happy, sad, angry, disgusted, fearful, surprised, drive, needs, attention, arousal, word length, word frequency, word log10 frequency",535
bonin (2018),french,"concreteness, context availability, valence, arousal",1659
brysbaert (2014),dutch,"age of acquisition, concreteness",25888
brysbaert (2014),english,concreteness,37058
brysbaert (2019),english,prevalence,61855
cameirao (2010),portuguese,age of acquisition,1749
chedid (2019a),french,familiarity,3596
chedid (2019b),french,"visual perceptual strength, auditory perceptual strength",3596
desrochers (2009),french,imageability,3600


In [13]:
# Frequency datasets
# Build one summary row per frequency file in `countdatadir` and render the
# result as an HTML table: source corpus, language and number of rows.
# NOTE: `HTML` is imported in the van Paridon and Thompson cell above.
files = glob.glob(os.path.join(countdatadir, "*.tsv"))
rows = []

for file in files:
    # os.path handles the separators, so this no longer depends on Windows
    # backslashes.
    filename = os.path.splitext(os.path.basename(file))[0]

    # Files with 'wiki' in their name are labelled Wikipedia, all others OpenSubtitles.
    dataset = 'Wikipedia' if 'wiki' in filename else 'OpenSubtitles'
    # Characters 6-7 of the file name are used as the language code
    # (presumably a two-letter code after a six-character prefix - TODO confirm
    # against the naming scheme of the frequency files).
    language = filename[6:8]

    freqs = pd.read_csv(file, sep='\t', header=0)

    rows.append({
        "Dataset": dataset,
        "Language": wm.code2lang[language],
        "Words": len(freqs),
    })

# Naming the columns explicitly keeps sort_values from raising KeyError when no
# files were found.
dataframe = pd.DataFrame(rows, columns=["Dataset", "Language", "Words"])
dataframe = dataframe.sort_values(by=['Language', 'Dataset'], ascending=[True, True])
display(HTML(dataframe.to_html(index=False, justify='left')))



Dataset,Language,Words
OpenSubtitles,afrikaans,18348
Wikipedia,afrikaans,472483
OpenSubtitles,albanian,268784
Wikipedia,albanian,678017
OpenSubtitles,arabic,2948365
Wikipedia,arabic,2777714
OpenSubtitles,armenian,6842
Wikipedia,armenian,1587461
OpenSubtitles,basque,154490
Wikipedia,basque,912341


In [11]:
# Additional datasets
# Build one summary row per CSV file in `datasetsdir` and render the result as
# an HTML table: dataset label, language, norm column names and number of rows.
# Files that cannot be summarised are still listed, with placeholder values.
# NOTE: `HTML` is imported in the van Paridon and Thompson cell above.
files = glob.glob(os.path.join(datasetsdir, "*.csv"))
rows = []

for file in files:
    # The dataset label is the file name up to its first dot.
    dataset = os.path.basename(file).split('.')[0]

    row = {"Dataset": dataset}

    try:
        norms = pd.read_csv(file, sep=',', comment='#', na_values=['-', '–'])
        # Get the columns that hold the norms
        # (presumably an iterable of column-name strings - confirm in word2manylanguages).
        cols = wm.list_norm_columns(norms)

        # The language is the suffix of the first column named 'word_<language>'.
        # A file without such a column raises IndexError and is handled below.
        langs = [col[5:] for col in norms.columns if col.startswith('word_')]
        row["Language"] = langs[0]

        # Assigned once after collecting all names, so the key is present even
        # when no norm columns were found.
        row["Norms"] = ', '.join(cols)
        row["Words"] = len(norms)
    except Exception as exc:
        # Best effort: keep the file in the table with placeholder values, but
        # report why it failed instead of hiding the error. Catching Exception
        # rather than everything lets KeyboardInterrupt stop the cell.
        print(f'Could not summarise {file}: {exc!r}', file=sys.stderr)
        row["Language"] = 'unknown'
        row["Norms"] = 'None'
        row["Words"] = 0

    rows.append(row)

# Naming the columns explicitly keeps sort_values from raising KeyError when no
# files were found.
dataframe = pd.DataFrame(rows, columns=["Dataset", "Language", "Norms", "Words"])
dataframe = dataframe.sort_values(by=['Dataset'], ascending=[True])
display(HTML(dataframe.to_html(index=False, justify='left')))


Dataset,Language,Norms,Words
Alario1999,spanish,"image_agreement_M, familiar_M, complexity_M, variability_M, aoa_M",400
Alonso2015,spanish,"aoa_M, oral_freq_log_M, written_freq_log_SUBTLEXESP_M, written_freq_log_LEXESP_M, written_freq_log_espal_M",7039
Alonso2016,spanish,aoa_M,4640
Altarriba1999,english,"concrete_M, contextavailability_M, imagine_M",326
Alvarez2007,spanish,aoa_M,328
Amsel2012,english,"familiar_M, pain_M, smell_M, color_M, taste_M, sound_M, grasp_M, motion_M",559
Bakhtiar2013,persian,"imageagree_M, familiar_M, visualcomplex_M, aoa_M, imageability_M",200
Barbarotto2005,italian,aoa_o_M,80
Bestgen2012,english,"arousal_LSA_M, arousal_norms_M, concrete_LSA_M, concrete_norms_M, dominate_LSA_M, dominate_norms_M, imagine_LSA_M, imagine_norms_M, valence_LSA_M, valence_norms_M",17350
Birchenough2017,german,"aoa_M, aoa_M_likert",3259
