In [1]:
library(tidyverse)
library(data.table)
library(hash)
# Infix "not in" operator: for each element of the left operand, TRUE when it
# does not occur in the right operand (element-wise negation of %in%).
`%ni%` <- function(x, table) !(x %in% table)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.0      [32m✔[39m [34mpurrr  [39m 1.0.1 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.5.0 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:purrr’:

    transpose


hash-2.2.6.2 provided by Decision Patterns



Attaching package: ‘hash’


The following object is masked from ‘package:data.table’:

    copy




In [2]:
# Input and output locations.
# `path` is the directory that holds one sub-directory per concept; it must end
# with "/" because concept names are appended to it with paste0() further down.
path <- ".../concepts/"
# File name for the results table (used by the write.csv call at the end of
# the notebook).
out.path <- ".../uniformity_calculations.csv"

# Every immediate sub-directory of `path` is treated as a concept, except the
# scratch, notebook-checkpoint and template folders.
# Arguments are spelled out in full (`full.names`, FALSE) instead of relying on
# partial argument matching and the reassignable shorthand `F`.
concepts <- list.dirs(path = path, full.names = FALSE, recursive = FALSE)
concepts <- concepts[!concepts %in% c("tmp", ".ipynb_checkpoints", "TEMPLATES")]
length(concepts)

In [3]:
# Lexical variants listed for each concept, keyed by concept name.
# length(variants[[concept]]) is used later as the number of variants of a
# concept (before gender-merged forms are discounted).
variants <- list(
    "STUDENT"     = c("alumna", "alumno", "estudiante"),
    "FOOLISH ACT" = c("boludez", "pavada"),
    "HUSBAND"     = c("esposo", "marido"),
    "NICKNAME"    = c("apodo", "sobrenombre", "mote", "remoquete"),
    "WAITER"      = c("camarera", "camarero", "mesera", "mesero")
)

In [4]:
# Concepts whose variants include forms that differ only in their gender
# ending. Each pattern stands for one merged variant (e.g. "alumn*" for
# alumna/alumno); the keys of this hash decide which concepts get their
# word-final vowel stripped before counting.
var2merge <- hash()
var2merge[["STUDENT"]] <- "alumn*"
var2merge[["WAITER"]] <- c("camarer*", "meser*")

# Lectometric Calculations

In [9]:
# Lectometric calculations.
# For every concept: load the cluster-assignment CSV of its largest model,
# derive type / country information from the token ids, and compute for every
# unordered pair of country lects
#   * the uniformity value (U) between the two lects,
#   * the internal uniformity of each lect,
#   * the number of tokens in the comparison.
# One row per concept is collected and bound into `all.calcs`.

# Internal uniformity of a single lect: sum of squared relative frequencies of
# its variants, rounded to 10 decimals.
internal.uniformity <- function(counts) {
    round(sum(prop.table(counts)^2), 10)
}

countries <- c("arg", "col", "mex", "per", "spa", "usa")
# All unordered lect pairs, in the order arg-col, arg-mex, ..., spa-usa.
lect.pairs <- combn(countries, 2, simplify = FALSE)
# File-name suffix removed (literally) to obtain the model name.
model.suffix <- ".tcmx.soc.pac__cluster_assignment.csv"
# Concepts whose gender-marked forms are merged before counting.
merge.concepts <- keys(var2merge)

# One data frame row per concept; bound together once after the loop.
concept.rows <- vector("list", length(concepts))

for (i in seq_along(concepts)) {
    concept <- concepts[i]

    sizes <- list()      # tokens per lect comparison
    u.values <- list()   # uniformity values
    iu.values <- list()  # internal uniformity

    # get all tokens from largest model ========================================================

    csvFiles <- list.files(path = paste0(path, concept), pattern = ".*csv$", full.names = TRUE)
    if (length(csvFiles) == 0) {
        # nothing to read for this concept: report it instead of crashing in read.csv()
        warning("No CSV file found for concept '", concept, "'; skipping it.", call. = FALSE)
        next
    }
    # largest file on disk (first one in case of ties)
    largest <- csvFiles[which.max(file.info(csvFiles)$size)]

    d <- read.csv(largest, quote = "", header = FALSE)
    colnames(d) <- c("token", "cluster")

    # model name = file name without the cluster-assignment suffix; basename()
    # keeps this independent of how deep `path` is nested
    model <- str_replace_all(basename(largest), fixed(model.suffix), "")

    # token ids are expected to look like "type/pos/file/id"
    token <- data.frame(do.call("rbind", strsplit(as.character(d$token), "/", fixed = TRUE)))
    d.new <- cbind.data.frame(d, token)
    colnames(d.new)[3:6] <- c("type", "pos", "file", "id")

    # the first two pieces of the file name (split on "_" or a digit) are used
    # as country and register; drop = FALSE keeps a matrix for one-row input
    d.file <- data.frame(do.call("rbind", strsplit(as.character(d.new$file), "_|[0-9]"))[, 1:2, drop = FALSE])
    colnames(d.file) <- c("country", "register")
    d.final <- cbind.data.frame(d.new, d.file)

    cat("----------------------\n",
        concept, "\n",
        "Model:", model, "\n",
        "Total number of tokens:", nrow(d.final), "\n",
        "----------------------\n\n")

    # get uniformity values ================================================================

    merge.forms <- concept %in% merge.concepts

    for (pair in lect.pairs) {
        lect <- pair[1]
        lect2 <- pair[2]
        pair.label <- paste0(lect, "-", lect2)

        d.sub <- droplevels(subset(d.final, country %in% c(lect, lect2)))

        # merge different forms of same variant: strip a word-final a/o/e so
        # that e.g. alumna/alumno are counted as one variant
        variant <- if (merge.forms) {
            factor(str_replace(d.sub$type, "(.+)[aoe]$", "\\1"))
        } else {
            d.sub$type
        }
        # contingency table: variants (rows) x lects present in the data (columns)
        utab <- table(variant, d.sub$country)

        # internal uniformity per lect, looked up by column name so the result
        # does not depend on the order of `countries`; NA when the lect has no tokens
        lect.iu <- if (lect %in% colnames(utab)) internal.uniformity(utab[, lect]) else NA
        lect2.iu <- if (lect2 %in% colnames(utab)) internal.uniformity(utab[, lect2]) else NA

        # uniformity between the two lects
        if (ncol(utab) < 2) {
            # at least one lect has no tokens: nothing to compare
            u <- NA
        } else if (nrow(utab) > 1 &&
                   fisher.test(utab, simulate.p.value = TRUE)$p.value > 0.05) {
            # more than one variant and Fisher's exact test finds no significant
            # difference between the lects: U-value = 1.
            # NOTE: for tables larger than 2 x 2 the p-value is simulated, so the
            # outcome depends on the state of the random number generator.
            u <- 1
        } else {
            # overlap of the two relative-frequency profiles (sum of row minima);
            # this is also the path taken for monolexical concepts
            u <- round(sum(apply(prop.table(utab, 2), 1, min), na.rm = TRUE), 10)
        }

        # list of calculations
        sizes[paste0(pair.label, " (sizes)")] <- sum(utab)
        u.values[paste0(pair.label, " (U.values)")] <- u
        iu.values[paste0(lect, " (i-U.values)")] <- lect.iu
        iu.values[paste0(lect2, " (i-U.values)")] <- lect2.iu
    }

    # merge everything (total tokens, U-values, internal U-values and lectal comparison sizes) into one dataframe:

    total.tokens <- nrow(d.final)
    # rbind() on a named list yields a one-row list-matrix, so the resulting
    # data frame columns are list columns (kept as in the original output)
    df.all <- cbind(rbind(u.values), rbind(iu.values), rbind(sizes))
    rownames(df.all) <- NULL

    # adjusting for merged variants: one variant is discounted per merge pattern
    # (assumes each pattern collapses exactly two listed forms)
    total.variants <- length(variants[[concept]])
    if (merge.forms) {
        total.variants <- total.variants - length(var2merge[[concept]])
    }

    concept.rows[[i]] <- cbind.data.frame(concept = concept,
                                          total.variants = total.variants,
                                          total.tokens = total.tokens,
                                          df.all)
}

# bind all elements to get final dataframe:
concept.rows <- Filter(Negate(is.null), concept.rows)
all.calcs <- if (length(concept.rows) > 0) do.call(rbind, concept.rows) else data.frame()

all.calcs

----------------------
 FOOLISH ACT 
 Model: FOOLISH ACT.nobound5-5all.PPMIno.LENGTH5000.SOCPOSall 
 Total number of tokens: 617 
 ----------------------

----------------------
 HUSBAND 
 Model: HUSBAND.nobound15-15openclass.PPMIweight.LENGTHFOC.SOCPOSall 
 Total number of tokens: 14319 
 ----------------------

----------------------
 NICKNAME 
 Model: NICKNAME.nobound10-10openclass.PPMIweight.LENGTHFOC.SOCPOSall 
 Total number of tokens: 1417 
 ----------------------

----------------------
 STUDENT 
 Model: STUDENT.nobound10-10all.PPMIweight.LENGTH5000.SOCPOSall 
 Total number of tokens: 20033 
 ----------------------

----------------------
 WAITER 
 Model: WAITER.nobound15-15all.PPMIselection.LENGTHFOC.SOCPOSall 
 Total number of tokens: 1042 
 ----------------------



concept,total.variants,total.tokens,arg-col (U.values),arg-mex (U.values),arg-per (U.values),arg-spa (U.values),arg-usa (U.values),col-mex (U.values),col-per (U.values),⋯,col-mex (sizes),col-per (sizes),col-spa (sizes),col-usa (sizes),mex-per (sizes),mex-spa (sizes),mex-usa (sizes),per-spa (sizes),per-usa (sizes),spa-usa (sizes)
<chr>,<int>,<int>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,⋯,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>
FOOLISH ACT,2,617,1.0,1.0,0.7622561,1.0,1.0,1.0,1.0,⋯,33,42,32,62,53,43,73,52,82,72
HUSBAND,2,14319,0.7178621,0.7562315,0.6717009,0.927178,0.8274454,0.9616306,0.9538388,⋯,4944,4251,4211,4599,5339,5299,5687,4606,4994,4954
NICKNAME,4,1417,0.9615848,1.0,1.0,1.0,1.0,0.9272105,0.8818911,⋯,467,519,415,409,562,458,452,510,504,400
STUDENT,2,20033,0.6867801,1.0,0.9648038,0.9199712,0.8546999,0.6785704,0.7219763,⋯,7678,8536,7119,6950,7360,5943,5774,6801,6632,5215
WAITER,2,1042,0.4528371,0.5382675,0.5774771,0.87253,0.8561347,1.0,1.0,⋯,394,268,456,404,284,472,420,346,294,482


In [None]:
# write.csv(as.matrix(all.calcs),out.path, row.names = FALSE)