In [None]:
library(tidyverse)
library(data.table)
library(hash)
# "not in" operator: logical negation of `%in%`
`%ni%` <- Negate(`%in%`)

In [None]:
# Directory that holds one sub-directory per concept
path <- ".../concepts/"
concepts <- list.dirs(path = path, full.names = FALSE, recursive = FALSE)
# concepts <- concepts[! concepts %in% c("tmp", ".ipynb_checkpoints","TEMPLATES")]

# Annotated in-/out-of-concept tokens; the first CSV column supplies the row names
in.out.tokens <- read.csv(".../AnnotatedTokens.csv", row.names = 1)

In [None]:
#' Reconstruct variant frequencies of the in-concept tokens
#'
#' For every cell (variant x country) of `utab`, the share of annotated tokens
#' that are in-concept is projected onto all sampled tokens:
#'   round((in-concept count / annotated count) * sampled count)
#'
#' @param utab Two-way table of in-concept token counts; the first dimension
#'   holds the variants, the second the countries.
#' @param allann Data frame with all annotated tokens of a concept (both in
#'   and out of concept). Must contain a `country` column and a `type` column
#'   holding the variants.
#' @param tokens Data frame with all tokens sampled for the concept. Must
#'   contain a `country` column and a `type` column holding the variants.
#' @return A table with the same dimensions and dimnames as `utab` holding the
#'   estimated counts. Cells whose estimate is NaN (e.g. 0 / 0 when no token
#'   was annotated for that variant and country) are set to 0.
ins.prop <- function(utab, allann, tokens) {
    # Nothing to rescale in an empty table; 1:nrow() style indexing would
    # otherwise iterate over c(1, 0) and fail.
    if (length(utab) == 0) {
        return(utab)
    }

    # Long format: column 1 = variant, column 2 = country, column 3 = count.
    # Rows are in the same (column-major) order as the cells of the table.
    u.df <- data.frame(utab)
    variant <- as.character(u.df[[1]])
    country <- as.character(u.df[[2]])

    # Number of rows of `df` matching each (variant, country) cell.
    # Rows with a missing country or type are not counted.
    count.tokens <- function(df) {
        vapply(
            seq_along(variant),
            function(i) {
                sum(df$country == country[i] & df$type == variant[i], na.rm = TRUE)
            },
            numeric(1)
        )
    }

    denom <- count.tokens(allann)
    full <- count.tokens(tokens)

    # formula: (no. of in-concept tokens for variant / no. of annotated tokens
    # for variant) * no. of all sampled tokens for variant
    real.val <- round((u.df[[3]] / denom) * full)

    # Write the estimates back cell by cell, keeping dimensions and dimnames
    utab2 <- utab
    utab2[] <- real.val
    utab2[is.nan(utab2)] <- 0

    utab2
}

### Add additional information to the data frame, such as type, country, and/or register:

In [None]:
# Derive token metadata from the row names of the annotation table.
# The steps run in order: `register` is first cut from the intermediate
# `country` string (before the "_..." suffix is removed from it) and is then
# reduced to the letter "b" or "g".
ann.toks <- in.out.tokens %>%
    mutate(
        token = rownames(in.out.tokens),
        # everything before the last three "/" separators
        type = factor(str_replace(token, "/.*/.*/.*$", "")),
        # drop the leading "<word>/<one character>/" prefix
        country = factor(str_replace(token, "^\\w*/./", "")),
        register = factor(str_extract(country, "_.")),
        country = factor(str_replace(country, "_.*$", "")),
        register = factor(str_extract(register, "[bg]"))
    )
ann.toks

In [None]:
# Concepts (list names) and the lexical variants that express each of them
variants <- list(
    "STUDENT" = c("alumna", "alumno", "estudiante"),
    "FOOLISH ACT" = c("boludez", "pavada"),
    "HUSBAND" = c("esposo", "marido"),
    "NICKNAME" = c("apodo", "sobrenombre", "mote", "remoquete"),
    "WAITER" = c("camarera", "camarero", "mesera", "mesero")
)

In [None]:
# Concepts whose variants differ only in grammatical gender and are merged;
# each value lists the merged stem pattern(s) for that concept
var2merge <- hash()
var2merge$WAITER <- c("camarer*", "meser*")
var2merge$STUDENT <- "alumn*"

# Lectometric Calculations

In [None]:
# Accumulates one row of lectometric results per concept.
all.calcs <- data.frame()

# Per concept: the lect pairs ("lect-lect2") for which no uniformity value
# could be computed because at least one lect has no in-concept tokens.
# Must exist before the loop assigns into it.
has.na <- list()

# Lects (countries) that are compared pairwise. combn() yields every unordered
# pair exactly once, in the order arg-col, arg-mex, ..., spa-usa.
countries <- c("arg", "col", "mex", "per", "spa", "usa")
lect.pairs <- combn(countries, 2, simplify = FALSE)

# Removes a final a/o/e so that gendered variants collapse into one type.
strip.gender <- function(x) {
    factor(str_replace(x, "(.+)[aoe]$", "\\1"))
}

for (concept in concepts) {

    sizes <- list()
    u.values <- list()
    iu.values <- list()

    # Use the largest cluster-assignment CSV in the concept directory.
    csvFiles <- list.files(path = paste0(path, concept), pattern = ".*csv$", full.names = TRUE)
    if (length(csvFiles) == 0) {
        warning("No CSV file found for concept '", concept, "'; concept skipped.",
                call. = FALSE)
        next
    }
    info <- file.info(csvFiles)
    csvFiles <- csvFiles[order(-info$size)]
    largest <- head(csvFiles, 1)

    d <- read.csv(largest, quote = "", header = FALSE)
    colnames(d) <- c("token", "cluster")

    # File name without its directory (independent of how deep `path` is);
    # the suffix is removed literally rather than as a regular expression.
    # NOTE(review): assumes the former 7th path component was the file name.
    model.name <- basename(largest)
    model <- str_replace_all(model.name, fixed(".tcmx.soc.pac__cluster_assignment.csv"), "")

    # Split the token id "type/pos/file/id" into separate columns.
    d.token <- data.frame(do.call("rbind", strsplit(as.character(d$token), "/", fixed = TRUE)))
    d.new <- cbind.data.frame(d, d.token)
    colnames(d.new)[3:6] <- c("type", "pos", "file", "id")

    # The first two pieces of the file name (split on "_" or a digit) are
    # taken as country and register.
    d.file <- data.frame(do.call("rbind", strsplit(as.character(d.new$file), "_|[0-9]"))[, 1:2])
    colnames(d.file) <- c("country", "register")
    d.full <- cbind.data.frame(d.new, d.file)

    # Annotated tokens of this concept, and the in-concept subset of them.
    d.final <- subset(ann.toks, type %in% variants[[concept]])
    d.final.in <- subset(d.final, in_concept == 1)

    # Gender merging does not depend on the lect pair, so it is done once per
    # concept instead of once per comparison.
    merge.gender <- concept %in% keys(var2merge)
    if (merge.gender) {
        d.full <- mutate(d.full, type = strip.gender(type))
        ann <- mutate(d.final, type = strip.gender(type))
    } else {
        ann <- d.final
    }

    # add uniformity values ================================================================

    for (country.comparison in lect.pairs) {
        lect <- country.comparison[1]
        lect2 <- country.comparison[2]
        comparison <- paste0(lect, "-", lect2)

        dsub <- droplevels(subset(d.final.in, country %in% country.comparison))
        if (merge.gender) {
            dsub <- mutate(dsub, type = strip.gender(type))
        }

        # Variant x country counts of in-concept tokens, rescaled to all
        # sampled tokens. An empty table is left as is so that the
        # ncol(utab) == 0 branch below can handle it.
        utab <- table(dsub$type, dsub$country)
        if (length(utab) > 0) {
            utab <- ins.prop(utab, allann = ann, tokens = d.full)
        }
        print(utab)

        # Only one variant attested: fisher.test() needs at least two rows.
        monolex <- nrow(utab) == 1

        if (ncol(utab) == 0) {
            # Neither lect has in-concept tokens.
            has.na[[concept]] <- c(has.na[[concept]], comparison)
            u <- NA
            lect.iu <- NA
            lect2.iu <- NA
        } else if (ncol(utab) == 1) {
            # Only one lect has in-concept tokens: no U value, and an
            # internal uniformity for the attested lect only.
            has.na[[concept]] <- c(has.na[[concept]], comparison)
            u <- NA
            single.iu <- round(sum(prop.table(utab[, 1])^2), 10)
            if (colnames(utab) == lect) {
                lect.iu <- single.iu
                lect2.iu <- NA
            } else {
                lect.iu <- NA
                lect2.iu <- single.iu
            }
        } else {
            # Internal uniformity: sum of squared variant proportions per lect.
            # Columns are addressed by name, not by position.
            lect.iu <- round(sum(prop.table(utab[, lect])^2), 10)
            lect2.iu <- round(sum(prop.table(utab[, lect2])^2), 10)

            if (!monolex && fisher.test(utab, simulate.p.value = TRUE)$p.value > 0.05) {
                # No significant difference between the two distributions.
                u <- 1
            } else {
                # Uniformity: summed overlap (row-wise minimum) of the
                # column-wise variant proportions.
                prop.utab <- prop.table(utab, 2)
                u <- round(sum(apply(prop.utab, 1, min), na.rm = TRUE), 10)
            }
        }

        sizes[paste0(comparison, " (sizes)")] <- sum(utab)
        u.values[paste0(comparison, " (U.values)")] <- u
        iu.values[paste0(lect, " (i-U.values)")] <- lect.iu
        iu.values[paste0(lect2, " (i-U.values)")] <- lect2.iu
    }

    total.in.tokens <- nrow(d.final.in)

    # One-row matrices whose column names are the list names.
    df1 <- rbind(u.values)
    df2 <- rbind(iu.values)
    df3 <- rbind(sizes)
    df.all <- cbind(df1, df2, df3)
    rownames(df.all) <- NULL

    if (merge.gender) {
        total.variants <- (length(variants[[concept]]) - length(var2merge[[concept]]))
    } else {
        total.variants <- length(variants[[concept]])
    }

    all.calcs <- rbind(all.calcs, cbind.data.frame(concept, total.variants, total.in.tokens, df.all))

}
all.calcs

In [None]:
# Write the results as a matrix (input for the MDS analysis)
write.csv(
    x = as.matrix(all.calcs),
    file = ".../inconcept_Uvalues.csv",
    row.names = FALSE
)