In [2]:
#libraries
library(tidyverse)

#Neuraltone data
#There are three types of IDs:
#Participant ID - ID used throughout study
#User ID - ID created by SQL server for participant
#Game ID - ID created by SQL for individual game play session
#Here, User ID will be used to match Participant ID to Game IDs. With Game ID data, exact play time can be calculated
#for each participant.

directory="Proseed_Data/"
master_sheet=read.csv("Proseed_Data/Participant Tracking Sheet.csv")
game_logs=read.csv(file="Proseed_Data/4_10_2019_loguser_activity.csv", header=TRUE)
game_userid=read.csv(file="Proseed_Data/4_10_2019_user_gameid.csv", header=TRUE)
tofu_4afc_aoi=read.csv(file="Proseed_Data/TOFU-cmu-AOIs-day3.csv", header=TRUE)
dir_list=dir(path=directory)

"package 'tidyverse' was built under R version 3.5.2"-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.0     v purrr   0.2.5
v tibble  2.0.1     v dplyr   0.7.8
v tidyr   0.8.2     v stringr 1.3.1
v readr   1.3.1     v forcats 0.3.0
"package 'forcats' was built under R version 3.5.2"-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()


In [3]:
MASTER=master_sheet
Mand_speak=c(0)
Class=c(0)
Train=c(0)
MSSD=c(0)
M4AFC=c(0)
Train_time=c(0)
TOFU_4AFC=c(0)
MASTER=cbind(MASTER, Mand_speak, Class, Train, MSSD, M4AFC, Train_time, TOFU_4AFC)

#Mandarin Speakers?
MASTER$Mand_speak[which(MASTER$ID>4000)]=1
#Classroom Instruction?
MASTER$Class[which(MASTER$ID>=2000&MASTER$ID<4000)]=1
#Training Instruction?
MASTER$Train[which(MASTER$ID<3000)]=1
MASTER$drop=master_sheet$drop
MASTER$incomplete=master_sheet$incomplete

Next, accuracy ratings will be collected and calculated from raw E-Prime .txt output files. The format of E-Prime uses for output data is somewhat esoteric. Luckily, several libraries exists specifically for formatting E-Prime data files to be used with R. The library I used is called "rprime".

Finally, we will need to collect and calculate accuracy ratings from the TOFU 4AFC task. This task is completed on each of the 3 days making up the artifical language (TOFU) task. For the purposes of this analysis, only accuracies on the 3rd and final day will be analyzed.

Unfortunately, determining accuracy on the TOFU 4AFC task is not as easy as it was for MSSD and Mand4AFC.

In [4]:
#Mandarin speech same different task
#
#
#
#ADD IN LOOP
#USE TRY CATCH, PRINT MISSING


library("rprime")

MSSD_data_ripper<-function(pID, dir)
    {
    MSSD.file=paste(sep="", dir, pID, "/AXtone-", pID, "-1.txt" )
    MSSD=to_data_frame(FrameList(read_eprime(MSSD.file)))
    #grab accuracies
    accuracies=as.numeric(na.omit(MSSD$Response.ACC))
    MSSD_total=length(accuracies)
    MSSD_acc=sum(accuracies)
    MSSD_acc_per=MSSD_acc/MSSD_total
    MSSD_bundle=c(MSSD_total, MSSD_acc, MSSD_acc_per)
    return(MSSD_bundle)
    }

#Mandarin 4AFC
#
#SAME AS ABOVE
#
M4AFC_data_ripper<-function(pID, dir)
    {
    M4AFC.file=paste(sep="", dir, pID, "/4AFC-", pID, "-1.txt" )
    M4AFC=to_data_frame(FrameList(read_eprime(M4AFC.file)))
    #grab response, replace NA with 0
    response=as.numeric(na.omit(M4AFC$Response.RESP))
    response[is.na(response)]<-0
    #grab correct response, omit NA (which shouldnt be in there to begin with)
    corr_response=as.numeric(na.omit(M4AFC$Response.CRESP))
    #get accuracy by finding matches between resp and correct resp
    M4AFC_acc<-corr_response==response
    #build matrix with zeros then apply accuracy matrix to insert 1's
    response[-M4AFC_acc]<-0
    response[M4AFC_acc]<-1
    #sum 1's to get total correct
    M4AFC_acc=sum(response)
    #divide by length of corr_response (again, should have no NA) to get percent accuracy
    M4AFC_total=length(corr_response)
    M4AFC_acc_per=M4AFC_acc/M4AFC_total
    M4AFC_bundle=c(M4AFC_total, M4AFC_acc, M4AFC_acc_per)
    return(M4AFC_bundle)
    }

#TOFU 4AFC acc ripper

T4AFC_data_ripper<-function (pID, directory)
{
    t4AFC=paste(sep="",directory, pID,"/Day1-", pID, ".txt" )
    t4AFC_file=to_data_frame(FrameList(read_eprime(t4AFC)))
    X=as.numeric(na.omit(t4AFC_file$XTrackedFinal))
    Y=as.numeric(na.omit(t4AFC_file$YTrackedFinal))
    ans_total=length(X)
    ans_holder=vector()
    for (j in 1:ans_total)
        {
            if (X[j]<960) #left side
                {
                if (Y[j]<540)
                    {
                    ans_holder[j]=1 #left top
                    }
                if (Y[j]>540)
                    {
                    ans_holder[j]=4 #left bottom
                    }
                }
            if (X[j]>960) #right side
                {
                if (Y[j]<540)
                    {
                    ans_holder[j]=2 #right top
                    }
                if (Y[j]>540)
                    {
                    ans_holder[j]=3 #right bottom
                    }
                }        
        }
    t4afc_acc_TF=ans_holder==tofu_4afc_aoi$targetAOI
    t4afc_acc=rep(0,ans_total)
    t4afc_acc[t4afc_acc_TF]=1
    t4afc_acc=sum(t4afc_acc)
    t4afc_acc_per=sum(t4afc_acc)/ans_total
    t4afc_bundle=c(ans_total, t4afc_acc, t4afc_acc_per)
    return(t4afc_bundle)
}

"package 'rprime' was built under R version 3.5.3"

Now we will crawl through the directory and apply the above functions to subjects' data file

In [61]:
num_subjs=length(MASTER$ID)

drops=MASTER[which(MASTER$drop==1),]
drops=drops$ID

incompletes=MASTER[which(MASTER$incomplete==1),]
incompletes=incompletes$ID

MSSD_total=vector()
MSSD_acc=vector()
MSSD_acc_per=vector()
        
M4AFC_total=vector()
M4AFC_acc=vector()
M4AFC_acc_per=vector()
        
t4AFC_total=vector()
t4AFC_acc=vector()
t4AFC_acc_per=vector()


num_subjs=length(MASTER$ID)
for (k in 1:num_subjs)
    {
    pID=toString(MASTER$ID[k])   
    #print(pID)
    #for drops, set all values to NA
    if (pID %in% drops)
        {
        MSSD_total[k]=NA
        MSSD_acc[k]=NA
        MSSD_acc_per[k]=NA
        
        M4AFC_total[k]=NA
        M4AFC_acc[k]=NA
        M4AFC_acc_per[k]=NA
        
        t4AFC_total[k]=NA
        t4AFC_acc[k]=NA
        t4AFC_acc_per[k]=NA
        }
    #GRAB MSSD and 4AFC first
    else
        {
        #Mandarin speakers have MSSD & M4AFC data, however they begin 41.. not 40, need to check
        if (as.numeric(pID)>4000)
            {
            pID=toString(as.numeric(pID)+100)
            }
        MSSD_bundle=MSSD_data_ripper(pID, directory)
        MSSD_total[k]=MSSD_bundle[1]
        MSSD_acc[k]=MSSD_bundle[2]
        MSSD_acc_per[k]=MSSD_bundle[3]
        
        M4AFC_bundle=M4AFC_data_ripper(pID, directory)
        M4AFC_total[k]=M4AFC_bundle[1]
        M4AFC_acc[k]=M4AFC_bundle[2]
        M4AFC_acc_per[k]=M4AFC_bundle[3]
        #Change Mandarin speakers back for t4afc
        if (as.numeric(pID)>4000)
            {
            pID=toString(as.numeric(pID)-100)
            }
        #GRAB t4Aafc
        if(pID %in% incompletes)
            {
            t4AFC_total[k]=NA
            t4AFC_total[k]=NA
            t4AFC_total[k]=NA 
            }
        else
            {
            for (q in 1:3)
                {
                day_inc=100
                pID=toString(as.numeric(pID)+day_inc)
                print(pID)
                T4AFC_bundle=T4AFC_data_ripper(pID, directory)
                t4AFC_total[k]=T4AFC_bundle[1]
                t4AFC_acc[k]=T4AFC_bundle[2]
                t4AFC_acc_per[k]=T4AFC_bundle[3]
                }
            }
        }
    }

crawled_data=cbind(MSSD_total, MSSD_acc, MSSD_acc_per, M4AFC_total, M4AFC_acc, M4AFC_acc_per, t4AFC_total, t4AFC_acc, t4AFC_acc_per)
MASTER=cbind(MASTER,crawled_data)

[1] "1101"
[1] "1201"
[1] "1301"
[1] "1102"
[1] "1202"
[1] "1302"
[1] "1106"
[1] "1206"
[1] "1306"
[1] "2103"
[1] "2203"
[1] "2303"
[1] "2104"


ERROR: Error in t4afc_acc[-t4afc_acc_TF] <- 0: only 0's may be mixed with negative subscripts


Now we can begin our first analysis. This analysis will test our first hypothesis that there are no differences between non-native Mandarin speakers who are receiving Classroom Mandarin instruction versus those who are not, with regards to performance on the Mandarin Speech Same Differnt task or the Mandarin tone 4-item Alternative Forced Choice Task.

This is an "accept the null" hypothesis. A Bayes Factor analysis between the classroom and non-classroom groups will fit our needs here.

In [57]:
#ISSUE WITH M4AFC_acc_per length, shorter than Class

library("BayesFactor")

#remove instances where there is no M4AFC_data
#remove instances where there is no tAFC_data

M4AFC_data=MASTER[which(MASTER$t4AFC_acc_per>0),]
#remove Mandarin speakers
M4AFC_data=M4AFC_data[which(M4AFC_data$Mand_speak<1),]
hyp1.fit=lm(M4AFC_acc_per~Class+Pitch_Discrimination, M4AFC_data)
summary(hyp1.fit)
confint(hyp1.fit)
hyp1.regBF=regressionBF(M4AFC_acc_per~Class+Pitch_Discrimination, M4AFC_data)
1/hyp1.regBF


Call:
lm(formula = M4AFC_acc_per ~ Class + Pitch_Discrimination, data = M4AFC_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.18633 -0.12736 -0.09119  0.16470  0.31294 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)           0.51071    0.10716   4.766 0.000762 ***
Class                 0.30180    0.12789   2.360 0.039960 *  
Pitch_Discrimination -0.01005    0.00397  -2.532 0.029751 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1835 on 10 degrees of freedom
Multiple R-squared:  0.4746,	Adjusted R-squared:  0.3695 
F-statistic: 4.516 on 2 and 10 DF,  p-value: 0.04005


Unnamed: 0,2.5 %,97.5 %
(Intercept),0.27193761,0.74948636
Class,0.01685162,0.58675541
Pitch_Discrimination,-0.01889713,-0.00120755


                denominator
numerator           Class Pitch_Discrimination Class + Pitch_Discrimination
  Intercept only 1.271339             1.042852                    0.4353125

In [26]:
get_game_ids <- function(ID_ind, id_lookup)
    {
    part_gameids=game_userid$id[which(id_lookup$User_ID[ID_ind]==game_userid$user_id)]
    return(part_gameids)
    }

get_game_time_data <- function(game_id)
    {
        time_data=game_logs$time_elapsed[which(game_logs$game_id==game_id)]
        time_data=tail(time_data, n=1)
        if(length(time_data)==0) #if gameid does not exist, will get numeric(0). need to make 0
            {
            time_data=0
            }
        return(time_data)
    }

In [37]:
ID_part_user=c("ID", "User_ID")
User_id_lookup=na.omit(MASTER[ID_part_user])

num_users=nrow(User_id_lookup)
part_total_game_time=vector()

for(i in 1:num_users)
    {
    game_ids_4_part=get_game_ids(i, User_id_lookup)
    
    game_time_summer=0
    num_games=length(game_ids_4_part)
    for(j in 1:num_games)
        {
        game_time_data=get_game_time_data(game_ids_4_part[j])
        game_time_summer=game_time_summer+game_time_data
        }
    part_total_game_time[i]=game_time_summer
    }

#convert seconds to minutes, divide by 60
game_time_mins=part_total_game_time/60

User_id_lookup=cbind(User_id_lookup, game_time_mins)

ID,Pitch_Discrimination,User_ID,drop,incomplete,old_ID,Mand_speak,Class,Train,MSSD,...,MSSD_total,MSSD_acc,MSSD_acc_per,M4AFC_total,M4AFC_acc,M4AFC_acc_per,t4AFC_total,t4AFC_acc,t4AFC_acc_per,merge_2_master
1001,2.1,123.0,,,,0,0,1,0,...,196.0,186.0,0.9489796,200.0,71.0,0.355,48.0,20.0,0.4166667,69.25953
1002,2.85,120.0,,,,0,0,1,0,...,196.0,155.0,0.7908163,200.0,159.0,0.795,48.0,22.0,0.4583333,69.25953
1006,7.2,111.0,,,,0,0,1,0,...,196.0,189.0,0.9642857,200.0,52.0,0.26,48.0,30.0,0.625,69.25953
1021,2.24,133.0,,1.0,,0,0,1,0,...,196.0,195.0,0.994898,200.0,149.0,0.745,,,,69.25953
1028,3.6,135.0,,1.0,,0,0,1,0,...,196.0,183.0,0.9336735,200.0,63.0,0.315,,,,69.25953
1029,39.0,,1.0,,,0,0,1,0,...,,,,,,,,,,
1030,15.6,128.0,,1.0,,0,0,1,0,...,196.0,142.0,0.7244898,200.0,58.0,0.29,,,,69.25953
1031,1.95,130.0,,1.0,,0,0,1,0,...,196.0,188.0,0.9591837,200.0,61.0,0.305,,,,69.25953
1034,18.0,129.0,,1.0,,0,0,1,0,...,196.0,154.0,0.7857143,200.0,38.0,0.19,,,,69.25953
2001,,,1.0,,,0,1,1,0,...,,,,,,,,,,
