Contains the R models used to analyze the number of review comments before and after the introduction of Travis CI. Looks at whether a boolean variable `IsAfterTravisIntroduction` can be used to predict the number of review comments under a pull request. 

In [10]:
filename <- 'generated/num_of_review_comments.csv'

NumOfReviewCommentsData <- read.csv(file=filename, header=TRUE, sep=",")

summary(NumOfReviewCommentsData)

 EffectiveComments  PseudoEffectiveBuilds TotalReviewComments
 Min.   :  0.0000   Min.   : 0.00000      Min.   :  0.000    
 1st Qu.:  0.0000   1st Qu.: 0.00000      1st Qu.:  0.000    
 Median :  0.0000   Median : 0.00000      Median :  0.000    
 Mean   :  0.2918   Mean   : 0.07549      Mean   :  1.296    
 3rd Qu.:  0.0000   3rd Qu.: 0.00000      3rd Qu.:  0.000    
 Max.   :259.0000   Max.   :59.00000      Max.   :420.000    
                                                             
 TotalReviewCommentThreads ShareReviewComments GeneralComments   
 Min.   :  0.000           Min.   :  0.00      Min.   :   0.000  
 1st Qu.:  0.000           1st Qu.:  0.00      1st Qu.:   0.000  
 Median :  0.000           Median :  0.00      Median :   1.000  
 Mean   :  1.104           Mean   : 10.47      Mean   :   2.752  
 3rd Qu.:  0.000           3rd Qu.:  0.00      3rd Qu.:   3.000  
 Max.   :404.000           Max.   :100.00      Max.   :1035.000  
                                          

In [11]:
library(lmerTest)
library(MuMIn)
library(VIF)
library(sqldf)

vif.mer <- function (fit) {
    ## adapted from rms::vif
    
    v <- vcov(fit)
    nam <- names(fixef(fit))

    ## exclude intercepts
    ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)"))
    if (ns > 0) {
        v <- v[-(1:ns), -(1:ns), drop = FALSE]
        nam <- nam[-(1:ns)]
    }
    
    d <- diag(v)^0.5
    v <- diag(solve(v/(d %o% d)))
    names(v) <- nam
    v
}

In [14]:
hasReviewComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where TotalReviewCommentThreads > 0")

hasReviewCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where TotalReviewCommentThreads > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where TotalReviewCommentThreads > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

hasReviewComments <- hasReviewComments[hasReviewComments$TotalReviewCommentThreads < quantile(hasReviewComments$TotalReviewCommentThreads, 0.95), ]
hasReviewCommentsAndFromOutsider <- hasReviewCommentsAndFromOutsider[hasReviewCommentsAndFromOutsider$TotalReviewCommentThreads < quantile(hasReviewCommentsAndFromOutsider$TotalReviewCommentThreads, 0.95), ]

# Clear unused factors
hasReviewCommentsAndFromOutsider$ProjectName <- factor(hasReviewCommentsAndFromOutsider$ProjectName)
hasReviewCommentsAndFromOutsider$ProjectLanguage <- factor(hasReviewCommentsAndFromOutsider$ProjectLanguage)
hasReviewCommentsAndFromOutsider$IsMerged <- factor(hasReviewCommentsAndFromOutsider$IsMerged)

hasGeneralComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where GeneralComments > 0")

hasGeneralCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where GeneralComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where GeneralComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasGeneralCommentsAndFromOutsider$ProjectName <- factor(hasGeneralCommentsAndFromOutsider$ProjectName)
hasGeneralCommentsAndFromOutsider$ProjectLanguage <- factor(hasGeneralCommentsAndFromOutsider$ProjectLanguage)
hasGeneralCommentsAndFromOutsider$IsMerged <- factor(hasGeneralCommentsAndFromOutsider$IsMerged)

summary(hasGeneralCommentsAndFromOutsider)

hasEffectiveComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments > 0")

print(nrow(hasEffectiveComments))

hasEffectiveCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where EffectiveComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasEffectiveCommentsAndFromOutsider$ProjectName <- factor(hasEffectiveCommentsAndFromOutsider$ProjectName)
hasEffectiveCommentsAndFromOutsider$ProjectLanguage <- factor(hasEffectiveCommentsAndFromOutsider$ProjectLanguage)
hasEffectiveCommentsAndFromOutsider$IsMerged <- factor(hasEffectiveCommentsAndFromOutsider$IsMerged)

summary(hasEffectiveCommentsAndFromOutsider)

 EffectiveComments  PseudoEffectiveBuilds TotalReviewComments
 Min.   :  0.0000   Min.   : 0.00000      Min.   :  0.000    
 1st Qu.:  0.0000   1st Qu.: 0.00000      1st Qu.:  0.000    
 Median :  0.0000   Median : 0.00000      Median :  0.000    
 Mean   :  0.3992   Mean   : 0.07476      Mean   :  1.582    
 3rd Qu.:  0.0000   3rd Qu.: 0.00000      3rd Qu.:  0.000    
 Max.   :259.0000   Max.   :59.00000      Max.   :420.000    
                                                             
 TotalReviewCommentThreads ShareReviewComments GeneralComments   
 Min.   :  0.000           Min.   : 0.00       Min.   :   1.000  
 1st Qu.:  0.000           1st Qu.: 0.00       1st Qu.:   1.000  
 Median :  0.000           Median : 0.00       Median :   2.000  
 Mean   :  1.356           Mean   :10.07       Mean   :   4.223  
 3rd Qu.:  0.000           3rd Qu.: 0.00       3rd Qu.:   5.000  
 Max.   :351.000           Max.   :99.72       Max.   :1035.000  
                                          

[1] 21451


 EffectiveComments PseudoEffectiveBuilds TotalReviewComments
 Min.   :  1.000   Min.   : 0.0000       Min.   :  1.0      
 1st Qu.:  1.000   1st Qu.: 0.0000       1st Qu.:  2.0      
 Median :  2.000   Median : 0.0000       Median :  5.0      
 Mean   :  4.623   Mean   : 0.2468       Mean   : 10.1      
 3rd Qu.:  4.000   3rd Qu.: 0.0000       3rd Qu.: 11.0      
 Max.   :259.000   Max.   :39.0000       Max.   :420.0      
                                                            
 TotalReviewCommentThreads ShareReviewComments GeneralComments  
 Min.   :  1.000           Min.   :  1.19      Min.   :  0.000  
 1st Qu.:  2.000           1st Qu.: 33.33      1st Qu.:  2.000  
 Median :  4.000           Median : 54.55      Median :  4.000  
 Mean   :  8.685           Mean   : 55.93      Mean   :  7.776  
 3rd Qu.:  9.000           3rd Qu.: 77.78      3rd Qu.:  9.000  
 Max.   :343.000           Max.   :100.00      Max.   :279.000  
                                                         

In [15]:
determineImpact <- function(model, trainingData, target, interested_in) {        
    numVals = 0
    
    meanVals <- data.frame()

    if(class(trainingData[[interested_in]]) != 'factor'){
        print("Cannot do this for a non-factor")
        return
    } else {
        numVals = length(levels(trainingData[[interested_in]]))

        meanVals <- data.frame(matrix(ncol=1, nrow=numVals))
        
        colnames(meanVals)<- c(interested_in)
        
        meanVals[[interested_in]] = levels(trainingData[[interested_in]])
    }
    
    
    for(columnName in all.vars(as.formula(model))) {
        if(columnName == target) {
            next
        }
        if(columnName == interested_in) {
            next
        }
        
        if(class(trainingData[[columnName]]) == 'factor') {
            meanVals[[columnName]] = rep(c(levels(trainingData[[columnName]])[1]), numVals)
        } else {
            meanVals[[columnName]] = rep(c(median(trainingData[[columnName]])), numVals)
        }
    }
    
    print(meanVals)
    
    
    print("Order is: ")
    print(meanVals[[interested_in]])
    
    print("Predict is:")
    print(predict(model, meanVals, type="response"))
}

In [6]:
zerosdf <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments = 0")
print(nrow(NumOfReviewCommentsData))
print(nrow(zerosdf))

sampledZerosdf <- zerosdf[sample(nrow(zerosdf), 15000),]

print(nrow(sampledZerosdf))

combined <- rbind(sampledZerosdf, hasEffectiveComments)

print(nrow(combined))

hurdleModel = glm((EffectiveComments > 0) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            #log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            log(TotalReviewComments + 1) +
            log(GeneralComments + 1) +
            log(CommitsAfterCreate + 1) +
            log(AdditionsAfterCreate + 1) +
            log(DeletionsAfterCreate + 1) +
            IsAfter ,
          data= combined,
                 family = 'binomial')
summary(hurdleModel)
r.squaredGLMM(hurdleModel)
#vif(hurdleModel)
anova(hurdleModel)

[1] 326768
[1] 305317
[1] 15000
[1] 36451



Call:
glm(formula = (EffectiveComments > 0) ~ log(Additions + 1) + 
    log(Deletions + 1) + IsMerged + log(Commits + 1) + log(Assignees + 
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds + 
    1) + NewContributor + log(PrOpenedDaysAfterProjectStart + 
    1) + FromOutsider + log(TotalReviewComments + 1) + log(GeneralComments + 
    1) + log(CommitsAfterCreate + 1) + log(AdditionsAfterCreate + 
    1) + log(DeletionsAfterCreate + 1) + IsAfter, family = "binomial", 
    data = combined)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-5.6520  -0.2013   0.0209   0.2521   2.5660  

Coefficients:
                                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)                            -3.85358    0.22795 -16.905  < 2e-16 ***
log(Additions + 1)                      0.01096    0.01579   0.694  0.48752    
log(Deletions + 1)                     -0.20555    0.01702 -12.080  < 2e-16 ***
IsMergedTrue                    

Unnamed: 0,Df,Deviance,Resid. Df,Resid. Dev
,,,36450,49384.1
log(Additions + 1),1.0,3659.706,36449,45724.39
log(Deletions + 1),1.0,245.3343,36448,45479.06
IsMerged,1.0,23.10833,36447,45455.95
log(Commits + 1),1.0,8497.836,36446,36958.11
log(Assignees + 1),1.0,166.6621,36445,36791.45
log(NumOfUniqueUsers + 1),1.0,7880.76,36444,28910.69
log(PRsOpened + 1),1.0,1.225748,36443,28909.47
log(TotalBuilds + 1),1.0,285.0947,36442,28624.37
NewContributor,1.0,65.50751,36441,28558.86


# Effective Comments

In [18]:
library(car)

sort(table(hasEffectiveComments$ProjectLanguage))

modelNumberEffectiveComments = lm(log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            #log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            log(TotalReviewComments) +
            log(GeneralComments + 1) +
            log(CommitsAfterCreate + 1) +
            log(AdditionsAfterCreate + 1) +
            log(DeletionsAfterCreate + 1) +
            IsAfter ,
          data= hasEffectiveComments)
summary(modelNumberEffectiveComments)
#r.squaredGLMM(modelNumberEffectiveComments)
vif(modelNumberEffectiveComments)
anova(modelNumberEffectiveComments)

print("---------------------------")

modelNumberEffectiveCommentsOutsider = lmer(log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            log(TotalReviewComments) +
            log(GeneralComments + 1) +
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasEffectiveCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberEffectiveCommentsOutsider)
r.squaredGLMM(modelNumberEffectiveCommentsOutsider)
vif.mer(modelNumberEffectiveCommentsOutsider)
anova(modelNumberEffectiveCommentsOutsider)


        Rust        OCaml       Perl 6       Puppet        CMake           F# 
           3            5            7            8            9           10 
         TeX      Fortran     Assembly     CartoCSS       Racket      Haskell 
          10           12           13           17           19           20 
CoffeeScript        Swift   TypeScript   Emacs Lisp        Scala        Julia 
          21           27           37           40           43           45 
      Erlang  Objective-J                   Clojure           Go            R 
          46           49           68           69           80          124 
        Perl      PLpgSQL          Nim        Shell          Lua         HTML 
         159          173          194          203          243          318 
          DM          CSS     Makefile            C           C#  Objective-C 
         353          368          506          592          769          940 
        Java         Ruby   JavaScript          PHP


Call:
lm(formula = log(EffectiveComments) ~ log(Additions + 1) + log(Deletions + 
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(NumOfUniqueUsers + 
    1) + log(PRsOpened + 1) + log(TotalBuilds + 1) + NewContributor + 
    log(PrOpenedDaysAfterProjectStart + 1) + FromOutsider + log(TotalReviewComments) + 
    log(GeneralComments + 1) + log(CommitsAfterCreate + 1) + 
    log(AdditionsAfterCreate + 1) + log(DeletionsAfterCreate + 
    1) + IsAfter, data = hasEffectiveComments)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.83915 -0.36261  0.06643  0.40924  2.07001 

Coefficients:
                                        Estimate Std. Error t value Pr(>|t|)
(Intercept)                             0.425709   0.045975   9.260  < 2e-16
log(Additions + 1)                      0.037716   0.002674  14.105  < 2e-16
log(Deletions + 1)                     -0.083544   0.002722 -30.688  < 2e-16
IsMergedTrue                            0.003693   0.010568   0.349 0.7267

Unnamed: 0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
log(Additions + 1),1,1395.706207,1395.7062073,3722.603088,0.0
log(Deletions + 1),1,206.67446,206.67446,551.238491,2.23601e-120
IsMerged,1,77.758775,77.7587751,207.396839,8.402395e-47
log(Commits + 1),1,1817.795268,1817.7952677,4848.391618,0.0
log(Assignees + 1),1,3.411114,3.411114,9.098063,0.002561802
log(NumOfUniqueUsers + 1),1,598.140105,598.1401051,1595.348785,0.0
log(PRsOpened + 1),1,42.89325,42.8932501,114.404124,1.2407399999999999e-26
log(TotalBuilds + 1),1,42.480764,42.4807642,113.303948,2.154619e-26
NewContributor,1,15.612601,15.6126008,41.641655,1.11977e-10
log(PrOpenedDaysAfterProjectStart + 1),1,59.913813,59.9138134,159.801071,1.688267e-36


[1] "---------------------------"



Correlation matrix not shown by default, as p = 14 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(EffectiveComments) ~ log(Additions + 1) + log(Deletions +  
    1) + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + log(PrOpenedDaysAfterProjectStart +  
    1) + log(TotalReviewComments) + log(GeneralComments + 1) +  
    IsAfter + (1 | ProjectLanguage) + (1 | ProjectName)
   Data: hasEffectiveCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 24170.7  24297.9 -12068.3  24136.7    13133 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5959 -0.5708  0.1148  0.6480  3.1762 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.013011 0.11407 
 ProjectLanguage (Intercept) 0.002019 0.04494 
 Residual                    0.360104 0.60009 
Number of obs: 13150, groups:

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),19.39568,19.39568,1,12653.869,53.86127,2.282619e-13
log(Deletions + 1),137.4897,137.4897,1,13116.884,381.8051,0.0
log(Commits + 1),41.97047,41.97047,1,12653.045,116.5508,0.0
log(Assignees + 1),2.09376,2.09376,1,7378.245,5.814314,0.01592053
log(ChangedFiles + 1),0.00576343,0.00576343,1,13003.942,0.01600489,0.8993299
log(NumOfUniqueUsers + 1),8.853597,8.853597,1,11604.907,24.5862,7.206408e-07
log(PRsOpened + 1),2.568296,2.568296,1,11466.306,7.132088,0.007582331
log(TotalBuilds + 1),5.068526,5.068526,1,12105.311,14.07516,0.0001764683
NewContributor,0.06267494,0.06267494,1,13123.4,0.1740466,0.6765471
log(PrOpenedDaysAfterProjectStart + 1),15.35405,15.35405,1,3492.739,42.63778,7.540235e-11


# Review comments

In [19]:
modelNumberReviewComments = lmer(log(TotalReviewCommentThreads) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewComments,
          REML=FALSE)
summary(modelNumberReviewComments)
r.squaredGLMM(modelNumberReviewComments)
vif.mer(modelNumberReviewComments)
anova(modelNumberReviewComments)

determineImpact(modelNumberReviewComments, hasReviewComments, 'TotalReviewCommentThreads', 'IsAfter')

print("---------------------------")

modelNumberReviewCommentsOutsider = lmer(log(TotalReviewCommentThreads) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberReviewCommentsOutsider)
r.squaredGLMM(modelNumberReviewCommentsOutsider)
vif.mer(modelNumberReviewCommentsOutsider)
anova(modelNumberReviewCommentsOutsider)

determineImpact(modelNumberReviewCommentsOutsider, hasReviewCommentsAndFromOutsider, 'TotalReviewCommentThreads', 'IsAfter')


Correlation matrix not shown by default, as p = 14 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(TotalReviewCommentThreads) ~ log(Additions + 1) + log(Deletions +  
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + log(PrOpenedDaysAfterProjectStart +  
    1) + FromOutsider + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewComments

     AIC      BIC   logLik deviance df.resid 
131463.6 131615.9 -65714.8 131429.6    57525 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.3119 -0.7636 -0.0563  0.7185  3.4132 

Random effects:
 Groups          Name        Variance  Std.Dev.
 ProjectName     (Intercept) 0.0365343 0.1911  
 ProjectLanguage (Intercept) 0.0006504 0.0255  
 Residual                    0.5676426 0.7534  
Number of obs: 57542, groups:  ProjectName, 390; ProjectLanguag

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),682.870662,682.870662,1,57432.0,1202.99409,0.0
log(Deletions + 1),2.551695,2.551695,1,57524.45,4.49525,0.03399341
IsMerged,9.995319,9.995319,1,56786.48,17.60847,2.717885e-05
log(Commits + 1),559.351049,559.351049,1,57131.44,985.39305,0.0
log(Assignees + 1),34.146119,34.146119,1,50977.72,60.15426,8.881784e-15
log(ChangedFiles + 1),61.07963,61.07963,1,57257.02,107.60227,0.0
log(NumOfUniqueUsers + 1),3055.981616,3055.981616,1,56253.61,5383.63709,0.0
log(PRsOpened + 1),8.652601,8.652601,1,53960.16,15.24304,9.463909e-05
log(TotalBuilds + 1),10.629246,10.629246,1,57056.49,18.72524,1.512269e-05
NewContributor,14.363065,14.363065,1,57503.48,25.30301,4.914155e-07


  IsAfter Additions Deletions IsMerged Commits Assignees ChangedFiles
1   False        42         7    False       2         0            3
2    True        42         7    False       2         0            3
  NumOfUniqueUsers PRsOpened TotalBuilds NewContributor
1                3        27           1          False
2                3        27           1          False
  PrOpenedDaysAfterProjectStart FromOutsider ProjectLanguage
1                          1332        False                
2                          1332        False                
                     ProjectName
1 activemerchant/active_merchant
2 activemerchant/active_merchant
[1] "Order is: "
[1] "False" "True" 
[1] "Predict is:"
        1         2 
1.0740252 0.9548506 
[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(TotalReviewCommentThreads) ~ log(Additions + 1) + log(Deletions +  
    1) + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + log(PrOpenedDaysAfterProjectStart +  
    1) + IsAfter + (1 | ProjectLanguage) + (1 | ProjectName)
   Data: hasReviewCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 74686.9  74812.8 -37328.5  74656.9    32425 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.2027 -0.7708 -0.0549  0.7217  3.3180 

Random effects:
 Groups          Name        Variance  Std.Dev. 
 ProjectName     (Intercept) 3.694e-02 1.922e-01
 ProjectLanguage (Intercept) 4.930e-14 2.220e-07
 Residual                    5.747e-01 7.581e-01
Number of obs: 32440, groups:  ProjectName, 389; ProjectLanguage, 42

Fixed 

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),325.2276824,325.2276824,1,32290.61,565.9208275,0.0
log(Deletions + 1),0.969383,0.969383,1,32426.68,1.6867999,0.1940326
log(Commits + 1),371.2953416,371.2953416,1,32159.61,646.0820476,0.0
log(Assignees + 1),6.2450702,6.2450702,1,26592.3,10.8668958,0.0009802671
log(ChangedFiles + 1),50.2884733,50.2884733,1,32399.45,87.5057567,0.0
log(NumOfUniqueUsers + 1),1672.5233548,1672.5233548,1,31425.61,2910.3174555,0.0
log(PRsOpened + 1),0.1844149,0.1844149,1,31153.12,0.3208959,0.5710738
log(TotalBuilds + 1),11.5942062,11.5942062,1,32034.0,20.1747979,7.09247e-06
NewContributor,4.0630138,4.0630138,1,32438.87,7.069952,0.007842583
log(PrOpenedDaysAfterProjectStart + 1),17.3814051,17.3814051,1,15088.72,30.2449628,3.869684e-08


  IsAfter Additions Deletions Commits Assignees ChangedFiles NumOfUniqueUsers
1   False        25         3       2         0            2                3
2    True        25         3       2         0            2                3
  PRsOpened TotalBuilds NewContributor PrOpenedDaysAfterProjectStart
1         6           1          False                          1320
2         6           1          False                          1320
  ProjectLanguage                    ProjectName
1                 activemerchant/active_merchant
2                 activemerchant/active_merchant
[1] "Order is: "
[1] "False" "True" 
[1] "Predict is:"
       1        2 
1.155534 1.014951 


# General comments

In [9]:
modelNumberGeneralComments = lmer(log(GeneralComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralComments, 
          REML=FALSE)
summary(modelNumberGeneralComments)
r.squaredGLMM(modelNumberGeneralComments)
vif.mer(modelNumberGeneralComments)
anova(modelNumberGeneralComments)

print("---------------------------")

modelNumberGeneralCommentsOutsider = lmer(log(GeneralComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberGeneralCommentsOutsider)
r.squaredGLMM(modelNumberGeneralCommentsOutsider)
vif.mer(modelNumberGeneralCommentsOutsider)
anova(modelNumberGeneralCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(GeneralComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: hasGeneralComments

      AIC       BIC    logLik  deviance  df.resid 
 299093.9  299258.7 -149530.9  299061.9    219421 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.7692 -0.4813 -0.1368  0.4291  8.8417 

Random effects:
 Groups          Name        Variance  Std.Dev.
 ProjectName     (Intercept) 1.525e-02 0.123498
 ProjectLanguage (Intercept) 4.941e-05 0.007029
 Residual                    2.274e-01 0.476911
Number of obs: 219437, groups:  ProjectName, 390; ProjectLanguage, 42

Fixed effects:
                       

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),17.13863,17.13863,1,219348.4,75.35329,0.0
log(Deletions + 1),3.228755,3.228755,1,219321.3,14.19584,0.0001647771
IsMerged,87.16592,87.16592,1,218364.9,383.2417,0.0
log(Commits + 1),321.0871,321.0871,1,219270.1,1411.721,0.0
log(Assignees + 1),97.70057,97.70057,1,212674.3,429.5593,0.0
log(ChangedFiles + 1),0.06599152,0.06599152,1,219260.2,0.2901444,0.5901286
log(NumOfUniqueUsers + 1),93039.19,93039.19,1,219341.8,409064.7,0.0
log(PRsOpened + 1),10.70356,10.70356,1,215510.9,47.06024,6.902257e-12
log(TotalBuilds + 1),14.95248,14.95248,1,219436.0,65.74144,4.440892e-16
NewContributor,2.556085,2.556085,1,219343.4,11.23832,0.0008013925


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(GeneralComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasGeneralCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
189385.7 189523.9 -94678.8 189357.7   143054 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-6.0620 -0.4843 -0.1673  0.4290  9.1559 

Random effects:
 Groups          Name        Variance  Std.Dev.
 ProjectName     (Intercept) 0.0139455 0.118091
 ProjectLanguage (Intercept) 0.0000276 0.005254
 Residual                    0.2182535 0.467176
Number of obs: 143068, groups:  ProjectName, 390; ProjectLanguage, 42

Fixed effects:
                            Estimate Std. Er

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),27.974823,27.974823,1,143006.0,128.17583,0.0
log(Deletions + 1),0.728491,0.728491,1,142988.4,3.33782,0.06770632
log(Commits + 1),217.314919,217.314919,1,142855.6,995.69959,0.0
log(Assignees + 1),52.995222,52.995222,1,136534.7,242.815,0.0
log(ChangedFiles + 1),2.701968,2.701968,1,142925.9,12.37995,0.0004341028
log(NumOfUniqueUsers + 1),66814.442229,66814.442229,1,142979.7,306132.28514,0.0
log(PRsOpened + 1),8.88165,8.88165,1,140120.1,40.69419,1.785678e-10
log(TotalBuilds + 1),4.893844,4.893844,1,143066.7,22.42275,2.189763e-06
NewContributor,2.220805,2.220805,1,143010.5,10.17535,0.001423617
IsAfter,12.069133,12.069133,1,143066.1,55.29869,1.041389e-13


In [184]:
modelShareReviewComments = lmer(log(ShareReviewComments + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) +             
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= NumOfReviewCommentsData, 
          REML=FALSE)
summary(modelShareReviewComments)
r.squaredGLMM(modelShareReviewComments)
vif.mer(modelShareReviewComments)
anova(modelShareReviewComments)

print("---------------------------")

modelShareReviewCommentsOutsider = lmer(log(ShareReviewComments + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewCommentsAndFromOutsider,
          REML=FALSE)
summary(modelShareReviewCommentsOutsider)
r.squaredGLMM(modelShareReviewCommentsOutsider)
vif.mer(modelShareReviewCommentsOutsider)
anova(modelShareReviewCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ShareReviewComments + 1) ~ log(Additions + 1) + log(Deletions +  
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: NumOfReviewCommentsData

      AIC       BIC    logLik  deviance  df.resid 
 732044.3  732208.8 -366006.1  732012.3    215434 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5813 -0.6857 -0.1697  0.3587  3.8576 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.114964 0.33906 
 ProjectLanguage (Intercept) 0.001188 0.03447 
 Residual                    1.744308 1.32072 
Number of obs: 215450, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
              

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),6241.479,6241.479,1,215434.3,3578.198,0.0
log(Deletions + 1),78.19538,78.19538,1,215396.5,44.82889,2.15552e-11
IsMerged,4139.119,4139.119,1,215424.0,2372.929,0.0
log(Commits + 1),3395.634,3395.634,1,215444.4,1946.694,0.0
log(Assignees + 1),276.5964,276.5964,1,208981.5,158.5709,0.0
log(ChangedFiles + 1),1633.009,1633.009,1,215433.8,936.193,0.0
log(NumOfUniqueUsers + 1),73953.63,73953.63,1,214974.7,42397.12,0.0
log(PRsOpened + 1),0.7916288,0.7916288,1,214351.4,0.4538355,0.5005199
log(TotalBuilds + 1),460.7265,460.7265,1,215430.3,264.1314,0.0
NewContributor,222.7642,222.7642,1,215445.3,127.7092,0.0


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ShareReviewComments + 1) ~ log(Additions + 1) + log(Deletions +  
    1) + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 40615.7  40728.5 -20293.9  40587.7    23237 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5711 -0.5925  0.1944  0.7373  2.8794 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.02936  0.1714  
 ProjectLanguage (Intercept) 0.00000  0.0000  
 Residual                    0.33031  0.5747  
Number of obs: 23251, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std. 

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),29.28928565,29.28928565,1,23247.96,88.6719385,0.0
log(Deletions + 1),0.06540722,0.06540722,1,23226.35,0.1980173,0.6563302
log(Commits + 1),11.80634012,11.80634012,1,23227.2,35.7431409,2.284105e-09
log(Assignees + 1),5.08837456,5.08837456,1,21696.19,15.4048153,8.703461e-05
log(ChangedFiles + 1),5.05766343,5.05766343,1,23249.35,15.3118388,9.140293e-05
log(NumOfUniqueUsers + 1),981.67127259,981.67127259,1,22769.94,2971.9637321,0.0
log(PRsOpened + 1),12.78507641,12.78507641,1,23088.92,38.7062191,5.010996e-10
log(TotalBuilds + 1),0.16004122,0.16004122,1,23223.39,0.4845173,0.4863902
NewContributor,0.60270384,0.60270384,1,23235.19,1.8246576,0.176773
IsAfter,4.46807717,4.46807717,1,23235.81,13.5268941,0.0002356992


modelBuildDiscussionComments = lmer(log(GeneralCommentsDiscussingBuild + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralComments, 
          REML=FALSE)
summary(modelBuildDiscussionComments)
r.squaredGLMM(modelBuildDiscussionComments)
vif.mer(modelBuildDiscussionComments)
anova(modelBuildDiscussionComments)

In [185]:
library(lme4)

print(sprintf("R2c of review comments is %f", r.squaredGLMM(modelNumberReviewComments)[['R2c']]))
print(sprintf("R2c of share review comments is %f", r.squaredGLMM(modelShareReviewComments)[['R2c']]))
print(sprintf("R2c of general comments is %f", r.squaredGLMM(modelNumberGeneralComments)[['R2c']]))



[1] "R2c of review comments is 0.342312"
[1] "R2c of share review comments is 0.308417"
[1] "R2c of general comments is 0.717337"


# Using effective comments to make predictions

In [104]:
projectNames <- sqldf("select distinct(ProjectName) as ProjectName
                    from hasEffectiveComments")$ProjectName


result <- data.frame(Name = '', Actual = 0, Predicted = 0)[0,]

for(project in projectNames) {
    projectData <- sqldf(sprintf("select * from hasEffectiveComments
                            where ProjectName = '%s'", project))
    

    
    medianRow <- data.frame(Additions = median(projectData$Additions),
                           Deletions = median(projectData$Deletions),
                           IsMerged = 'True',
                           Commits = median(projectData$Commits),
                           Assignees = median(projectData$Assignees),
                           #ChangedFiles = median(projectData$ChangedFiles),
                           NumOfUniqueUsers = median(projectData$NumOfUniqueUsers),
                           PRsOpened = median(projectData$PRsOpened),
                           TotalBuilds = median(projectData$TotalBuilds),
                           NewContributor = 'False',
                           FromOutsider = 'False',
                           ReviewComments = median(projectData$ReviewComments),
                           GeneralComments = median(projectData$GeneralComments),
                           IsAfter = 'True')[1,]
    
       
    
    predicted <- predict(modelNumberEffectiveComments, medianRow)
    
    result<-rbind(result, data.frame(Name = project, Actual = median(projectData$EffectiveComments), Predicted = predicted))
}

In [105]:
summary(result)

head(result)

sorted <- result[order(abs(result$Actual - result$Predicted)),]

head(sorted)

tail(sorted)


                          Name         Actual        Predicted      
 Leaflet/Leaflet            :  1   Min.   :1.000   Min.   :-0.1797  
 MarkUsProject/Markus       :  1   1st Qu.:1.000   1st Qu.: 0.5275  
 MightyPirates/OpenComputers:  1   Median :2.000   Median : 0.7478  
 NancyFx/Nancy              :  1   Mean   :1.938   Mean   : 0.7269  
 OP2/PyOP2                  :  1   3rd Qu.:2.000   3rd Qu.: 0.9221  
 OPM/opm-core               :  1   Max.   :6.000   Max.   : 1.5877  
 (Other)                    :236                                    

Unnamed: 0,Name,Actual,Predicted
1,Leaflet/Leaflet,1,0.2811769
11,MarkUsProject/Markus,2,0.9227097
12,MightyPirates/OpenComputers,2,0.9074532
13,NancyFx/Nancy,2,0.8333288
14,OP2/PyOP2,3,1.2019746
15,OPM/opm-core,2,0.9335412


Unnamed: 0,Name,Actual,Predicted
1207,melpa/melpa,1,1.0181165
123,ImageEngine/cortex,1,1.0362102
1113,mozilla/shumway,1,0.9574169
1209,madrobby/zepto,1,0.9222987
146,Zarel/Pokemon-Showdown,1,0.9146175
1102,geotools/geotools,1,0.9131079


Unnamed: 0,Name,Actual,Predicted
1178,rapid7/metasploit-framework,4.0,1.0794994
1216,h5bp/html5-boilerplate,4.5,0.8851724
1212,statsmodels/statsmodels,5.0,1.3349541
1223,activemerchant/active_merchant,5.0,1.0116392
153,bem/bem-components,5.0,0.8692513
138,MariaDB/server,6.0,0.8977185


# Finding individual projects which are impacted by Travis

In [186]:
#summary(NumOfReviewCommentsData)

projectNames <- sqldf("select ProjectName, count(ProjectName) 
                    from NumOfReviewCommentsData
                    group by ProjectName 
                    having Count(ProjectName) > 200")$ProjectName

for(project in projectNames) {
    formula <- log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider + 
            IsAfter
    
    data <- sqldf(sprintf("select * from NumOfReviewCommentsData
                    where ProjectName ='%s'
                        and ReviewComments > 1", project))
    
    data$IsMerged <- factor(data$IsMerged)
    data$NewContributor <- factor(data$NewContributor)
    data$FromOutsider <- factor(data$FromOutsider)
    data$IsAfter <- factor(data$IsAfter)
    
    #Skip running an analysis if there is not enough data
    if(nlevels(data$IsMerged) > 1 &
      nlevels(data$NewContributor) > 1 &
      nlevels(data$FromOutsider) > 1 &
      nlevels(data$IsAfter) > 1)
    {
        model_per_project <- lm(formula, data=data)
        
        pOfIsAfter <- tail(summary(model_per_project)$coefficients)[,4]   ##P-values 
        rsq <- summary(model_per_project)$r.squared   
        
        #print(summary(model_per_project))
        
        print(sprintf("Name: %s pval: %f rsq: %f", project, pOfIsAfter, rsq))
    }
    
}

[1] "Name: AFNetworking/AFNetworking pval: 0.136075 rsq: 0.947911"
[2] "Name: AFNetworking/AFNetworking pval: 0.147730 rsq: 0.947911"
[3] "Name: AFNetworking/AFNetworking pval: 0.719298 rsq: 0.947911"
[4] "Name: AFNetworking/AFNetworking pval: 0.094202 rsq: 0.947911"
[5] "Name: AFNetworking/AFNetworking pval: 0.158648 rsq: 0.947911"
[6] "Name: AFNetworking/AFNetworking pval: 0.160747 rsq: 0.947911"
[1] "Name: AnalyticalGraphicsInc/cesium pval: 0.000001 rsq: 0.508137"
[2] "Name: AnalyticalGraphicsInc/cesium pval: 0.058781 rsq: 0.508137"
[3] "Name: AnalyticalGraphicsInc/cesium pval: 0.298592 rsq: 0.508137"
[4] "Name: AnalyticalGraphicsInc/cesium pval: 0.463259 rsq: 0.508137"
[5] "Name: AnalyticalGraphicsInc/cesium pval: 0.000007 rsq: 0.508137"
[6] "Name: AnalyticalGraphicsInc/cesium pval: 0.525550 rsq: 0.508137"
[1] "Name: Automattic/_s pval: 0.240493 rsq: 0.582836"
[2] "Name: Automattic/_s pval: 0.542499 rsq: 0.582836"
[3] "Name: Automattic/_s pval: 0.004741 rsq: 0.582836"
[4] "Name: Au