Contains the R models used to analyze the number of review comments before and after the introduction of Travis CI. Looks at whether a boolean variable `IsAfterTravisIntroduction` can be used to predict the number of review comments under a pull request. 

In [60]:
filename <- 'generated/num_of_review_comments.csv'

NumOfReviewCommentsData <- read.csv(file=filename, header=TRUE, sep=",")

summary(NumOfReviewCommentsData)

 EffectiveComments  ReviewComments    ShareReviewComments GeneralComments   
 Min.   :  0.0000   Min.   :  0.000   Min.   :  0.00      Min.   :   0.000  
 1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.:  0.00      1st Qu.:   0.000  
 Median :  0.0000   Median :  0.000   Median :  0.00      Median :   1.000  
 Mean   :  0.2612   Mean   :  1.318   Mean   : 10.73      Mean   :   2.757  
 3rd Qu.:  0.0000   3rd Qu.:  0.000   3rd Qu.:  0.00      3rd Qu.:   3.000  
 Max.   :486.0000   Max.   :494.000   Max.   :100.00      Max.   :1035.000  
                                                                            
 GeneralCommentsDiscussingBuild   Additions         Deletions        
 Min.   :0                      Min.   :      0   Min.   :      0.0  
 1st Qu.:0                      1st Qu.:      1   1st Qu.:      0.0  
 Median :0                      Median :      8   Median :      2.0  
 Mean   :0                      Mean   :   1425   Mean   :    449.1  
 3rd Qu.:0                      3r

In [61]:
library(lmerTest)
library(MuMIn)
library(VIF)
library(sqldf)

vif.mer <- function (fit) {
    ## adapted from rms::vif
    
    v <- vcov(fit)
    nam <- names(fixef(fit))

    ## exclude intercepts
    ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)"))
    if (ns > 0) {
        v <- v[-(1:ns), -(1:ns), drop = FALSE]
        nam <- nam[-(1:ns)]
    }
    
    d <- diag(v)^0.5
    v <- diag(solve(v/(d %o% d)))
    names(v) <- nam
    v
}

In [82]:
hasReviewComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where ReviewComments > 0")

hasReviewCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where ReviewComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where ReviewComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasReviewCommentsAndFromOutsider$ProjectName <- factor(hasReviewCommentsAndFromOutsider$ProjectName)
hasReviewCommentsAndFromOutsider$ProjectLanguage <- factor(hasReviewCommentsAndFromOutsider$ProjectLanguage)
hasReviewCommentsAndFromOutsider$IsMerged <- factor(hasReviewCommentsAndFromOutsider$IsMerged)

hasGeneralComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where GeneralComments > 0")

hasGeneralCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where GeneralComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where GeneralComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasGeneralCommentsAndFromOutsider$ProjectName <- factor(hasGeneralCommentsAndFromOutsider$ProjectName)
hasGeneralCommentsAndFromOutsider$ProjectLanguage <- factor(hasGeneralCommentsAndFromOutsider$ProjectLanguage)
hasGeneralCommentsAndFromOutsider$IsMerged <- factor(hasGeneralCommentsAndFromOutsider$IsMerged)

summary(hasGeneralCommentsAndFromOutsider)

hasEffectiveComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments > 0")

print(nrow(hasEffectiveComments))

hasEffectiveCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where EffectiveComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasEffectiveCommentsAndFromOutsider$ProjectName <- factor(hasEffectiveCommentsAndFromOutsider$ProjectName)
hasEffectiveCommentsAndFromOutsider$ProjectLanguage <- factor(hasEffectiveCommentsAndFromOutsider$ProjectLanguage)
hasEffectiveCommentsAndFromOutsider$IsMerged <- factor(hasEffectiveCommentsAndFromOutsider$IsMerged)

summary(hasEffectiveCommentsAndFromOutsider)

 EffectiveComments  ReviewComments    ShareReviewComments GeneralComments   
 Min.   :  0.0000   Min.   :  0.000   Min.   : 0.00       Min.   :   1.000  
 1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.: 0.00       1st Qu.:   1.000  
 Median :  0.0000   Median :  0.000   Median : 0.00       Median :   2.000  
 Mean   :  0.3629   Mean   :  1.611   Mean   :10.23       Mean   :   4.317  
 3rd Qu.:  0.0000   3rd Qu.:  0.000   3rd Qu.: 0.00       3rd Qu.:   5.000  
 Max.   :486.0000   Max.   :494.000   Max.   :99.72       Max.   :1035.000  
                                                                            
 GeneralCommentsDiscussingBuild   Additions         Deletions      
 Min.   :0                      Min.   :      0   Min.   :      0  
 1st Qu.:0                      1st Qu.:      0   1st Qu.:      0  
 Median :0                      Median :      5   Median :      1  
 Mean   :0                      Mean   :    809   Mean   :    337  
 3rd Qu.:0                      3rd Qu.:    

[1] 15258


 EffectiveComments ReviewComments   ShareReviewComments GeneralComments  
 Min.   :  1.000   Min.   :  1.00   Min.   :  1.19      Min.   :  0.000  
 1st Qu.:  1.000   1st Qu.:  2.00   1st Qu.: 33.33      1st Qu.:  2.000  
 Median :  2.000   Median :  5.00   Median : 56.00      Median :  4.000  
 Mean   :  4.534   Mean   : 10.66   Mean   : 56.46      Mean   :  8.055  
 3rd Qu.:  4.000   3rd Qu.: 11.00   3rd Qu.: 78.26      3rd Qu.:  9.000  
 Max.   :486.000   Max.   :494.00   Max.   :100.00      Max.   :279.000  
                                                                         
 GeneralCommentsDiscussingBuild   Additions           Deletions       
 Min.   :0                      Min.   :      0.0   Min.   :     0.0  
 1st Qu.:0                      1st Qu.:      4.0   1st Qu.:     0.0  
 Median :0                      Median :     44.0   Median :     4.0  
 Mean   :0                      Mean   :    562.7   Mean   :   195.9  
 3rd Qu.:0                      3rd Qu.:    204.8   3

In [98]:
zerosdf <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments = 0")
print(nrow(NumOfReviewCommentsData))
print(nrow(zerosdf))

sampledZerosdf <- zerosdf[sample(nrow(zerosdf), 15000),]

print(nrow(sampledZerosdf))

combined <- rbind(sampledZerosdf, hasEffectiveComments)

print(nrow(combined))

hurdleModel = glm((EffectiveComments > 0) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            #log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            log(ReviewComments + 1) +
            log(GeneralComments + 1) +
            IsAfter ,
          data= combined,
                 family = 'binomial')
summary(hurdleModel)
r.squaredGLMM(hurdleModel)
vif(hurdleModel)
anova(hurdleModel)

[1] 256450
[1] 241192
[1] 15000
[1] 30258



Call:
glm(formula = (EffectiveComments > 0) ~ log(Additions + 1) + 
    log(Deletions + 1) + IsMerged + log(Commits + 1) + log(Assignees + 
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds + 
    1) + NewContributor + log(PrOpenedDaysAfterProjectStart + 
    1) + FromOutsider + log(ReviewComments + 1) + log(GeneralComments + 
    1) + IsAfter, family = "binomial", data = combined)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-4.8587  -0.3255   0.0046   0.3223   2.2904  

Coefficients:
                                        Estimate Std. Error z value Pr(>|z|)
(Intercept)                            -5.306251   0.218157 -24.323  < 2e-16
log(Additions + 1)                     -0.001668   0.014193  -0.117 0.906470
log(Deletions + 1)                     -0.196244   0.015398 -12.745  < 2e-16
IsMergedTrue                            0.699702   0.053077  13.183  < 2e-16
log(Commits + 1)                        0.937740   0.034720  27.008  < 2e-16

Unnamed: 0,Df,Deviance,Resid. Df,Resid. Dev
,,,30257,41944.29
log(Additions + 1),1.0,3383.807,30256,38560.49
log(Deletions + 1),1.0,199.6883,30255,38360.8
IsMerged,1.0,23.29093,30254,38337.51
log(Commits + 1),1.0,6560.706,30253,31776.8
log(Assignees + 1),1.0,147.3895,30252,31629.41
log(NumOfUniqueUsers + 1),1.0,6459.397,30251,25170.02
log(PRsOpened + 1),1.0,0.09666847,30250,25169.92
log(TotalBuilds + 1),1.0,419.8145,30249,24750.1
NewContributor,1.0,46.51387,30248,24703.59


# Effective Comments

In [103]:
library(car)

sort(table(hasEffectiveComments$ProjectLanguage))

modelNumberEffectiveComments = lm(log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            #log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            log(ReviewComments) +
            log(GeneralComments + 1) +
            IsAfter ,
          data= hasEffectiveComments)
summary(modelNumberEffectiveComments)
r.squaredGLMM(modelNumberEffectiveComments)
vif(modelNumberEffectiveComments)
anova(modelNumberEffectiveComments)

print("---------------------------")

modelNumberEffectiveCommentsOutsider = lmer(log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            log(ReviewComments) +
            log(GeneralComments + 1) +
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasEffectiveCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberEffectiveCommentsOutsider)
r.squaredGLMM(modelNumberEffectiveCommentsOutsider)
vif.mer(modelNumberEffectiveCommentsOutsider)
anova(modelNumberEffectiveCommentsOutsider)


       OCaml         Rust       Perl 6           F#     CartoCSS     Assembly 
           2            2            4           11           12           15 
CoffeeScript        Swift      Haskell   TypeScript   Emacs Lisp  Objective-J 
          15           15           20           22           34           35 
       Scala       Erlang      Clojure        Julia                        Go 
          39           42           46           51           55           65 
       Shell            R      PLpgSQL         HTML          Lua          CSS 
          99          118          137          157          163          180 
         Nim            C           DM     Makefile           C#  Objective-C 
         187          297          313          405          631          808 
        Ruby         Java   JavaScript          PHP          C++       Python 
        1474         1597         1728         1749         1828         2902 


Call:
lm(formula = log(EffectiveComments) ~ log(Additions + 1) + log(Deletions + 
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(NumOfUniqueUsers + 
    1) + log(PRsOpened + 1) + log(TotalBuilds + 1) + NewContributor + 
    FromOutsider + log(ReviewComments) + log(GeneralComments + 
    1) + IsAfter, data = hasEffectiveComments)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.69775 -0.38598  0.06347  0.41835  2.43741 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)                0.069831   0.034796   2.007 0.044784 *  
log(Additions + 1)         0.032407   0.003254   9.958  < 2e-16 ***
log(Deletions + 1)        -0.072951   0.003258 -22.391  < 2e-16 ***
IsMergedTrue               0.003135   0.013382   0.234 0.814752    
log(Commits + 1)           0.088844   0.008422  10.549  < 2e-16 ***
log(Assignees + 1)        -0.079865   0.021488  -3.717 0.000203 ***
log(NumOfUniqueUsers + 1) -0.142374   0.020032  -7.107 1

Unnamed: 0,Df,Sum Sq,Mean Sq,F value,Pr(>F)
log(Additions + 1),1,1007.9161566,1007.9161566,2474.435,0.0
log(Deletions + 1),1,148.5431484,148.5431484,364.6735,2.341149e-80
IsMerged,1,58.3448651,58.3448651,143.2367,7.323639000000001e-33
log(Commits + 1),1,1321.9415056,1321.9415056,3245.367,0.0
log(Assignees + 1),1,1.5152497,1.5152497,3.719939,0.05378488
log(NumOfUniqueUsers + 1),1,446.3028091,446.3028091,1095.674,4.32382e-232
log(PRsOpened + 1),1,30.6947687,30.6947687,75.35567,4.324122e-18
log(TotalBuilds + 1),1,12.2292757,12.2292757,30.02288,4.337501e-08
NewContributor,1,11.9928022,11.9928022,29.44234,5.848477e-08
FromOutsider,1,0.3123393,0.3123393,0.7667932,0.3812247


[1] "---------------------------"



Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(EffectiveComments) ~ log(Additions + 1) + log(Deletions +  
    1) + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + log(ReviewComments) + log(GeneralComments +  
    1) + IsAfter + (1 | ProjectLanguage) + (1 | ProjectName)
   Data: hasEffectiveCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 17333.1  17447.0  -8650.6  17301.1     9062 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.3240 -0.5857  0.0998  0.6451  3.8082 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.010241 0.10120 
 ProjectLanguage (Intercept) 0.003201 0.05657 
 Residual                    0.387519 0.62251 
Number of obs: 9078, groups:  ProjectName, 234; ProjectLanguage, 36

Fixed effect

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),7.6435199,7.6435199,1,8679.138,19.7242453,9.056668e-06
log(Deletions + 1),69.6020264,69.6020264,1,9059.964,179.6093246,0.0
log(Commits + 1),34.9575048,34.9575048,1,8763.533,90.2084919,0.0
log(Assignees + 1),1.3753288,1.3753288,1,5229.466,3.5490616,0.05963457
log(ChangedFiles + 1),0.1850652,0.1850652,1,8860.412,0.4775641,0.4895459
log(NumOfUniqueUsers + 1),5.7579815,5.7579815,1,7997.347,14.8585784,0.0001167874
log(PRsOpened + 1),2.4841812,2.4841812,1,7804.157,6.4104758,0.01136435
log(TotalBuilds + 1),6.5782618,6.5782618,1,8196.362,16.9753269,3.824135e-05
NewContributor,0.2028557,0.2028557,1,9045.624,0.5234729,0.4693827
log(ReviewComments),2429.5333109,2429.5333109,1,8843.367,6269.455932,0.0


# Review comments

In [100]:
modelNumberReviewComments = lmer(log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewComments,
          REML=FALSE)
summary(modelNumberReviewComments)
r.squaredGLMM(modelNumberReviewComments)
vif.mer(modelNumberReviewComments)
anova(modelNumberReviewComments)

print("---------------------------")

modelNumberReviewCommentsOutsider = lmer(log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberReviewCommentsOutsider)
r.squaredGLMM(modelNumberReviewCommentsOutsider)
vif.mer(modelNumberReviewCommentsOutsider)
anova(modelNumberReviewCommentsOutsider)


Correlation matrix not shown by default, as p = 14 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ReviewComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + log(PrOpenedDaysAfterProjectStart +  
    1) + FromOutsider + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewComments

     AIC      BIC   logLik deviance df.resid 
121796.9 121946.4 -60881.5 121762.9    48476 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.0340 -0.6816 -0.0343  0.6562  5.2623 

Random effects:
 Groups          Name        Variance  Std.Dev. 
 ProjectName     (Intercept) 4.631e-02 2.152e-01
 ProjectLanguage (Intercept) 1.723e-15 4.151e-08
 Residual                    7.137e-01 8.448e-01
Number of obs: 48493, groups:  ProjectName, 246; ProjectLanguage, 36



Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),776.496796,776.496796,1,48463.43,1088.0245532,0.0
log(Deletions + 1),0.4133617,0.4133617,1,48482.85,0.579201,0.4466294
IsMerged,51.4748587,51.4748587,1,48276.45,72.1263892,0.0
log(Commits + 1),1199.6049123,1199.6049123,1,48283.79,1680.8821432,0.0
log(Assignees + 1),35.9914336,35.9914336,1,43607.28,50.4310689,1.25322e-12
log(ChangedFiles + 1),29.6812082,29.6812082,1,48481.54,41.5892035,1.1367e-10
log(NumOfUniqueUsers + 1),5209.2536948,5209.2536948,1,47355.47,7299.1877786,0.0
log(PRsOpened + 1),16.2965568,16.2965568,1,47001.2,22.8346776,1.770837e-06
log(TotalBuilds + 1),10.566329,10.566329,1,48223.43,14.8055027,0.0001193406
NewContributor,27.5703347,27.5703347,1,48475.86,38.6314551,5.16029e-10


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ReviewComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 66261.4  66375.8 -33116.7  66233.4    26096 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.7966 -0.6852 -0.0387  0.6588  5.3170 

Random effects:
 Groups          Name        Variance  Std.Dev.
 ProjectName     (Intercept) 0.0456266 0.21360 
 ProjectLanguage (Intercept) 0.0003739 0.01934 
 Residual                    0.7288711 0.85374 
Number of obs: 26110, groups:  ProjectName, 246; ProjectLanguage, 36

Fixed effects:
                            Estimate Std. Error

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),311.2255,311.2255,1,26056.7,426.9966,0.0
log(Deletions + 1),0.003826523,0.003826523,1,26098.21,0.005249931,0.9422393
log(Commits + 1),763.9101,763.9101,1,25848.14,1048.073,0.0
log(Assignees + 1),8.086117,8.086117,1,22584.76,11.09403,0.0008674497
log(ChangedFiles + 1),17.83218,17.83218,1,26075.08,24.46547,7.612374e-07
log(NumOfUniqueUsers + 1),2769.209,2769.209,1,25147.87,3799.313,0.0
log(PRsOpened + 1),0.5139991,0.5139991,1,25481.16,0.7051988,0.4010502
log(TotalBuilds + 1),10.94535,10.94535,1,25896.61,15.01686,0.0001068175
NewContributor,10.14166,10.14166,1,26101.37,13.9142,0.0001917527
IsAfter,1.491451,1.491451,1,25991.03,2.046248,0.1525938


# General comments

In [183]:
modelNumberGeneralComments = lmer(log(GeneralComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralComments, 
          REML=FALSE)
summary(modelNumberGeneralComments)
r.squaredGLMM(modelNumberGeneralComments)
vif.mer(modelNumberGeneralComments)
anova(modelNumberGeneralComments)

print("---------------------------")

modelNumberGeneralCommentsOutsider = lmer(log(GeneralComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberGeneralCommentsOutsider)
r.squaredGLMM(modelNumberGeneralCommentsOutsider)
vif.mer(modelNumberGeneralCommentsOutsider)
anova(modelNumberGeneralCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(GeneralComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: hasGeneralComments

      AIC       BIC    logLik  deviance  df.resid 
 200694.8  200853.1 -100331.4  200662.8    145674 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.6522 -0.4763 -0.1126  0.4258  8.7113 

Random effects:
 Groups          Name        Variance  Std.Dev.
 ProjectName     (Intercept) 0.0186657 0.1366  
 ProjectLanguage (Intercept) 0.0002075 0.0144  
 Residual                    0.2310259 0.4807  
Number of obs: 145690, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                       

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),8.333609,8.333609,1,145663.7,36.07218,1.905904e-09
log(Deletions + 1),3.734047,3.734047,1,145624.8,16.16289,5.815091e-05
IsMerged,63.25648,63.25648,1,145636.6,273.8068,0.0
log(Commits + 1),258.9738,258.9738,1,145681.0,1120.973,0.0
log(Assignees + 1),70.31775,70.31775,1,142627.6,304.3717,0.0
log(ChangedFiles + 1),0.05017641,0.05017641,1,145660.8,0.2171895,0.6411905
log(NumOfUniqueUsers + 1),60823.3,60823.3,1,145603.9,263274.8,0.0
log(PRsOpened + 1),2.973846,2.973846,1,145206.8,12.87235,0.0003335825
log(TotalBuilds + 1),6.88276,6.88276,1,145675.9,29.79215,4.817162e-08
NewContributor,1.861738,1.861738,1,145685.8,8.058566,0.004529527


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(GeneralComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasGeneralCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
118436.0 118567.4 -59204.0 118408.0    87946 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.8009 -0.4806 -0.1496  0.4307  8.9863 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.015741 0.12546 
 ProjectLanguage (Intercept) 0.000105 0.01025 
 Residual                    0.223532 0.47279 
Number of obs: 87960, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std. Error  

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),14.9604,14.9604,1,87952.63,66.92749,4.440892e-16
log(Deletions + 1),0.400663,0.400663,1,87931.03,1.792423,0.1806347
log(Commits + 1),165.8048,165.8048,1,87959.71,741.7514,0.0
log(Assignees + 1),37.06881,37.06881,1,84720.7,165.8326,0.0
log(ChangedFiles + 1),0.8447748,0.8447748,1,87944.35,3.77922,0.05189603
log(NumOfUniqueUsers + 1),41413.04,41413.04,1,87835.93,185267.1,0.0
log(PRsOpened + 1),3.215397,3.215397,1,87439.47,14.38454,0.0001491213
log(TotalBuilds + 1),1.601488,1.601488,1,87953.77,7.164482,0.007437494
NewContributor,1.508562,1.508562,1,87937.51,6.748767,0.009382812
IsAfter,14.60691,14.60691,1,87953.14,65.34608,6.661338e-16


In [184]:
modelShareReviewComments = lmer(log(ShareReviewComments + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) +             
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= NumOfReviewCommentsData, 
          REML=FALSE)
summary(modelShareReviewComments)
r.squaredGLMM(modelShareReviewComments)
vif.mer(modelShareReviewComments)
anova(modelShareReviewComments)

print("---------------------------")

modelShareReviewCommentsOutsider = lmer(log(ShareReviewComments + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewCommentsAndFromOutsider,
          REML=FALSE)
summary(modelShareReviewCommentsOutsider)
r.squaredGLMM(modelShareReviewCommentsOutsider)
vif.mer(modelShareReviewCommentsOutsider)
anova(modelShareReviewCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ShareReviewComments + 1) ~ log(Additions + 1) + log(Deletions +  
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: NumOfReviewCommentsData

      AIC       BIC    logLik  deviance  df.resid 
 732044.3  732208.8 -366006.1  732012.3    215434 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5813 -0.6857 -0.1697  0.3587  3.8576 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.114964 0.33906 
 ProjectLanguage (Intercept) 0.001188 0.03447 
 Residual                    1.744308 1.32072 
Number of obs: 215450, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
              

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),6241.479,6241.479,1,215434.3,3578.198,0.0
log(Deletions + 1),78.19538,78.19538,1,215396.5,44.82889,2.15552e-11
IsMerged,4139.119,4139.119,1,215424.0,2372.929,0.0
log(Commits + 1),3395.634,3395.634,1,215444.4,1946.694,0.0
log(Assignees + 1),276.5964,276.5964,1,208981.5,158.5709,0.0
log(ChangedFiles + 1),1633.009,1633.009,1,215433.8,936.193,0.0
log(NumOfUniqueUsers + 1),73953.63,73953.63,1,214974.7,42397.12,0.0
log(PRsOpened + 1),0.7916288,0.7916288,1,214351.4,0.4538355,0.5005199
log(TotalBuilds + 1),460.7265,460.7265,1,215430.3,264.1314,0.0
NewContributor,222.7642,222.7642,1,215445.3,127.7092,0.0


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ShareReviewComments + 1) ~ log(Additions + 1) + log(Deletions +  
    1) + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 40615.7  40728.5 -20293.9  40587.7    23237 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5711 -0.5925  0.1944  0.7373  2.8794 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.02936  0.1714  
 ProjectLanguage (Intercept) 0.00000  0.0000  
 Residual                    0.33031  0.5747  
Number of obs: 23251, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std. 

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),29.28928565,29.28928565,1,23247.96,88.6719385,0.0
log(Deletions + 1),0.06540722,0.06540722,1,23226.35,0.1980173,0.6563302
log(Commits + 1),11.80634012,11.80634012,1,23227.2,35.7431409,2.284105e-09
log(Assignees + 1),5.08837456,5.08837456,1,21696.19,15.4048153,8.703461e-05
log(ChangedFiles + 1),5.05766343,5.05766343,1,23249.35,15.3118388,9.140293e-05
log(NumOfUniqueUsers + 1),981.67127259,981.67127259,1,22769.94,2971.9637321,0.0
log(PRsOpened + 1),12.78507641,12.78507641,1,23088.92,38.7062191,5.010996e-10
log(TotalBuilds + 1),0.16004122,0.16004122,1,23223.39,0.4845173,0.4863902
NewContributor,0.60270384,0.60270384,1,23235.19,1.8246576,0.176773
IsAfter,4.46807717,4.46807717,1,23235.81,13.5268941,0.0002356992


modelBuildDiscussionComments = lmer(log(GeneralCommentsDiscussingBuild + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralComments, 
          REML=FALSE)
summary(modelBuildDiscussionComments)
r.squaredGLMM(modelBuildDiscussionComments)
vif.mer(modelBuildDiscussionComments)
anova(modelBuildDiscussionComments)

In [185]:
library(lme4)

print(sprintf("R2c of review comments is %f", r.squaredGLMM(modelNumberReviewComments)[['R2c']]))
print(sprintf("R2c of share review comments is %f", r.squaredGLMM(modelShareReviewComments)[['R2c']]))
print(sprintf("R2c of general comments is %f", r.squaredGLMM(modelNumberGeneralComments)[['R2c']]))



[1] "R2c of review comments is 0.342312"
[1] "R2c of share review comments is 0.308417"
[1] "R2c of general comments is 0.717337"


# Using effective comments to make predictions

In [104]:
projectNames <- sqldf("select distinct(ProjectName) as ProjectName
                    from hasEffectiveComments")$ProjectName


result <- data.frame(Name = '', Actual = 0, Predicted = 0)[0,]

for(project in projectNames) {
    projectData <- sqldf(sprintf("select * from hasEffectiveComments
                            where ProjectName = '%s'", project))
    

    
    medianRow <- data.frame(Additions = median(projectData$Additions),
                           Deletions = median(projectData$Deletions),
                           IsMerged = 'True',
                           Commits = median(projectData$Commits),
                           Assignees = median(projectData$Assignees),
                           #ChangedFiles = median(projectData$ChangedFiles),
                           NumOfUniqueUsers = median(projectData$NumOfUniqueUsers),
                           PRsOpened = median(projectData$PRsOpened),
                           TotalBuilds = median(projectData$TotalBuilds),
                           NewContributor = 'False',
                           FromOutsider = 'False',
                           ReviewComments = median(projectData$ReviewComments),
                           GeneralComments = median(projectData$GeneralComments),
                           IsAfter = 'True')[1,]
    
       
    
    predicted <- predict(modelNumberEffectiveComments, medianRow)
    
    result<-rbind(result, data.frame(Name = project, Actual = median(projectData$EffectiveComments), Predicted = predicted))
}

In [105]:
summary(result)

head(result)

sorted <- result[order(abs(result$Actual - result$Predicted)),]

head(sorted)

tail(sorted)


                          Name         Actual        Predicted      
 Leaflet/Leaflet            :  1   Min.   :1.000   Min.   :-0.1797  
 MarkUsProject/Markus       :  1   1st Qu.:1.000   1st Qu.: 0.5275  
 MightyPirates/OpenComputers:  1   Median :2.000   Median : 0.7478  
 NancyFx/Nancy              :  1   Mean   :1.938   Mean   : 0.7269  
 OP2/PyOP2                  :  1   3rd Qu.:2.000   3rd Qu.: 0.9221  
 OPM/opm-core               :  1   Max.   :6.000   Max.   : 1.5877  
 (Other)                    :236                                    

Unnamed: 0,Name,Actual,Predicted
1,Leaflet/Leaflet,1,0.2811769
11,MarkUsProject/Markus,2,0.9227097
12,MightyPirates/OpenComputers,2,0.9074532
13,NancyFx/Nancy,2,0.8333288
14,OP2/PyOP2,3,1.2019746
15,OPM/opm-core,2,0.9335412


Unnamed: 0,Name,Actual,Predicted
1207,melpa/melpa,1,1.0181165
123,ImageEngine/cortex,1,1.0362102
1113,mozilla/shumway,1,0.9574169
1209,madrobby/zepto,1,0.9222987
146,Zarel/Pokemon-Showdown,1,0.9146175
1102,geotools/geotools,1,0.9131079


Unnamed: 0,Name,Actual,Predicted
1178,rapid7/metasploit-framework,4.0,1.0794994
1216,h5bp/html5-boilerplate,4.5,0.8851724
1212,statsmodels/statsmodels,5.0,1.3349541
1223,activemerchant/active_merchant,5.0,1.0116392
153,bem/bem-components,5.0,0.8692513
138,MariaDB/server,6.0,0.8977185


# Finding individual projects which are impacted by Travis

In [186]:
#summary(NumOfReviewCommentsData)

projectNames <- sqldf("select ProjectName, count(ProjectName) 
                    from NumOfReviewCommentsData
                    group by ProjectName 
                    having Count(ProjectName) > 200")$ProjectName

for(project in projectNames) {
    formula <- log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider + 
            IsAfter
    
    data <- sqldf(sprintf("select * from NumOfReviewCommentsData
                    where ProjectName ='%s'
                        and ReviewComments > 1", project))
    
    data$IsMerged <- factor(data$IsMerged)
    data$NewContributor <- factor(data$NewContributor)
    data$FromOutsider <- factor(data$FromOutsider)
    data$IsAfter <- factor(data$IsAfter)
    
    #Skip running an analysis if there is not enough data
    if(nlevels(data$IsMerged) > 1 &
      nlevels(data$NewContributor) > 1 &
      nlevels(data$FromOutsider) > 1 &
      nlevels(data$IsAfter) > 1)
    {
        model_per_project <- lm(formula, data=data)
        
        pOfIsAfter <- tail(summary(model_per_project)$coefficients)[,4]   ##P-values 
        rsq <- summary(model_per_project)$r.squared   
        
        #print(summary(model_per_project))
        
        print(sprintf("Name: %s pval: %f rsq: %f", project, pOfIsAfter, rsq))
    }
    
}

[1] "Name: AFNetworking/AFNetworking pval: 0.136075 rsq: 0.947911"
[2] "Name: AFNetworking/AFNetworking pval: 0.147730 rsq: 0.947911"
[3] "Name: AFNetworking/AFNetworking pval: 0.719298 rsq: 0.947911"
[4] "Name: AFNetworking/AFNetworking pval: 0.094202 rsq: 0.947911"
[5] "Name: AFNetworking/AFNetworking pval: 0.158648 rsq: 0.947911"
[6] "Name: AFNetworking/AFNetworking pval: 0.160747 rsq: 0.947911"
[1] "Name: AnalyticalGraphicsInc/cesium pval: 0.000001 rsq: 0.508137"
[2] "Name: AnalyticalGraphicsInc/cesium pval: 0.058781 rsq: 0.508137"
[3] "Name: AnalyticalGraphicsInc/cesium pval: 0.298592 rsq: 0.508137"
[4] "Name: AnalyticalGraphicsInc/cesium pval: 0.463259 rsq: 0.508137"
[5] "Name: AnalyticalGraphicsInc/cesium pval: 0.000007 rsq: 0.508137"
[6] "Name: AnalyticalGraphicsInc/cesium pval: 0.525550 rsq: 0.508137"
[1] "Name: Automattic/_s pval: 0.240493 rsq: 0.582836"
[2] "Name: Automattic/_s pval: 0.542499 rsq: 0.582836"
[3] "Name: Automattic/_s pval: 0.004741 rsq: 0.582836"
[4] "Name: Au