Contains the R models used to analyze the number of review comments before and after the introduction of Travis CI. Looks at whether a boolean variable `IsAfterTravisIntroduction` can be used to predict the number of review comments under a pull request. 

In [60]:
filename <- 'generated/num_of_review_comments.csv'

NumOfReviewCommentsData <- read.csv(file=filename, header=TRUE, sep=",")

summary(NumOfReviewCommentsData)

 EffectiveComments  ReviewComments    ShareReviewComments GeneralComments   
 Min.   :  0.0000   Min.   :  0.000   Min.   :  0.00      Min.   :   0.000  
 1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.:  0.00      1st Qu.:   0.000  
 Median :  0.0000   Median :  0.000   Median :  0.00      Median :   1.000  
 Mean   :  0.2612   Mean   :  1.318   Mean   : 10.73      Mean   :   2.757  
 3rd Qu.:  0.0000   3rd Qu.:  0.000   3rd Qu.:  0.00      3rd Qu.:   3.000  
 Max.   :486.0000   Max.   :494.000   Max.   :100.00      Max.   :1035.000  
                                                                            
 GeneralCommentsDiscussingBuild   Additions         Deletions        
 Min.   :0                      Min.   :      0   Min.   :      0.0  
 1st Qu.:0                      1st Qu.:      1   1st Qu.:      0.0  
 Median :0                      Median :      8   Median :      2.0  
 Mean   :0                      Mean   :   1425   Mean   :    449.1  
 3rd Qu.:0                      3r

In [61]:
library(lmerTest)
library(MuMIn)
library(VIF)
library(sqldf)

vif.mer <- function (fit) {
    ## adapted from rms::vif
    
    v <- vcov(fit)
    nam <- names(fixef(fit))

    ## exclude intercepts
    ns <- sum(1 * (nam == "Intercept" | nam == "(Intercept)"))
    if (ns > 0) {
        v <- v[-(1:ns), -(1:ns), drop = FALSE]
        nam <- nam[-(1:ns)]
    }
    
    d <- diag(v)^0.5
    v <- diag(solve(v/(d %o% d)))
    names(v) <- nam
    v
}

In [62]:
hasReviewComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where ReviewComments > 0")

hasReviewCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where ReviewComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where ReviewComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasReviewCommentsAndFromOutsider$ProjectName <- factor(hasReviewCommentsAndFromOutsider$ProjectName)
hasReviewCommentsAndFromOutsider$ProjectLanguage <- factor(hasReviewCommentsAndFromOutsider$ProjectLanguage)
hasReviewCommentsAndFromOutsider$IsMerged <- factor(hasReviewCommentsAndFromOutsider$IsMerged)

hasGeneralComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where GeneralComments > 0")

hasGeneralCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where GeneralComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where GeneralComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasGeneralCommentsAndFromOutsider$ProjectName <- factor(hasGeneralCommentsAndFromOutsider$ProjectName)
hasGeneralCommentsAndFromOutsider$ProjectLanguage <- factor(hasGeneralCommentsAndFromOutsider$ProjectLanguage)
hasGeneralCommentsAndFromOutsider$IsMerged <- factor(hasGeneralCommentsAndFromOutsider$IsMerged)

summary(hasGeneralCommentsAndFromOutsider)

hasEffectiveComments <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments > 0")

hasEffectiveCommentsAndFromOutsider <- sqldf("select *
                      from 'NumOfReviewCommentsData' 
                      where EffectiveComments > 0 and FromOutsider = 'True'
                        and ProjectName in (select ProjectName from 'NumOfReviewCommentsData'
                                            where EffectiveComments > 0 and FromOutsider = 'True'
                                            GROUP BY ProjectName
                                            HAVING Count() > 1)")

# Clear unused factors
hasEffectiveCommentsAndFromOutsider$ProjectName <- factor(hasEffectiveCommentsAndFromOutsider$ProjectName)
hasEffectiveCommentsAndFromOutsider$ProjectLanguage <- factor(hasEffectiveCommentsAndFromOutsider$ProjectLanguage)
hasEffectiveCommentsAndFromOutsider$IsMerged <- factor(hasEffectiveCommentsAndFromOutsider$IsMerged)

summary(hasEffectiveCommentsAndFromOutsider)

 EffectiveComments  ReviewComments    ShareReviewComments GeneralComments   
 Min.   :  0.0000   Min.   :  0.000   Min.   : 0.00       Min.   :   1.000  
 1st Qu.:  0.0000   1st Qu.:  0.000   1st Qu.: 0.00       1st Qu.:   1.000  
 Median :  0.0000   Median :  0.000   Median : 0.00       Median :   2.000  
 Mean   :  0.3629   Mean   :  1.611   Mean   :10.23       Mean   :   4.317  
 3rd Qu.:  0.0000   3rd Qu.:  0.000   3rd Qu.: 0.00       3rd Qu.:   5.000  
 Max.   :486.0000   Max.   :494.000   Max.   :99.72       Max.   :1035.000  
                                                                            
 GeneralCommentsDiscussingBuild   Additions         Deletions      
 Min.   :0                      Min.   :      0   Min.   :      0  
 1st Qu.:0                      1st Qu.:      0   1st Qu.:      0  
 Median :0                      Median :      5   Median :      1  
 Mean   :0                      Mean   :    809   Mean   :    337  
 3rd Qu.:0                      3rd Qu.:    

 EffectiveComments ReviewComments   ShareReviewComments GeneralComments  
 Min.   :  1.000   Min.   :  1.00   Min.   :  1.19      Min.   :  0.000  
 1st Qu.:  1.000   1st Qu.:  2.00   1st Qu.: 33.33      1st Qu.:  2.000  
 Median :  2.000   Median :  5.00   Median : 56.00      Median :  4.000  
 Mean   :  4.534   Mean   : 10.66   Mean   : 56.46      Mean   :  8.055  
 3rd Qu.:  4.000   3rd Qu.: 11.00   3rd Qu.: 78.26      3rd Qu.:  9.000  
 Max.   :486.000   Max.   :494.00   Max.   :100.00      Max.   :279.000  
                                                                         
 GeneralCommentsDiscussingBuild   Additions           Deletions       
 Min.   :0                      Min.   :      0.0   Min.   :     0.0  
 1st Qu.:0                      1st Qu.:      4.0   1st Qu.:     0.0  
 Median :0                      Median :     44.0   Median :     4.0  
 Mean   :0                      Mean   :    562.7   Mean   :   195.9  
 3rd Qu.:0                      3rd Qu.:    204.8   3

# Effective Comments

In [63]:
modelNumberEffectiveComments = lm(log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            log(ReviewComments) +
            log(GeneralComments + 1) +
            IsAfter ,
          data= hasEffectiveComments)
summary(modelNumberEffectiveComments)
r.squaredGLMM(modelNumberEffectiveComments)
vif.mer(modelNumberEffectiveComments)
anova(modelNumberEffectiveComments)

print("---------------------------")

modelNumberEffectiveCommentsOutsider = lmer(log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            log(ReviewComments) +
            log(GeneralComments + 1) +
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasEffectiveCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberEffectiveCommentsOutsider)
r.squaredGLMM(modelNumberEffectiveCommentsOutsider)
vif.mer(modelNumberEffectiveCommentsOutsider)
anova(modelNumberEffectiveCommentsOutsider)


Call:
lm(formula = log(EffectiveComments) ~ log(Additions + 1) + log(Deletions + 
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles + 
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds + 
    1) + NewContributor + FromOutsider + log(ReviewComments) + 
    log(GeneralComments + 1) + IsAfter, data = hasEffectiveComments)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.69875 -0.38586  0.06333  0.41835  2.43731 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                0.0700067  0.0348624   2.008 0.044652 *  
log(Additions + 1)         0.0322033  0.0040785   7.896 3.08e-15 ***
log(Deletions + 1)        -0.0730931  0.0036823 -19.850  < 2e-16 ***
IsMergedTrue               0.0031361  0.0133825   0.234 0.814721    
log(Commits + 1)           0.0887199  0.0085546  10.371  < 2e-16 ***
log(Assignees + 1)        -0.0798556  0.0214893  -3.716 0.000203 ***
log(ChangedFiles + 1)    

ERROR: Error in UseMethod("fixef"): no applicable method for 'fixef' applied to an object of class "lm"


# Review comments

In [182]:
modelNumberReviewComments = lmer(log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewComments,
          REML=FALSE)
summary(modelNumberReviewComments)
r.squaredGLMM(modelNumberReviewComments)
vif.mer(modelNumberReviewComments)
anova(modelNumberReviewComments)

print("---------------------------")

modelNumberReviewCommentsOutsider = lmer(log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberReviewCommentsOutsider)
r.squaredGLMM(modelNumberReviewCommentsOutsider)
vif.mer(modelNumberReviewCommentsOutsider)
anova(modelNumberReviewCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ReviewComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: hasReviewComments

     AIC      BIC   logLik deviance df.resid 
109142.7 109281.6 -54555.4 109110.7    43425 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.0034 -0.6825 -0.0343  0.6573  5.2307 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.05035  0.2244  
 ProjectLanguage (Intercept) 0.00000  0.0000  
 Residual                    0.71490  0.8455  
Number of obs: 43441, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),741.139597,741.139597,1,43431.69,1036.706754,0.0
log(Deletions + 1),1.404495,1.404495,1,43427.54,1.964609,0.1610284
IsMerged,50.581748,50.581748,1,43439.49,70.753796,0.0
log(Commits + 1),955.004112,955.004112,1,43367.31,1335.860634,0.0
log(Assignees + 1),58.694967,58.694967,1,39945.72,82.102574,0.0
log(ChangedFiles + 1),35.481294,35.481294,1,43440.04,49.631266,1.882938e-12
log(NumOfUniqueUsers + 1),4822.86211,4822.86211,1,42444.34,6746.223978,0.0
log(PRsOpened + 1),4.704294,4.704294,1,42792.58,6.58037,0.01031432
log(TotalBuilds + 1),5.592043,5.592043,1,43328.5,7.822155,0.005163236
NewContributor,18.517221,18.517221,1,43429.99,25.901905,3.607098e-07


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ReviewComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 58907.4  59020.2 -29439.7  58879.4    23237 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.8294 -0.6836 -0.0317  0.6657  5.3166 

Random effects:
 Groups          Name        Variance  Std.Dev. 
 ProjectName     (Intercept) 5.590e-02 2.364e-01
 ProjectLanguage (Intercept) 1.294e-13 3.597e-07
 Residual                    7.261e-01 8.521e-01
Number of obs: 23251, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std. E

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),277.6058108,277.6058108,1,23237.81,382.3485912,0.0
log(Deletions + 1),1.5339911,1.5339911,1,23235.14,2.1127776,0.1460875
log(Commits + 1),662.4909991,662.4909991,1,23197.06,912.4538838,0.0
log(Assignees + 1),12.1680905,12.1680905,1,21204.88,16.7592035,4.259309e-05
log(ChangedFiles + 1),26.9955919,26.9955919,1,23250.94,37.1812337,1.093414e-09
log(NumOfUniqueUsers + 1),2650.1938201,2650.1938201,1,22535.89,3650.1320734,0.0
log(PRsOpened + 1),5.5062321,5.5062321,1,22985.53,7.5837753,0.005894224
log(TotalBuilds + 1),10.4408553,10.4408553,1,23192.7,14.3802693,0.0001497377
NewContributor,6.3804506,6.3804506,1,23242.32,8.7878431,0.003035523
IsAfter,0.3706992,0.3706992,1,23212.67,0.5105668,0.4749008


# General comments

In [183]:
modelNumberGeneralComments = lmer(log(GeneralComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralComments, 
          REML=FALSE)
summary(modelNumberGeneralComments)
r.squaredGLMM(modelNumberGeneralComments)
vif.mer(modelNumberGeneralComments)
anova(modelNumberGeneralComments)

print("---------------------------")

modelNumberGeneralCommentsOutsider = lmer(log(GeneralComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralCommentsAndFromOutsider,
          REML=FALSE)
summary(modelNumberGeneralCommentsOutsider)
r.squaredGLMM(modelNumberGeneralCommentsOutsider)
vif.mer(modelNumberGeneralCommentsOutsider)
anova(modelNumberGeneralCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(GeneralComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: hasGeneralComments

      AIC       BIC    logLik  deviance  df.resid 
 200694.8  200853.1 -100331.4  200662.8    145674 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.6522 -0.4763 -0.1126  0.4258  8.7113 

Random effects:
 Groups          Name        Variance  Std.Dev.
 ProjectName     (Intercept) 0.0186657 0.1366  
 ProjectLanguage (Intercept) 0.0002075 0.0144  
 Residual                    0.2310259 0.4807  
Number of obs: 145690, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                       

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),8.333609,8.333609,1,145663.7,36.07218,1.905904e-09
log(Deletions + 1),3.734047,3.734047,1,145624.8,16.16289,5.815091e-05
IsMerged,63.25648,63.25648,1,145636.6,273.8068,0.0
log(Commits + 1),258.9738,258.9738,1,145681.0,1120.973,0.0
log(Assignees + 1),70.31775,70.31775,1,142627.6,304.3717,0.0
log(ChangedFiles + 1),0.05017641,0.05017641,1,145660.8,0.2171895,0.6411905
log(NumOfUniqueUsers + 1),60823.3,60823.3,1,145603.9,263274.8,0.0
log(PRsOpened + 1),2.973846,2.973846,1,145206.8,12.87235,0.0003335825
log(TotalBuilds + 1),6.88276,6.88276,1,145675.9,29.79215,4.817162e-08
NewContributor,1.861738,1.861738,1,145685.8,8.058566,0.004529527


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(GeneralComments) ~ log(Additions + 1) + log(Deletions + 1) +  
    log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasGeneralCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
118436.0 118567.4 -59204.0 118408.0    87946 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-5.8009 -0.4806 -0.1496  0.4307  8.9863 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.015741 0.12546 
 ProjectLanguage (Intercept) 0.000105 0.01025 
 Residual                    0.223532 0.47279 
Number of obs: 87960, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std. Error  

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),14.9604,14.9604,1,87952.63,66.92749,4.440892e-16
log(Deletions + 1),0.400663,0.400663,1,87931.03,1.792423,0.1806347
log(Commits + 1),165.8048,165.8048,1,87959.71,741.7514,0.0
log(Assignees + 1),37.06881,37.06881,1,84720.7,165.8326,0.0
log(ChangedFiles + 1),0.8447748,0.8447748,1,87944.35,3.77922,0.05189603
log(NumOfUniqueUsers + 1),41413.04,41413.04,1,87835.93,185267.1,0.0
log(PRsOpened + 1),3.215397,3.215397,1,87439.47,14.38454,0.0001491213
log(TotalBuilds + 1),1.601488,1.601488,1,87953.77,7.164482,0.007437494
NewContributor,1.508562,1.508562,1,87937.51,6.748767,0.009382812
IsAfter,14.60691,14.60691,1,87953.14,65.34608,6.661338e-16


In [184]:
modelShareReviewComments = lmer(log(ShareReviewComments + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) +             
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= NumOfReviewCommentsData, 
          REML=FALSE)
summary(modelShareReviewComments)
r.squaredGLMM(modelShareReviewComments)
vif.mer(modelShareReviewComments)
anova(modelShareReviewComments)

print("---------------------------")

modelShareReviewCommentsOutsider = lmer(log(ShareReviewComments + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            #IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            #FromOutsider + 
            IsAfter +
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasReviewCommentsAndFromOutsider,
          REML=FALSE)
summary(modelShareReviewCommentsOutsider)
r.squaredGLMM(modelShareReviewCommentsOutsider)
vif.mer(modelShareReviewCommentsOutsider)
anova(modelShareReviewCommentsOutsider)


Correlation matrix not shown by default, as p = 13 > 12.
Use print(obj, correlation=TRUE)  or
	 vcov(obj)	 if you need it



Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ShareReviewComments + 1) ~ log(Additions + 1) + log(Deletions +  
    1) + IsMerged + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + FromOutsider + IsAfter + (1 | ProjectLanguage) +  
    (1 | ProjectName)
   Data: NumOfReviewCommentsData

      AIC       BIC    logLik  deviance  df.resid 
 732044.3  732208.8 -366006.1  732012.3    215434 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5813 -0.6857 -0.1697  0.3587  3.8576 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.114964 0.33906 
 ProjectLanguage (Intercept) 0.001188 0.03447 
 Residual                    1.744308 1.32072 
Number of obs: 215450, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
              

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),6241.479,6241.479,1,215434.3,3578.198,0.0
log(Deletions + 1),78.19538,78.19538,1,215396.5,44.82889,2.15552e-11
IsMerged,4139.119,4139.119,1,215424.0,2372.929,0.0
log(Commits + 1),3395.634,3395.634,1,215444.4,1946.694,0.0
log(Assignees + 1),276.5964,276.5964,1,208981.5,158.5709,0.0
log(ChangedFiles + 1),1633.009,1633.009,1,215433.8,936.193,0.0
log(NumOfUniqueUsers + 1),73953.63,73953.63,1,214974.7,42397.12,0.0
log(PRsOpened + 1),0.7916288,0.7916288,1,214351.4,0.4538355,0.5005199
log(TotalBuilds + 1),460.7265,460.7265,1,215430.3,264.1314,0.0
NewContributor,222.7642,222.7642,1,215445.3,127.7092,0.0


[1] "---------------------------"


Linear mixed model fit by maximum likelihood t-tests use Satterthwaite
  approximations to degrees of freedom [lmerMod]
Formula: log(ShareReviewComments + 1) ~ log(Additions + 1) + log(Deletions +  
    1) + log(Commits + 1) + log(Assignees + 1) + log(ChangedFiles +  
    1) + log(NumOfUniqueUsers + 1) + log(PRsOpened + 1) + log(TotalBuilds +  
    1) + NewContributor + IsAfter + (1 | ProjectLanguage) + (1 |  
    ProjectName)
   Data: hasReviewCommentsAndFromOutsider

     AIC      BIC   logLik deviance df.resid 
 40615.7  40728.5 -20293.9  40587.7    23237 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-4.5711 -0.5925  0.1944  0.7373  2.8794 

Random effects:
 Groups          Name        Variance Std.Dev.
 ProjectName     (Intercept) 0.02936  0.1714  
 ProjectLanguage (Intercept) 0.00000  0.0000  
 Residual                    0.33031  0.5747  
Number of obs: 23251, groups:  ProjectName, 180; ProjectLanguage, 33

Fixed effects:
                            Estimate Std. 

Unnamed: 0,Sum Sq,Mean Sq,NumDF,DenDF,F.value,Pr(>F)
log(Additions + 1),29.28928565,29.28928565,1,23247.96,88.6719385,0.0
log(Deletions + 1),0.06540722,0.06540722,1,23226.35,0.1980173,0.6563302
log(Commits + 1),11.80634012,11.80634012,1,23227.2,35.7431409,2.284105e-09
log(Assignees + 1),5.08837456,5.08837456,1,21696.19,15.4048153,8.703461e-05
log(ChangedFiles + 1),5.05766343,5.05766343,1,23249.35,15.3118388,9.140293e-05
log(NumOfUniqueUsers + 1),981.67127259,981.67127259,1,22769.94,2971.9637321,0.0
log(PRsOpened + 1),12.78507641,12.78507641,1,23088.92,38.7062191,5.010996e-10
log(TotalBuilds + 1),0.16004122,0.16004122,1,23223.39,0.4845173,0.4863902
NewContributor,0.60270384,0.60270384,1,23235.19,1.8246576,0.176773
IsAfter,4.46807717,4.46807717,1,23235.81,13.5268941,0.0002356992


modelBuildDiscussionComments = lmer(log(GeneralCommentsDiscussingBuild + 1) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(PrOpenedDaysAfterProjectStart + 1) +
            IsAfter + 
            (1|ProjectLanguage) +
            (1|ProjectName),
          data= hasGeneralComments, 
          REML=FALSE)
summary(modelBuildDiscussionComments)
r.squaredGLMM(modelBuildDiscussionComments)
vif.mer(modelBuildDiscussionComments)
anova(modelBuildDiscussionComments)

In [185]:
library(lme4)

print(sprintf("R2c of review comments is %f", r.squaredGLMM(modelNumberReviewComments)[['R2c']]))
print(sprintf("R2c of share review comments is %f", r.squaredGLMM(modelShareReviewComments)[['R2c']]))
print(sprintf("R2c of general comments is %f", r.squaredGLMM(modelNumberGeneralComments)[['R2c']]))



[1] "R2c of review comments is 0.342312"
[1] "R2c of share review comments is 0.308417"
[1] "R2c of general comments is 0.717337"


# Using effective comments to make predictions

In [66]:
projectNames <- sqldf("select distinct(ProjectName) as ProjectName
                    from hasReviewComments")$ProjectName


result <- data.frame(Name = '', Actual = 0, Predicted = 0)[0,]

for(project in projectNames) {
    projectData <- sqldf(sprintf("select * from hasReviewComments
                            where ProjectName = '%s'", project))
    
    log(EffectiveComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            #log(PrOpenedDaysAfterProjectStart + 1) +
            FromOutsider + 
            log(ReviewComments) +
            log(GeneralComments + 1) +
            IsAfter 
    
    medianRow <- data.frame(Additions = median(projectData$Additions),
                           Deletions = median(projectData$Deletions),
                           IsMerged = 'True',
                           Commits = median(projectData$Commits),
                           Assignees = median(projectData$Assignees),
                           ChangedFiles = median(projectData$ChangedFiles),
                           NumOfUniqueUsers = median(projectData$NumOfUniqueUsers),
                           PRsOpened = median(projectData$PRsOpened),
                           TotalBuilds = median(projectData$TotalBuilds),
                           NewContributor = 'False',
                           FromOutsider = 'False',
                           ReviewComments = median(projectData$ReviewComments),
                           GeneralComments = median(projectData$GeneralComments),
                           IsAfter = 'True')[1,]
    
       
    
    predicted <- predict(modelNumberEffectiveComments, medianRow)
    
    result<-rbind(result, data.frame(Name = project, Actual = median(projectData$EffectiveComments), Predicted = predicted))
}

In [67]:
summary(result)

                          Name         Actual          Predicted      
 Leaflet/Leaflet            :  1   Min.   :0.00000   Min.   :-0.2956  
 MarkUsProject/Markus       :  1   1st Qu.:0.00000   1st Qu.: 0.2944  
 MightyPirates/OpenComputers:  1   Median :0.00000   Median : 0.4199  
 NancyFx/Nancy              :  1   Mean   :0.09553   Mean   : 0.4131  
 OP2/PyOP2                  :  1   3rd Qu.:0.00000   3rd Qu.: 0.5381  
 OPM/opm-core               :  1   Max.   :1.00000   Max.   : 0.9553  
 (Other)                    :240                                      

# Finding individual projects which are impacted by Travis

In [186]:
#summary(NumOfReviewCommentsData)

projectNames <- sqldf("select ProjectName, count(ProjectName) 
                    from NumOfReviewCommentsData
                    group by ProjectName 
                    having Count(ProjectName) > 200")$ProjectName

for(project in projectNames) {
    formula <- log(ReviewComments) ~ 
            log(Additions + 1) +
            log(Deletions + 1) +
            IsMerged +
            log(Commits + 1) +
            log(Assignees + 1) + 
            log(ChangedFiles + 1) + 
            log(NumOfUniqueUsers + 1) +    
            log(PRsOpened + 1) +
            log(TotalBuilds + 1) +
            NewContributor + 
            FromOutsider + 
            IsAfter
    
    data <- sqldf(sprintf("select * from NumOfReviewCommentsData
                    where ProjectName ='%s'
                        and ReviewComments > 1", project))
    
    data$IsMerged <- factor(data$IsMerged)
    data$NewContributor <- factor(data$NewContributor)
    data$FromOutsider <- factor(data$FromOutsider)
    data$IsAfter <- factor(data$IsAfter)
    
    #Skip running an analysis if there is not enough data
    if(nlevels(data$IsMerged) > 1 &
      nlevels(data$NewContributor) > 1 &
      nlevels(data$FromOutsider) > 1 &
      nlevels(data$IsAfter) > 1)
    {
        model_per_project <- lm(formula, data=data)
        
        pOfIsAfter <- tail(summary(model_per_project)$coefficients)[,4]   ##P-values 
        rsq <- summary(model_per_project)$r.squared   
        
        #print(summary(model_per_project))
        
        print(sprintf("Name: %s pval: %f rsq: %f", project, pOfIsAfter, rsq))
    }
    
}

[1] "Name: AFNetworking/AFNetworking pval: 0.136075 rsq: 0.947911"
[2] "Name: AFNetworking/AFNetworking pval: 0.147730 rsq: 0.947911"
[3] "Name: AFNetworking/AFNetworking pval: 0.719298 rsq: 0.947911"
[4] "Name: AFNetworking/AFNetworking pval: 0.094202 rsq: 0.947911"
[5] "Name: AFNetworking/AFNetworking pval: 0.158648 rsq: 0.947911"
[6] "Name: AFNetworking/AFNetworking pval: 0.160747 rsq: 0.947911"
[1] "Name: AnalyticalGraphicsInc/cesium pval: 0.000001 rsq: 0.508137"
[2] "Name: AnalyticalGraphicsInc/cesium pval: 0.058781 rsq: 0.508137"
[3] "Name: AnalyticalGraphicsInc/cesium pval: 0.298592 rsq: 0.508137"
[4] "Name: AnalyticalGraphicsInc/cesium pval: 0.463259 rsq: 0.508137"
[5] "Name: AnalyticalGraphicsInc/cesium pval: 0.000007 rsq: 0.508137"
[6] "Name: AnalyticalGraphicsInc/cesium pval: 0.525550 rsq: 0.508137"
[1] "Name: Automattic/_s pval: 0.240493 rsq: 0.582836"
[2] "Name: Automattic/_s pval: 0.542499 rsq: 0.582836"
[3] "Name: Automattic/_s pval: 0.004741 rsq: 0.582836"
[4] "Name: Au