# A Comparison of Multivariate Models for Count Data

In [1]:
import matplotlib
%matplotlib nbagg
from matplotlib import pyplot

In [2]:
# Load (or reload) the rpy2 IPython extension; the %R / %%R magics used in the
# cells below are provided by it.
%reload_ext rpy2.ipython

In [3]:
%%R
library(MGLM)
library(bivpois)

In [4]:
from statiskit import (linalg,
                       core,
                       glm)

In [5]:
%%R
# Load the `ex4.ita91` dataset and export it to CSV so that the Python side of
# the notebook can read the same observations.
data('ex4.ita91')
ita91 <- ex4.ita91
# Spell out FALSE: `F` is an ordinary variable in R that can be reassigned,
# whereas FALSE is a reserved constant.
write.csv(ita91, file = 'ita91.csv', row.names = FALSE)

In [6]:
# Read the comma-separated file written by the R cell above (first row is the
# header) through statiskit's CSV reader.
data = core.read_csv('ita91.csv', sep=',', header=True)

In [7]:
# Keep only the components at positions 0 and 1 for the multivariate fit
# (presumably the two count columns g1 and g2 -- TODO confirm column order).
mdata = data.extract(0, 1)

In [8]:
# Candidate splitting distributions ('MN' and 'DM') to choose between.
splitting_estimator = core.splitting_selection('MN', 'DM')

# Candidate univariate discrete estimators for the `sum` component.
sum_candidates = [
    core.poisson_estimation('ml'),
    core.binomial_estimation('ml', force=True),
    core.negative_binomial_estimation('ml'),
    # Logarithmic distribution wrapped in a shifted estimator (shift of -1).
    core.shifted_estimation(core.outcome_type.DISCRETE,
                            shift=-1,
                            estimator=core.logarithmic_estimation('ml')),
]
sum_estimator = core.selection(core.outcome_type.DISCRETE,
                               estimators=sum_candidates)

# Estimate the splitting model on the selected columns.
mme = core.splitting_estimation(data=mdata,
                                splitting=splitting_estimator,
                                sum=sum_estimator)

In [9]:
# Scores of the candidate estimators for the `sum` component, one value per
# candidate in the order they were listed (presumably log-likelihoods --
# TODO confirm against the statiskit documentation).
mme.sum.scores

[-565.3220940311769, -582.4311016773115, -565.9357309449964, -685.4209196019883]

In [10]:
# Scores of the two candidate splitting distributions, in the order 'MN', 'DM'
# (presumably log-likelihoods -- TODO confirm).
mme.splitting.scores

[-287.03973181566863, -289.8784145864973]

In [11]:
%%R
N = as.matrix(ita91[,c(1,2)])
X =  as.matrix(ita91[,c(3,4)])

In [12]:
# Fit a negative multinomial distribution (no covariates) to the count matrix N
# with MGLM and print the fit summary.
%R print(MGLMfit(N, dist='NegMN'))

        estimate         SE
p_g1  0.08983118 0.04110484
p_g2  0.06244363 0.02864577
phi  12.64413393 6.78869549

Distribution: Negative Multinomial
Log-likelihood: -844.3806
BIC: 1705.932
AIC: 1694.761
LRT test p value: NA
Iterations: 4


In [13]:
# Negative multinomial regression of the counts N on the covariates X.
# NOTE(review): in the recorded output this call stops with the R error
# "replacement has 612 rows, data has 306". MGLMreg presumably needs X to be a
# numeric design matrix, so check how X is built if the error recurs.
%R print(MGLMreg(N ~ X, dist='NegMN'))


Error in `[[<-.data.frame`(`*tmp*`, i, value = c(2L, 3L, 4L, 5L, 6L, 7L,  : 
  replacement has 612 rows, data has 306


  replacement has 612 rows, data has 306


 



In [14]:
%%R
# Bivariate Poisson model for (g1, g2) with intercept-only formulas for l1, l2
# and l1l2; the fitted object is kept in `model` for the next cell.
model <- lm.bp(l1 = g1 ~ 1, l2 = g2 ~ 1, l1l2 = ~1, data = ita91)
print(model)

           iter         loglike Rel.Dif.loglike 
          1.000        -848.242           1.848 
           iter         loglike Rel.Dif.loglike 
      2.000e+00      -8.454e+02       3.347e-03 
           iter         loglike Rel.Dif.loglike 
      3.000e+00      -8.454e+02       4.269e-07 
           iter         loglike Rel.Dif.loglike 
      4.000e+00      -8.454e+02       3.740e-07 
           iter         loglike Rel.Dif.loglike 
      5.000e+00      -8.454e+02       3.275e-07 
           iter         loglike Rel.Dif.loglike 
      6.000e+00      -8.454e+02       2.867e-07 
           iter         loglike Rel.Dif.loglike 
      7.000e+00      -8.454e+02       2.508e-07 
           iter         loglike Rel.Dif.loglike 
      8.000e+00      -8.454e+02       2.193e-07 
           iter         loglike Rel.Dif.loglike 
      9.000e+00      -8.454e+02       1.917e-07 
           iter         loglike Rel.Dif.loglike 
      1.000e+01      -8.454e+02       1.675e-07 
           iter     


Call:
lm.bp(l1 = g1 ~ 1, l2 = g2 ~ 1, l1l2 = ~1, l3 = ~1, data = ita91, 
    common.intercept = FALSE, zeroL3 = FALSE, maxit = 300, pres = 1e-08, 
    verbose = FALSE)

Coefficients:
(l1):(Intercept)  (l2):(Intercept)  (l3):(Intercept)  
          0.2184           -0.1796           -2.3458  



In [15]:
# Print the BIC values stored on the fitted model (saturated model vs. the
# bivariate Poisson fit).
%R print(model$BIC)

Saturated   BivPois 
 4899.038  1710.050 


In [16]:
%%R
# Bivariate Poisson model with team1 and team2 as covariates in all three
# formulas (l1, l2 and l1l2). This rebinds `model`, replacing the
# intercept-only fit.
model <- lm.bp(l1 = g1 ~ team1 + team2,
               l2 = g2 ~ team1 + team2,
               l1l2 = ~ team1 + team2,
               data = ita91)
print(model)

           iter         loglike Rel.Dif.loglike 
           1.00         -759.82            1.76 
           iter         loglike Rel.Dif.loglike 
      2.000e+00      -7.559e+02       5.182e-03 
           iter         loglike Rel.Dif.loglike 
      3.000e+00      -7.555e+02       4.531e-04 
           iter         loglike Rel.Dif.loglike 
      4.000e+00      -7.552e+02       4.379e-04 
           iter         loglike Rel.Dif.loglike 
      5.000e+00      -7.549e+02       4.371e-04 
           iter         loglike Rel.Dif.loglike 
      6.000e+00      -7.546e+02       4.337e-04 
           iter         loglike Rel.Dif.loglike 
      7.000e+00      -7.542e+02       4.269e-04 
           iter         loglike Rel.Dif.loglike 
      8.000e+00      -7.539e+02       4.168e-04 
           iter         loglike Rel.Dif.loglike 
      9.000e+00      -7.536e+02       4.037e-04 
           iter         loglike Rel.Dif.loglike 
      1.000e+01      -7.533e+02       3.880e-04 
           iter     


Call:
lm.bp(l1 = g1 ~ team1 + team2, l2 = g2 ~ team1 + team2, l1l2 = ~team1 + 
    team2, l3 = ~1, data = ita91, common.intercept = FALSE, zeroL3 = FALSE, 
    maxit = 300, pres = 1e-08, verbose = FALSE)

Coefficients:
    (l1):(Intercept)  (l1):team1Atalanta    (l1):team1Bari        
            -0.24829               0.16367               0.05607  
(l1):team1Cagliari    (l1):team1Cremonese   (l1):team1Fiorentina  
             0.61254               0.37469               0.96217  
(l1):team1Foggia      (l1):team1Genoa       (l1):team1Inter       
             1.20051               0.69340              -0.16676  
(l1):team1Juventus    (l1):team1Lazio       (l1):team1Milan       
             0.95972               1.01397               1.53802  
(l1):team1Napoli      (l1):team1Parma       (l1):team1Roma        
             1.26216               0.56203               0.56769  
(l1):team1Sampdoria   (l1):team1Torino      (l1):team1Verona      
             1.00913               0.81364 

In [17]:
# Print the BIC values of the model with team covariates (saturated model vs.
# the bivariate Poisson fit).
%R print(model$BIC)

Saturated   BivPois 
 4899.038  1956.104 


In [18]:
%%R
# Load the `rnaseq` dataset, log-transform its `totalReads` column and export
# the table for the Python side of the notebook.
data('rnaseq')
rnaseq[['totalReads']] <- log(rnaseq[['totalReads']])
# NOTE: write.table() separates fields with a single space by default, so this
# file is space-delimited despite its .csv name; the reader on the Python side
# has to agree with that. FALSE is spelled out because `F` is a reassignable
# variable in R.
write.table(rnaseq, file = 'rnaseq.csv', row.names = FALSE)

In [19]:
# Read the rnaseq table written by the R cell above, using the reader's default
# separator. NOTE(review): this rebinds `data`, which previously held the ita91
# table, so earlier cells must be re-run before revisiting that dataset.
data = core.read_csv('rnaseq.csv', header=True)

In [20]:
# First six columns of the rnaseq table (presumably the count columns
# X1..X6 -- TODO confirm column order).
rnaseq_counts = data.extract(0, 1, 2, 3, 4, 5)

# Candidate splitting distributions ('MN' and 'DM') to choose between.
splitting_estimator = core.splitting_selection('MN', 'DM')

# Candidate univariate discrete estimators for the `sum` component.
sum_candidates = [
    core.poisson_estimation('ml'),
    core.binomial_estimation('ml', force=True),
    core.negative_binomial_estimation('ml'),
    # Logarithmic distribution wrapped in a shifted estimator (shift of -1).
    core.shifted_estimation(core.outcome_type.DISCRETE,
                            shift=-1,
                            estimator=core.logarithmic_estimation('ml')),
]
sum_estimator = core.selection(core.outcome_type.DISCRETE,
                               estimators=sum_candidates)

# Estimate the splitting model on the selected columns; rebinds `mme`.
mme = core.splitting_estimation(data=rnaseq_counts,
                                splitting=splitting_estimator,
                                sum=sum_estimator)

In [21]:
# Scores of the candidate estimators for the `sum` component on the rnaseq
# counts, one value per candidate in the order they were listed (presumably
# log-likelihoods -- TODO confirm).
mme.sum.scores

[-6537.059114151809, -13223.754412953454, -1339.360020084458, -1756.9627791576988]

In [22]:
# Scores of the two candidate splitting distributions on the rnaseq counts, in
# the order 'MN', 'DM' (presumably log-likelihoods -- TODO confirm).
mme.splitting.scores

[-19352.98491770746, -4984.56057498688]

In [23]:
%%R
# Both packages are already attached near the top of the notebook; attaching
# them again here is harmless.
library(MGLM)
library(bivpois)
# Response matrix: columns 1-6 of rnaseq; covariate matrix: columns 7-10.
N <- as.matrix(rnaseq[, 1:6])
X <- as.matrix(rnaseq[, 7:10])

In [24]:
# Fit a negative multinomial distribution (no covariates) to the rnaseq count
# matrix N and print the fit summary.
%R print(MGLMfit(N, dist='NegMN'))

        estimate           SE
p_X1  0.31148586 0.0013619556
p_X2  0.10649139 0.0008502964
p_X3  0.09837308 0.0008192291
p_X4  0.35049562 0.0014253376
p_X5  0.09426337 0.0008029165
p_X6  0.02122027 0.0003891465
phi  12.23256918 1.2292533050

Distribution: Negative Multinomial
Log-likelihood: -20673.71
BIC: 41384.52
AIC: 41361.43
LRT test p value: NA
Iterations: 3


In [25]:
# Negative multinomial regression of the rnaseq counts N on the covariate
# matrix X; prints the coefficients and the Wald tests.
%R print(MGLMreg(N ~ X, dist='NegMN'))

Call: MGLMreg(formula = N ~ X, dist = "NegMN")

Coefficients:
$alpha
                       X1            X2            X3            X4
(Intercept) -13.587376137 -1.352181e+01 -22.380090898 -14.131338928
XtotalReads   0.907715672  8.504124e-01   1.242922188   0.915257796
Xtreatment   -0.753112730 -7.735067e-01   2.014641268   0.675195341
Xgender      -0.060696325  7.021728e-03  -0.069502706   0.012498726
Xage          0.002582547 -9.376888e-04   0.002915974  -0.002140646
                       X5            X6
(Intercept) -1.450769e+01 -18.526416540
XtotalReads  8.999176e-01   1.020359613
Xtreatment  -6.382956e-01  -0.730410081
Xgender     -8.368181e-02  -0.093397437
Xage        -8.241306e-04   0.008765665

$phi
         
31.60591 


Hypothesis test: 
             wald value    Pr(>wald)
(Intercept)   385.75443 3.224960e-80
XtotalReads   368.08192 2.020898e-76
Xtreatment  18377.53053 0.000000e+00
Xgender        54.84664 4.978064e-10
Xage           79.70908 4.103032e-15

Distribution: 

In [26]:
# Fit a generalized Dirichlet multinomial distribution (no covariates) to the
# rnaseq count matrix N and print the fit summary.
%R print(MGLMfit(N, dist='GDM'))

          estimate        SE
alpha_X1  3.741846 0.3670877
alpha_X2  2.400909 0.8154314
alpha_X3  1.558396 0.2331360
alpha_X4  6.988354 1.1647638
alpha_X5 20.689398 0.1492792
beta_X1   8.026379 0.9665023
beta_X2  11.038376 0.7259782
beta_X3   8.961428 0.2645201
beta_X4   2.702723 2.8717176
beta_X5   4.854816 0.6482710

Distribution: Generalized Dirichlet Multinomial
Log-likelihood: -4841.231
BIC: 9735.446
AIC: 9702.463
LRT test p value: <0.0001
Iterations: 59


In [27]:
# Generalized Dirichlet multinomial regression of the rnaseq counts N on the
# covariate matrix X; prints the alpha/beta coefficient tables.
%R print(MGLMreg(N ~ X, dist='GDM'))

Call: MGLMreg(formula = N ~ X, dist = "GDM")

Coefficients:
                alpha_X1     alpha_X2    alpha_X3      alpha_X4    alpha_X5
(Intercept)  5.987992852 -7.056673213  0.45608810 -10.120737500  2.63939594
XtotalReads -0.215098743  0.555972697  0.03955285   0.720358215 -0.01612137
Xtreatment  -0.047690534 -0.329319924  0.97935928   0.099957634  0.06339341
Xgender      0.233005806  0.374837886 -0.18642022  -0.202417040  0.14428946
Xage         0.006661015 -0.004342996  0.01936118   0.008173279  0.01239665
                 beta_X1       beta_X2    beta_X3     beta_X4     beta_X5
(Intercept)  4.661089454 -9.7891269676  7.0950614 -9.53000784 -1.68761454
XtotalReads -0.140896121  0.7138189416 -0.2229844  0.74314582  0.13398453
Xtreatment   0.628878471  0.7461977539 -1.5916297 -0.92371233 -0.04244096
Xgender      0.212071256  0.2732563514 -0.2331213 -0.27042831  0.12206235
Xage         0.003223835  0.0004533068  0.0159450  0.01254059  0.01918839

Hypothesis test: 
            wald valu