In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.discrete.count_model as smdc
import patsy

In [3]:
# Load the simulated CRE counts and discard the unnamed first CSV column
# (pandas reads it as "Unnamed: 0"; presumably a saved row index).
fake_cres = (
    pd.read_csv("fake_cres.csv")
    .drop(columns=["Unnamed: 0"])
)

In [4]:
# Preview the loaded table (pandas elides the middle rows in the display).
fake_cres


Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
0,nobody,brain,1,0
1,nobody,brain,1,0
2,nobody,brain,1,0
3,nobody,brain,1,0
4,nobody,brain,1,0
...,...,...,...,...
14307,neurogene,blood,3,7
14308,neurogene,blood,3,26
14309,neurogene,blood,3,7
14310,neurogene,blood,3,15


In [5]:
# Relabel the integer replicate IDs (1, 2, 3) as strings ("rep1", "rep2", "rep3").
# `assign` returns a new frame and leaves `fake_cres` untouched, so this cell is
# safe to re-run: mapping the column in place on an alias of `fake_cres` would
# send the already-relabelled strings through the int-keyed dict on a second
# run and silently turn the whole column into NaN.
fake_cres_munged = fake_cres.assign(
    replicate_ID=fake_cres["replicate_ID"].map({1: "rep1", 2: "rep2", 3: "rep3"})
)

Some of these are strings or ints when they should be categorical...

We can either cast to categorical in the dataframe, specify that the values should be categorical in the formula (for items in the formula), or one-hot encode them with `pd.get_dummies` for the statsmodels interface.

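There are a couple of different ways of specifying the model.
- A string formula can't use the pipe operator (`|`) to pass a second formula, so the count design and the zero-inflation design are built separately with `patsy` below.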

In [6]:
# Check the relabelled table: replicate_ID should now read rep1/rep2/rep3.
fake_cres_munged

Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
0,nobody,brain,rep1,0
1,nobody,brain,rep1,0
2,nobody,brain,rep1,0
3,nobody,brain,rep1,0
4,nobody,brain,rep1,0
...,...,...,...,...
14307,neurogene,blood,rep3,7
14308,neurogene,blood,rep3,26
14309,neurogene,blood,rep3,7
14310,neurogene,blood,rep3,15


In [7]:

# Count (negative-binomial) part: CRE x cell-type interaction with the global
# intercept removed, so every CRE level gets its own column.
y, X = patsy.dmatrices(
    "umi_count ~ C(CRE)*C(Cell_type)-1",
    fake_cres_munged,
    return_type="dataframe",
)

# Zero-inflation part: intercept plus replicate ID.
Z = patsy.dmatrix("C(replicate_ID)", fake_cres_munged, return_type="dataframe")

zinb_model = smdc.ZeroInflatedNegativeBinomialP(y, X, exog_infl=Z)

# Size of the full parameter vector: one coefficient per column of each design
# matrix, plus one extra slot for alpha.
n_count_params = zinb_model.exog.shape[1]
n_infl_params = zinb_model.exog_infl.shape[1]
n_total = n_count_params + n_infl_params + 1

# Start the optimiser from the same value (0.1) for every parameter.
start_params = np.full(n_total, 0.1)

zinb_result = zinb_model.fit(start_params=start_params, maxiter=1000)

Optimization terminated successfully.
         Current function value: 1.483284
         Iterations: 84
         Function evaluations: 85
         Gradient evaluations: 85


https://stats.stackexchange.com/questions/284911/type-i-and-type-ii-negative-binomial-distribution-in-zero-inflated-negative-bino

https://www.statsmodels.org/stable/generated/statsmodels.discrete.count_model.ZeroInflatedNegativeBinomialP.html

https://www.statsmodels.org/dev/generated/statsmodels.discrete.count_model.ZeroInflatedNegativeBinomialP.from_formula.html#statsmodels.discrete.count_model.ZeroInflatedNegativeBinomialP.from_formula

In [168]:
# Fit report for the ZINB model: fit statistics followed by the coefficient table.
zinb_result.summary()

0,1,2,3
Dep. Variable:,umi_count,No. Observations:,14312.0
Model:,ZeroInflatedNegativeBinomialP,Df Residuals:,14302.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 19 Feb 2025",Pseudo R-squ.:,0.1283
Time:,14:45:28,Log-Likelihood:,-22025.0
converged:,True,LL-Null:,-25265.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
inflate_rep2,0.0065,0.031,0.210,0.834,-0.054,0.067
inflate_rep3,2.2290,0.051,43.738,0.000,2.129,2.329
Intercept,4.6398,0.028,164.531,0.000,4.585,4.695
C(CRE)[T.neurogene],-1.9018,0.042,-45.560,0.000,-1.984,-1.820
C(CRE)[T.nobody],-4.1589,0.060,-69.366,0.000,-4.276,-4.041
C(CRE)[T.redgene],0.0196,0.039,0.498,0.619,-0.058,0.097
C(CRE)[T.somebody],-2.1193,0.042,-50.544,0.000,-2.201,-2.037
C(Cell_type)[T.brain],0.0398,0.041,0.981,0.327,-0.040,0.119
C(CRE)[T.neurogene]:C(Cell_type)[T.brain],1.7895,0.058,30.613,0.000,1.675,1.904


# Recapitulation of $\theta$ (constant in mean-variance quadratic)

First, let's examine theta. That is, the theta in $\sigma^2=\mu+\mu^2/\theta$. 

From our generation step, we know the true $\theta$ value to be 0.3.

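Statsmodels seems to call it alpha, but regardless, it's quite close to the real value. (Caveat: statsmodels' NB2 parameterization is $\sigma^2=\mu+\alpha\mu^2$, so strictly $\alpha$ corresponds to $1/\theta$ in the formula above. Since the estimate lands near 0.3 rather than $1/0.3$, the generation step presumably used the $\alpha$ form; this is worth confirming against the simulation code.)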

In [173]:
# Fitted value of the extra (non-regression) parameter, stored under the name "alpha".
zinb_result.params["alpha"]

np.float64(0.2972870921394772)

In [177]:
# Shape of the count-model design matrix: (observations, count-model coefficients).
zinb_result.model.exog.shape

(14312, 10)

In [194]:
# Reduce the count design matrix to its distinct rows, i.e. one row per
# predictor combination that actually occurs. The first occurrence is kept,
# along with its original row label.
minimal_nb_design = X.drop_duplicates()
minimal_nb_design

Unnamed: 0,C(CRE)[everybody],C(CRE)[neurogene],C(CRE)[nobody],C(CRE)[redgene],C(CRE)[somebody],C(Cell_type)[T.brain],C(CRE)[T.neurogene]:C(Cell_type)[T.brain],C(CRE)[T.nobody]:C(Cell_type)[T.brain],C(CRE)[T.redgene]:C(Cell_type)[T.brain],C(CRE)[T.somebody]:C(Cell_type)[T.brain]
0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
440,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
904,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1355,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1798,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2263,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2762,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3266,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3767,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4262,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [190]:
# All fitted parameters, in order: inflation coefficients (prefixed "inflate_"),
# then the count-model coefficients, then alpha.
zinb_result.params

inflate_Intercept                            1.355423
inflate_C(replicate_ID)[T.rep2]             -1.327281
inflate_C(replicate_ID)[T.rep3]              0.888705
C(CRE)[everybody]                            4.639833
C(CRE)[neurogene]                            2.740756
C(CRE)[nobody]                               0.665935
C(CRE)[redgene]                              4.659436
C(CRE)[somebody]                             2.525434
C(Cell_type)[T.brain]                        0.039876
C(CRE)[T.neurogene]:C(Cell_type)[T.brain]    1.786778
C(CRE)[T.nobody]:C(Cell_type)[T.brain]      -0.679159
C(CRE)[T.redgene]:C(Cell_type)[T.brain]     -1.273865
C(CRE)[T.somebody]:C(Cell_type)[T.brain]    -0.263831
alpha                                        0.286778
dtype: float64

In [193]:
# Split the fitted parameter vector by position. The parameters are ordered as
# inflation coefficients first, then count-model coefficients, then alpha.
# `.iloc` makes the positional intent explicit on this label-indexed Series.
inflation_betas = zinb_result.params.iloc[:n_infl_params]
nb_betas = zinb_result.params.iloc[n_infl_params:n_infl_params + n_count_params]

# Guard the positional split with the parameter names, so a stale
# `n_infl_params` / `n_count_params` (e.g. from an out-of-order re-run) fails
# loudly instead of silently mis-assigning coefficients.
assert inflation_betas.index.str.startswith("inflate_").all()
assert "alpha" not in nb_betas.index


In [202]:
# Make sure the design-matrix columns are in the same order as the count-model
# coefficients before the two are combined.
assert minimal_nb_design.columns.to_list() == nb_betas.index.to_list()
minimal_nb_design.columns.to_list()

['C(CRE)[everybody]',
 'C(CRE)[neurogene]',
 'C(CRE)[nobody]',
 'C(CRE)[redgene]',
 'C(CRE)[somebody]',
 'C(Cell_type)[T.brain]',
 'C(CRE)[T.neurogene]:C(Cell_type)[T.brain]',
 'C(CRE)[T.nobody]:C(Cell_type)[T.brain]',
 'C(CRE)[T.redgene]:C(Cell_type)[T.brain]',
 'C(CRE)[T.somebody]:C(Cell_type)[T.brain]']