In [8]:
import numpy as np 
import pandas as pd 
from scipy.special import expit 
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression


In [15]:
# Simulate N neighborhoods. The legacy global RNG is seeded first so the
# frame (and every statistic computed from it) repeats on Restart & Run All.
SEED = 0
np.random.seed(SEED)

N = 10000

# One entry per neighborhood, identified by its index 0..N-1.
neighborhood = np.arange(N)

# Industry code cycles 0, 1, 2 with the neighborhood index.
industry = neighborhood % 3

# Race code in {0, 1, 2, 3}: the same 0/1/2 cycle plus Binomial(3, 0.2)
# noise, wrapped modulo 4.
race = (neighborhood % 3 + np.random.binomial(3, p=0.2, size=N)) % 4

# Income ~ Gamma(shape=25, scale=1000 * (industry + 1)); the array-valued
# scale yields one draw per neighborhood.
income = np.random.gamma(25, 1000 * (industry + 1))

# Crime ~ Gamma(shape=100000 / income, scale=100): higher income means a
# smaller shape parameter.
crime = np.random.gamma(100000. / income, 100, size=N)

# NOTE(review): 'R' and 'C' are plain keys while the other columns are
# LaTeX-wrapped; the regression cells index X['R'] and X['C'], so any rename
# has to be applied everywhere at once.
X = pd.DataFrame({'R': race, '$I$': income, 'C': crime,
                  '$E$': industry, '$N$': neighborhood})

In [3]:
# Add a constant column of ones to X (mutates X in place).
# NOTE(review): presumably meant as a regression intercept, but the model
# cells visible in this notebook only pass ['C', '$I$'] as regressors — confirm.
X['intercept'] = 1.
# Display the frame, including the new column.
X

Unnamed: 0,$R$,$I$,$C$,$E$,$N$,intercept
0,1,29285.311475,214.146364,0,0,1.0
1,1,67107.539344,165.482690,1,1,1.0
2,2,59302.921940,151.050767,2,2,1.0
3,1,26809.026516,509.851086,0,3,1.0
4,1,54840.067459,27.042228,1,4,1.0
...,...,...,...,...,...,...
9995,0,58842.040451,74.333374,2,9995,1.0
9996,1,32297.726097,485.512124,0,9996,1.0
9997,3,38787.266929,111.779990,1,9997,1.0
9998,3,75214.719505,45.459760,2,9998,1.0


In [28]:
# Ordinary least squares of the race code on crime and income.
# No constant column is passed as a regressor, so the fit has no intercept.
ols_regressors = X[['C', '$I$']]
model = sm.OLS(endog=X['R'], exog=ols_regressors)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,R,R-squared (uncentered):,0.788
Model:,OLS,Adj. R-squared (uncentered):,0.788
Method:,Least Squares,F-statistic:,18590.0
Date:,"Tue, 25 Aug 2020",Prob (F-statistic):,0.0
Time:,14:20:20,Log-Likelihood:,-12083.0
No. Observations:,10000,AIC:,24170.0
Df Residuals:,9998,BIC:,24180.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
C,0.0002,2.93e-05,5.190,0.000,9.46e-05,0.000
$I$,2.781e-05,1.76e-07,157.579,0.000,2.75e-05,2.82e-05

0,1,2,3
Omnibus:,93.784,Durbin-Watson:,2.042
Prob(Omnibus):,0.0,Jarque-Bera (JB):,117.493
Skew:,-0.157,Prob(JB):,3.07e-26
Kurtosis:,3.428,Cond. No.,199.0


In [33]:
# Logistic-regression classifier predicting the race code from crime and income.
logit_features = X[['C', '$I$']]
logit_target = X['R']
model = LogisticRegression().fit(logit_features, logit_target)
# LogisticRegression.score returns the mean classification accuracy on the
# data it is given (here: the training data). It is NOT a coefficient of
# determination (R^2).
score = model.score(logit_features, logit_target)
score

0.3985

In [34]:
# Show the fitted coefficient array of the current `model`.
model.coef_

array([[ 0.00301691],
       [ 0.0017371 ],
       [-0.00150729],
       [-0.00324672]])

In [5]:
# Rebuild the neighborhood index and its industry code, then preview the latter.
N = 10000
neighborhood = np.arange(N)           # 0, 1, ..., N-1
industry = np.mod(neighborhood, 3)    # cycles 0, 1, 2
industry

array([0, 1, 2, ..., 1, 2, 0])

In [11]:
# Draw one gamma-distributed income per neighborhood; the scale parameter
# grows with the industry code. Show the shape of the result as a sanity check.
income_scale = 1000 * (industry + 1)
income = np.random.gamma(shape=25, scale=income_scale)
income.shape

(10000,)

In [12]:
# Draw crime from a gamma distribution whose shape parameter is inversely
# proportional to income (scale fixed at 100), then preview the array.
crime_shape = 100000. / income
crime = np.random.gamma(shape=crime_shape, scale=100, size=N)
crime

array([403.452856  , 232.40954591,  49.84048155, ..., 360.79110622,
       233.36969514, 291.17567791])

In [16]:
# Human-readable label for each integer race code (0-3) in the simulated data.
races = {0: 'african-american', 1: 'hispanic',
         2: 'asian', 3: 'white'}
# The simulated frame stores the race code under the key 'R' (the same key the
# regression cells use); there is no '$R$' column to index.
X['race'] = X['R'].map(races)
X


Unnamed: 0,$R$,$I$,$C$,$E$,$N$,race
0,0,23473.205015,485.515695,0,0,african-american
1,1,40522.447011,411.520528,1,1,hispanic
2,2,73670.423225,224.012808,2,2,asian
3,0,32272.861538,547.730868,0,3,african-american
4,3,50028.649733,481.626668,1,4,white
...,...,...,...,...,...,...
9995,3,75949.078792,149.943943,2,9995,white
9996,1,21148.353307,588.665256,0,9996,hispanic
9997,1,44643.566381,96.493702,1,9997,hispanic
9998,2,79330.780632,115.147678,2,9998,asian


In [17]:
# One-hot encode the race label: one indicator column per distinct label.
race_dummies = pd.get_dummies(X['race'])
# Display the indicator frame.
race_dummies

Unnamed: 0,african-american,asian,hispanic,white
0,1,0,0,0
1,0,0,1,0
2,0,1,0,0
3,1,0,0,0
4,0,0,0,1
...,...,...,...,...
9995,0,0,0,1
9996,0,0,1,0
9997,0,0,1,0
9998,0,1,0,0


In [18]:
# Attach every indicator column to X under the name it has in race_dummies
# (mutates X in place).
X[race_dummies.columns] = race_dummies

In [21]:
# Display the current contents of X.
X

Unnamed: 0,$R$,$I$,$C$,$E$,$N$,race,african-american,asian,hispanic,white
0,0,23473.205015,485.515695,0,0,african-american,1,0,0,0
1,1,40522.447011,411.520528,1,1,hispanic,0,0,1,0
2,2,73670.423225,224.012808,2,2,asian,0,1,0,0
3,0,32272.861538,547.730868,0,3,african-american,1,0,0,0
4,3,50028.649733,481.626668,1,4,white,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
9995,3,75949.078792,149.943943,2,9995,white,0,0,0,1
9996,1,21148.353307,588.665256,0,9996,hispanic,0,0,1,0
9997,1,44643.566381,96.493702,1,9997,hispanic,0,0,1,0
9998,2,79330.780632,115.147678,2,9998,asian,0,1,0,0


In [25]:
# Simulate five binary variables; x2 and x3 both depend on x1.
N = 100000
inv_logit = expit  # alias for the logistic sigmoid

# x1: fair coin flip per sample.
x1 = np.random.binomial(n=1, p=0.5, size=N)
# x2 is less likely, and x3 more likely, when x1 == 1.
x2 = np.random.binomial(n=1, p=inv_logit(-3.0 * x1))
x3 = np.random.binomial(n=1, p=inv_logit(3.0 * x1))
# x4 is 1 whenever x2 or x3 is 1.
x4 = x2 | x3
# x5 is more likely when x4 == 1.
x5 = np.random.binomial(n=1, p=inv_logit(3.0 * x4))

# Column labels are '$x_1$' ... '$x_5$', in that order.
X = pd.DataFrame({
    f'$x_{position}$': values
    for position, values in enumerate((x1, x2, x3, x4, x5), start=1)
})
X

Unnamed: 0,$x_1$,$x_2$,$x_3$,$x_4$,$x_5$
0,1,0,1,1,0
1,1,0,1,1,1
2,1,0,1,1,1
3,1,0,1,1,0
4,1,0,1,1,1
...,...,...,...,...,...
99995,0,0,1,1,1
99996,0,1,0,1,1
99997,1,0,1,1,1
99998,0,0,1,1,1


In [55]:
# Same simulation as before, except x2 is drawn with success probability 0,
# so it is identically zero and x4 reduces to x3.
N = 100000
inv_logit = expit  # alias for the logistic sigmoid

x1 = np.random.binomial(n=1, p=0.5, size=N)
x2 = np.random.binomial(n=1, p=0, size=N)
x3 = np.random.binomial(n=1, p=inv_logit(3.0 * x1))
x4 = x2 | x3
x5 = np.random.binomial(n=1, p=inv_logit(3.0 * x4))

column_labels = ['$x_1$', '$x_2$', '$x_3$', '$x_4$', '$x_5$']
X = pd.DataFrame(dict(zip(column_labels, (x1, x2, x3, x4, x5))))
X

Unnamed: 0,$x_1$,$x_2$,$x_3$,$x_4$,$x_5$
0,0,0,1,1,0
1,1,0,1,1,1
2,1,0,0,0,0
3,0,0,1,1,1
4,1,0,1,1,1
...,...,...,...,...,...
99995,0,0,0,0,0
99996,0,0,1,1,1
99997,0,0,1,1,1
99998,1,0,1,1,1


In [56]:
# Mean of x5 within each observed value of x2, as a one-column frame.
X.groupby('$x_2$')[['$x_5$']].mean()

Unnamed: 0_level_0,$x_5$
$x_2$,Unnamed: 1_level_1
0,0.82845


In [54]:
# Overall mean of the $x_5$ column of the current X.
X['$x_5$'].mean()

0.9536

In [57]:
# Overall mean of the $x_5$ column of the current X (no grouping).
X['$x_5$'].mean()

0.82845