In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
from scipy import stats
agw2017 = pd.read_stata("agw2017")
agw2013 = pd.read_stata("agw2013")
agw2007 = pd.read_stata("agw2007")

gez2017 = pd.read_stata("gez2017")
gez2013 = pd.read_stata("gez2013")
gez2007 = pd.read_stata("gez2007")




In [3]:
print(agw2013["b26ogb"])

0       225000.000000
1       220000.000000
2                 NaN
3            0.000000
4                 NaN
5       255000.000000
6       150000.000000
7                 NaN
8       175000.000000
9       200000.000000
10      260000.015625
11      200000.000000
12                NaN
13           0.000000
14           0.000000
15                NaN
16           0.000000
17      400000.000000
18                NaN
19      545000.000000
20           0.000000
21           0.000000
22      245000.000000
23           0.000000
24           0.000000
25      125000.000000
26      175000.000000
27      200000.000000
28      479999.968750
29      400000.000000
            ...      
2011              NaN
2012              NaN
2013         0.000000
2014         0.000000
2015    260000.015625
2016              NaN
2017              NaN
2018    255000.000000
2019              NaN
2020         0.000000
2021              NaN
2022         0.000000
2023    370000.000000
2024              NaN
2025      

Vragenlijst Wonen en Hypotheken
B26Og eigenaar van woning (1=ja, 0=nee)
B26Hy hypotheken op de woning
B26Vz cash value levensverzekering hypotheek woning
B27Og eigenaar van tweede woning (1=ja, 0=nee)
B27Hy hypotheken op tweede woning
B27Vz cash value levensverzekering hypotheek tweede woning

### Eerst maak ik de Net-worth variabele

Dit doe ik dmv een functie die alle bezittingen (b1b, b2b, ..., b30b) bij elkaar optelt en daar de schulden (s1b, s2b, ..., s8b, x1b) vanaf trekt.

In [4]:
def calcWealth(row):
    wealth = (row["b1b"] + row["b2b"] + row["b3b"] + row["b4b"] + row["b6b"] + row["b7b"] + row["b8b"] + row["b11b"]
              + row["b12b"] + row["b13b"] + row["b14b"] + row["b15b"] + row["b16b"] + row["b17b"] + row["b18b"] + 
             row["b19ogb"] + row["b19hyb"] + row["b19vzb"] + row["b20b"] + row["b21b"] + row["b22b"] + row["b23b"] +
             row["b24b"] + row["b25b"] + row["b28b"] + row["b29b"] + row["b30b"])
    # Door te checken of b26ogb > -1 zorg ik dat enkel rows zonder NaN hier worden gebruikt. Ze zijn of volledig Nan of niet
    if row["b26ogb"] > -1:
        wealth += row["b26ogb"] + row["b26vzb"] + row["b27ogb"] + row["b27vzb"]
    return wealth

In [5]:
def calcDebt(row):
    debt = (row["s1b"] + row["s2b"] + row["s3b"] + row["s4b"] + row["s5b"] + row["s6b"] + row["s7b"] + row["s8b"] +
           row["x1b"]) 
    # Door te checken of b26ogb > -1 zorg ik dat enkel rows zonder NaN hier worden gebruikt. Ze zijn of volledig Nan of niet
    if row["b26ogb"] > -1:
        debt += row["b26hyb"] + row["b27hyb"]
    
    return debt

In [6]:
def calcNetWorth(row):
    netWorth = row["wealth"] - row["debt"]
    return netWorth

In [7]:
agw2017["debt"] = agw2017.apply(calcDebt, axis = 1)
agw2013["debt"] = agw2013.apply(calcDebt, axis = 1)
agw2007["debt"] = agw2007.apply(calcDebt, axis = 1)




In [8]:
agw2017["wealth"] = agw2017.apply(calcWealth, axis = 1)
agw2013["wealth"] = agw2013.apply(calcWealth, axis = 1)
agw2007["wealth"] = agw2007.apply(calcWealth, axis = 1)



In [9]:
agw2017["netWorth"] = agw2017.apply(calcNetWorth, axis = 1)
agw2013["netWorth"] = agw2013.apply(calcNetWorth, axis = 1)
agw2007["netWorth"] = agw2007.apply(calcNetWorth, axis = 1)

In [10]:
# Counter om aantal positieve en negatieve networths te tellen
pos = 0
neg = 0
zero = 0

# Arrays om de negatieven en positieven networths bij te houden
negatives = []
positives = []

# Bepaald voor iedere row of de networth positief of negatief is en zet deze waarde in een array
for x in range(0, agw2013.shape[0] - 1):
    if agw2013.loc[x,"netWorth"] > 0:
        positives.append(agw2013.loc[x, "netWorth"])
        pos += 1
    elif agw2013.loc[x, "netWorth"] < 0:
        negatives.append(agw2013.loc[x, "netWorth"])
        neg += 1
    else:
        zero += 1
        
print("aantal positieve networth: " + str(pos) +  " aantal negatieve networth: " + str(neg))
print("aantal met net worth nul (waarschijnlijk lege entries): " + str(zero))

aantal positieve networth: 1722 aantal negatieve networth: 219
aantal met net worth nul (waarschijnlijk lege entries): 99


In [11]:
# negatives.sort()
# positives.sort()

# plt.hist(positives)
# plt.title("positive net worths")
# plt.show()

# plt.hist(positives, range = (0, 250000))
# plt.title("positive net worths met max wealth 250k")
# plt.show()

# plt.hist(negatives)
# plt.title("negative net worths")
# plt.show()

# plt.hist(negatives, range = (-25000, 0))
# plt.title("negative net worths with -25k max debt")
# plt.show()

# Hier boven:

het feit dat bezit tot 10x zo veel gaat als schuld, kan leiden tot ene verschil in effect op  gezondheid (bijvoorbeeld 10x minder sterk oid) Misschien hier nog iets mee doen!

In [12]:
merged2017 = pd.merge(gez2017, agw2017, on = "personid")
merged2013 = pd.merge(gez2013, agw2013, on = "personid")
merged2007 = pd.merge(gez2007, agw2007, on = "personid")


In [13]:
merged = pd.merge(merged2017, merged2013, on = "personid")
merged = pd.merge(merged, merged2007, on = "personid")


In [14]:
def selfAssedHealth(row):
    healthValues = {"Excellent" : 1, "Good" : 1, "Fair" : 1, "Not so good" : 0, "Poor" : 0,
                    "excellent" : 1, "good" : 1, "fair" : 1, "not so good" : 0, "poor" : 0} 
    
    return healthValues[row["gez3"]]


http://www.statsmodels.org/stable/discretemod.html

In [15]:
merged2017["health"] = merged2017.apply(selfAssedHealth, axis = 1)
merged2013["health"] = merged2013.apply(selfAssedHealth, axis = 1)
merged2007["health"] = merged2007.apply(selfAssedHealth, axis = 1)

In [16]:
def dummyGroup(row):
    if row["b26oga"] == 1:
        return 1
    else:
        return 0
        

In [17]:
merged2017["treated"] = merged2017.apply(dummyGroup, axis = 1)
merged2013["treated"] = merged2013.apply(dummyGroup, axis = 1)
merged2007["treated"] = merged2007.apply(dummyGroup, axis = 1)


In [18]:
merged2017["time"] = 1
merged2013["time"] = 1
merged2007["time"] = 0

In [19]:
def interaction(row):
    return (row["time"] * row["treated"])

In [20]:
merged2017["DID"] = merged2017.apply(interaction, axis = 1)
merged2013["DID"] = merged2013.apply(interaction, axis = 1)
merged2007["DID"] = merged2007.apply(interaction, axis = 1)

In [21]:
merged2017["2013"] = 0
merged2017["2017"] = 1
merged2013["2013"] = 1
merged2013["2017"] = 0
merged2007["2013"] = 0
merged2007["2017"] = 0

In [22]:
frames0713 = [merged2007, merged2013]
frames0717 = [merged2007, merged2017]
frames071317 = [merged2007, merged2013, merged2017]
list0713 = pd.concat(frames0713)
list0717 = pd.concat(frames0717)
list071317 = pd.concat(frames071317)


In [23]:
merged2013["time"] = 0
merged2013["DID"] = merged2013.apply(interaction, axis = 1)
frames1317 = [merged2013, merged2017]
list1317 = pd.concat(frames1317)

In [24]:
list0713.to_stata("list0713.dta")
list0717.to_stata("list0717.dta")
list071317.to_stata("list071317.dta")
list1317.to_stata("list1317.dta")

/home/niels/anaconda3/lib/python3.6/site-packages/pandas/io/stata.py:2086: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    b'2013'   ->   _2013
    b'2017'   ->   _2017

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



In [25]:
def nHealthy(data):
    healthy = 0
    unhealthy = 0
    for x in range(0, data.shape[0] - 1):
        if data.loc[x, "health"] == 1:
            healthy += 1
        else:
            unhealthy += 1
    print("aantal healthy: " + str(healthy) + " en aantal unhealthy: " + str(unhealthy))

In [26]:
nHealthy(merged2013)

aantal healthy: 1574 en aantal unhealthy: 74


In [27]:
nHealthy(merged2007)

aantal healthy: 1731 en aantal unhealthy: 77


In [28]:
nHealthy(merged2017)

aantal healthy: 2091 en aantal unhealthy: 123


In [29]:
frames = [merged2007, merged2013, merged2017]
langeLijst = pd.concat(frames)

In [30]:
langeLijst.to_stata("langeLijst.dta")

/home/niels/anaconda3/lib/python3.6/site-packages/pandas/io/stata.py:2086: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    b'2013'   ->   _2013
    b'2017'   ->   _2017

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



In [31]:
result = sm.ols(formula="health ~ time + treated + DID", data=list1317).fit()


In [32]:
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 health   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     7.821
Date:                Mon, 14 Jan 2019   Prob (F-statistic):           3.34e-05
Time:                        11:53:20   Log-Likelihood:                 380.15
No. Observations:                3864   AIC:                            -752.3
Df Residuals:                    3860   BIC:                            -727.3
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9513      0.008    125.810      0.0

In [33]:
result = sm.ols(formula="health ~ time + treated + DID", data=list0717).fit()


In [34]:
list0717["netWorth"]

0       188038.887314
1        10777.000000
2        61950.015625
3         6000.000000
4       163442.999512
5            0.000000
6       191498.640625
7            0.000000
8       173781.000000
9            0.000000
10        4025.000000
11      221629.999023
12         500.000000
13        2925.000000
14      598782.000000
15       21000.000000
16        4885.500000
17       36333.000000
18      529713.000000
19      217500.000000
20         239.999985
21      180726.205078
22        2280.000000
23           0.000000
24           0.000000
25           0.000000
26        1700.000001
27      173925.756104
28       -2559.500000
29      362769.000000
            ...      
2185    424500.000000
2186     20474.000000
2187    318170.031006
2188     83258.838501
2189    260000.015625
2190     15000.000000
2191     35851.000000
2192         0.000000
2193    332354.750000
2194    143957.867188
2195     13170.661621
2196      6000.000000
2197     42384.888550
2198      6250.000000
2199     5

In [35]:
merged2017.to_stata("merged_data2017.dta")
merged2013.to_stata("merged_data2013.dta")
merged2007.to_stata("merged_data2007.dta")

/home/niels/anaconda3/lib/python3.6/site-packages/pandas/io/stata.py:2086: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    b'2013'   ->   _2013
    b'2017'   ->   _2017

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)



In [36]:
merged.to_stata("merged_data.dta")

In [37]:
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)


In [80]:
samelist0713 = list0713[list0713.duplicated(["personid"], keep = False)]
samelist0717 = list0717[list0717.duplicated(["personid"], keep = False)]
samelist1317 = list1317[list1317.duplicated(["personid"], keep = False)]

In [79]:
samelist0713.to_stata("samelist0713.dta")
samelist0717.to_stata("samelist0717.dta)

0           601.0
0           601.0
2          2101.0
1          2101.0
3          2102.0
2          2102.0
4          3801.0
3          3801.0
5          3802.0
4          3802.0
5         10601.0
6         10601.0
8         11601.0
6         11601.0
7         11602.0
9         11602.0
8         17301.0
10        17301.0
9         17501.0
11        17501.0
10        26202.0
18        26202.0
11        26801.0
19        26801.0
12        26802.0
20        26802.0
14        33902.0
26        33902.0
27        35702.0
15        35702.0
          ...    
1593    8638202.0
1778    8638202.0
1781    8715502.0
1600    8715502.0
1603    8736701.0
1782    8736701.0
1604    8736702.0
1783    8736702.0
1607    8743701.0
1784    8743701.0
1609    8758401.0
1785    8758401.0
1610    8758402.0
1786    8758402.0
1615    8809201.0
1789    8809201.0
1630    8877801.0
1792    8877801.0
1793    8877802.0
1631    8877802.0
1636    8921301.0
1800    8921301.0
1637    8922301.0
1801    8922301.0
1645    89

(3458, 151)

In [None]:
sdfsd