In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from datetime import datetime
from programs.utils.analyse_functies import *
# --- Notebook configuration -------------------------------------------------
# When True, the final cell writes the extended DataFrame to a CSV file.
SAVEFIGS = True

# PREFIX is part of the input/output CSV file names; NUMYEARS sets the width
# of the birth window below and is also embedded in the output file name.
PREFIX = 'loose'
NUMYEARS = 15

# All thresholds below are expressed in months.
# Bounds applied to the birth-interval columns.
MIN_BIRTH_INTERVAL_MONTH = 10
MAX_BIRTH_INTERVAL_MONTH = NUMYEARS * 12
# Bounds applied to the marriage-to-birth offset (`mar_diff`).
MIN_MAR_DIFF = 0
MAX_MAR_DIFF = 12 * 25

In [2]:
# Load the pipe-separated marriage/birth link table for the configured PREFIX.
source_csv = 'datafiles/dataframes/working/huwelijk_geboortes_familie_maanden_' + PREFIX + '.csv'
df = pd.read_csv(source_csv, sep="|")

# Both date columns are converted to datetimes so the `.dt` accessor and date
# arithmetic used in later cells work.
for date_col in ['event_date', 'mar_date']:
    df[date_col] = pd.to_datetime(df[date_col])

# Order rows per id_1, by event date.
df.sort_values(by=['id_1', 'event_date'], inplace=True)

In [3]:
# Inspect the column names of the loaded frame.
df.columns

Index([u'id_1', u'event_date', u'dist', u'target_name_1', u'target_name_2',
       u'target_name_3', u'target_name_4', u'id_2', u'candidate_name_1',
       u'candidate_name_2', u'candidate_name_3', u'candidate_name_4',
       u'length_name_1', u'length_name_2', u'length_name_3', u'length_name_4',
       u'dist_name_1', u'dist_name_2', u'dist_name_3', u'dist_name_4',
       u'mar_date', u'date_diff_month'],
      dtype='object')

In [4]:
# Preview only the first rows of the loaded frame instead of rendering the
# whole DataFrame.
df.head(10)

Unnamed: 0,id_1,event_date,dist,target_name_1,target_name_2,target_name_3,target_name_4,id_2,candidate_name_1,candidate_name_2,...,length_name_1,length_name_2,length_name_3,length_name_4,dist_name_1,dist_name_2,dist_name_3,dist_name_4,mar_date,date_diff_month
0,698559,1820-11-17 00:00:00,2,ferdinandus,kroes,victoria,strobbe,393954,ferdinand,kroes,...,11,5,8,7,2,0,0,0,1819-11-24 00:00:00,0.0
1,698559,1823-06-03 00:00:00,0,ferdinandus,kroes,victoria,strobbe,223482,ferdinandus,kroes,...,11,5,8,7,0,0,0,0,1819-11-24 00:00:00,31.0
2,698559,1825-05-17 00:00:00,0,ferdinandus,kroes,victoria,strobbe,334134,ferdinandus,kroes,...,11,5,8,7,0,0,0,0,1819-11-24 00:00:00,23.0
3,698559,1832-03-26 00:00:00,0,ferdinandus,kroes,victoria,strobbe,443443,ferdinandus,kroes,...,11,5,8,7,0,0,0,0,1819-11-24 00:00:00,82.0
4,698559,1839-06-23 00:00:00,0,ferdinandus,kroes,victoria,strobbe,624202,ferdinandus,kroes,...,11,5,8,7,0,0,0,0,1819-11-24 00:00:00,87.0
5,698560,1898-01-17 00:00:00,0,henricus,vries,joanna,heijtvelt,285369,henricus,vries,...,8,5,6,9,0,0,0,0,1897-05-14 00:00:00,0.0
6,698560,1899-03-21 00:00:00,0,henricus,vries,joanna,heijtvelt,537391,henricus,vries,...,8,5,6,9,0,0,0,0,1897-05-14 00:00:00,14.0
7,698560,1900-07-13,0,henricus,vries,joanna,heijtvelt,438361,henricus,vries,...,8,5,6,9,0,0,0,0,1897-05-14 00:00:00,16.0
8,698560,1907-03-11,3,henricus,vries,joanna,heijtvelt,596573,henricus,vries,...,8,5,6,9,0,0,0,3,1897-05-14 00:00:00,80.0
9,698560,1911-08-30,0,henricus,vries,joanna,heijtvelt,529099,henricus,vries,...,8,5,6,9,0,0,0,0,1897-05-14 00:00:00,53.0


In [5]:
# Offset between marriage and event in whole calendar months: year and month
# components are compared separately, so the day of the month is ignored.
years_apart = df['event_date'].dt.year - df['mar_date'].dt.year
months_apart = df['event_date'].dt.month - df['mar_date'].dt.month
df['mar_diff'] = years_apart * 12 + months_apart
df['abs_mar_diff'] = df['mar_diff'].abs()

# Exact date comparison (unlike mar_diff, this does look at the day).
df['birth_before_mar'] = df['event_date'] < df['mar_date']

# Within each id_1, order rows by their distance in months from the marriage.
df.sort_values(by=['id_1', 'abs_mar_diff'], inplace=True)


In [6]:
# Gap between consecutive events (in the current row order) within each
# (id_1, birth_before_mar) group, expressed in whole months. The first row of
# every group has no predecessor and stays NaN here.
#
# `.astype('timedelta64[M]')` is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported), so the conversion is done explicitly: divide by
# numpy's mean month length (30.436875 days = 2629746 s) and floor, which is
# the rounding the former astype-based conversion applied.
SECONDS_PER_MONTH = 2629746.0
event_gap = df.groupby(['id_1', 'birth_before_mar'])['event_date'].diff()
df['date_diff_month'] = np.floor(event_gap.dt.total_seconds() / SECONDS_PER_MONTH)


In [7]:
# Rows without a computed gap get 0. The result is assigned back to the column
# rather than calling fillna(inplace=True) on a column selection, which is a
# chained in-place update that does not modify `df` under Copy-on-Write.
df['date_diff_month'] = df['date_diff_month'].fillna(0)

In [8]:
# A row satisfies the minimum birth interval when its gap is exactly 0 or at
# least MIN_BIRTH_INTERVAL_MONTH months.
gap_months = df['date_diff_month']
df['MIN_BIRTH_INTERVAL'] = gap_months.eq(0) | gap_months.ge(MIN_BIRTH_INTERVAL_MONTH)

# The marriage-to-event offset must be at least MIN_MAR_DIFF months.
df['MIN_MAR_INTERVAL'] = df['mar_diff'].ge(MIN_MAR_DIFF)


In [9]:
# Flag rows whose marriage-to-event offset does not exceed MAX_MAR_DIFF months.
# (The column deliberately carries the same name as the constant.)
df['MAX_MAR_DIFF'] = df['mar_diff'].le(MAX_MAR_DIFF)


In [10]:
# Preview only the first rows (now including the interval flag columns)
# instead of rendering the whole DataFrame.
df.head(10)

Unnamed: 0,id_1,event_date,dist,target_name_1,target_name_2,target_name_3,target_name_4,id_2,candidate_name_1,candidate_name_2,...,dist_name_3,dist_name_4,mar_date,date_diff_month,mar_diff,abs_mar_diff,birth_before_mar,MIN_BIRTH_INTERVAL,MIN_MAR_INTERVAL,MAX_MAR_DIFF
0,698559,1820-11-17 00:00:00,2,ferdinandus,kroes,victoria,strobbe,393954,ferdinand,kroes,...,0,0,1819-11-24 00:00:00,0.0,12,12,False,True,True,True
1,698559,1823-06-03 00:00:00,0,ferdinandus,kroes,victoria,strobbe,223482,ferdinandus,kroes,...,0,0,1819-11-24 00:00:00,30.0,43,43,False,True,True,True
2,698559,1825-05-17 00:00:00,0,ferdinandus,kroes,victoria,strobbe,334134,ferdinandus,kroes,...,0,0,1819-11-24 00:00:00,23.0,66,66,False,True,True,True
3,698559,1832-03-26 00:00:00,0,ferdinandus,kroes,victoria,strobbe,443443,ferdinandus,kroes,...,0,0,1819-11-24 00:00:00,82.0,148,148,False,True,True,True
4,698559,1839-06-23 00:00:00,0,ferdinandus,kroes,victoria,strobbe,624202,ferdinandus,kroes,...,0,0,1819-11-24 00:00:00,86.0,235,235,False,True,True,True
5,698560,1898-01-17 00:00:00,0,henricus,vries,joanna,heijtvelt,285369,henricus,vries,...,0,0,1897-05-14 00:00:00,0.0,8,8,False,True,True,True
6,698560,1899-03-21 00:00:00,0,henricus,vries,joanna,heijtvelt,537391,henricus,vries,...,0,0,1897-05-14 00:00:00,14.0,22,22,False,True,True,True
7,698560,1900-07-13,0,henricus,vries,joanna,heijtvelt,438361,henricus,vries,...,0,0,1897-05-14 00:00:00,15.0,38,38,False,True,True,True
8,698560,1907-03-11,3,henricus,vries,joanna,heijtvelt,596573,henricus,vries,...,0,3,1897-05-14 00:00:00,79.0,118,118,False,True,True,True
9,698560,1911-08-30,0,henricus,vries,joanna,heijtvelt,529099,henricus,vries,...,0,0,1897-05-14 00:00:00,53.0,171,171,False,True,True,True


In [11]:
# Running total of the month gaps within each (id_1, birth_before_mar) group,
# accumulated in the current row order. The built-in GroupBy.cumsum() returns
# a Series aligned on df's own index, whereas apply(lambda x: x.cumsum()) can
# prepend the group keys to the index on newer pandas and break the assignment.
df['months_since_first_birth'] = (
    df.groupby(['id_1', 'birth_before_mar'])['date_diff_month'].cumsum()
)

In [12]:
# Preview only the first rows (now including months_since_first_birth)
# instead of rendering the whole DataFrame.
df.head(10)

Unnamed: 0,id_1,event_date,dist,target_name_1,target_name_2,target_name_3,target_name_4,id_2,candidate_name_1,candidate_name_2,...,dist_name_4,mar_date,date_diff_month,mar_diff,abs_mar_diff,birth_before_mar,MIN_BIRTH_INTERVAL,MIN_MAR_INTERVAL,MAX_MAR_DIFF,months_since_first_birth
0,698559,1820-11-17 00:00:00,2,ferdinandus,kroes,victoria,strobbe,393954,ferdinand,kroes,...,0,1819-11-24 00:00:00,0.0,12,12,False,True,True,True,0.0
1,698559,1823-06-03 00:00:00,0,ferdinandus,kroes,victoria,strobbe,223482,ferdinandus,kroes,...,0,1819-11-24 00:00:00,30.0,43,43,False,True,True,True,30.0
2,698559,1825-05-17 00:00:00,0,ferdinandus,kroes,victoria,strobbe,334134,ferdinandus,kroes,...,0,1819-11-24 00:00:00,23.0,66,66,False,True,True,True,53.0
3,698559,1832-03-26 00:00:00,0,ferdinandus,kroes,victoria,strobbe,443443,ferdinandus,kroes,...,0,1819-11-24 00:00:00,82.0,148,148,False,True,True,True,135.0
4,698559,1839-06-23 00:00:00,0,ferdinandus,kroes,victoria,strobbe,624202,ferdinandus,kroes,...,0,1819-11-24 00:00:00,86.0,235,235,False,True,True,True,221.0
5,698560,1898-01-17 00:00:00,0,henricus,vries,joanna,heijtvelt,285369,henricus,vries,...,0,1897-05-14 00:00:00,0.0,8,8,False,True,True,True,0.0
6,698560,1899-03-21 00:00:00,0,henricus,vries,joanna,heijtvelt,537391,henricus,vries,...,0,1897-05-14 00:00:00,14.0,22,22,False,True,True,True,14.0
7,698560,1900-07-13,0,henricus,vries,joanna,heijtvelt,438361,henricus,vries,...,0,1897-05-14 00:00:00,15.0,38,38,False,True,True,True,29.0
8,698560,1907-03-11,3,henricus,vries,joanna,heijtvelt,596573,henricus,vries,...,3,1897-05-14 00:00:00,79.0,118,118,False,True,True,True,108.0
9,698560,1911-08-30,0,henricus,vries,joanna,heijtvelt,529099,henricus,vries,...,0,1897-05-14 00:00:00,53.0,171,171,False,True,True,True,161.0


In [13]:
# Flag rows whose cumulative month count lies in the closed range
# [0, MAX_BIRTH_INTERVAL_MONTH]; negative totals are rejected.
df['MAX_BIRTH_INTERVAL'] = df['months_since_first_birth'].between(0, MAX_BIRTH_INTERVAL_MONTH)

In [14]:
# A row is accepted only if every interval check passes and the event does not
# precede the marriage date.
passes_all_checks = (
    df['MIN_BIRTH_INTERVAL']
    & df['MIN_MAR_INTERVAL']
    & df['MAX_MAR_DIFF']
    & df['MAX_BIRTH_INTERVAL']
)
df['accepted'] = passes_all_checks & ~df['birth_before_mar']

In [15]:
# Preview only the first rows (now including the final `accepted` flag)
# instead of rendering the whole DataFrame.
df.head(10)

Unnamed: 0,id_1,event_date,dist,target_name_1,target_name_2,target_name_3,target_name_4,id_2,candidate_name_1,candidate_name_2,...,date_diff_month,mar_diff,abs_mar_diff,birth_before_mar,MIN_BIRTH_INTERVAL,MIN_MAR_INTERVAL,MAX_MAR_DIFF,months_since_first_birth,MAX_BIRTH_INTERVAL,accepted
0,698559,1820-11-17 00:00:00,2,ferdinandus,kroes,victoria,strobbe,393954,ferdinand,kroes,...,0.0,12,12,False,True,True,True,0.0,True,True
1,698559,1823-06-03 00:00:00,0,ferdinandus,kroes,victoria,strobbe,223482,ferdinandus,kroes,...,30.0,43,43,False,True,True,True,30.0,True,True
2,698559,1825-05-17 00:00:00,0,ferdinandus,kroes,victoria,strobbe,334134,ferdinandus,kroes,...,23.0,66,66,False,True,True,True,53.0,True,True
3,698559,1832-03-26 00:00:00,0,ferdinandus,kroes,victoria,strobbe,443443,ferdinandus,kroes,...,82.0,148,148,False,True,True,True,135.0,True,True
4,698559,1839-06-23 00:00:00,0,ferdinandus,kroes,victoria,strobbe,624202,ferdinandus,kroes,...,86.0,235,235,False,True,True,True,221.0,False,False
5,698560,1898-01-17 00:00:00,0,henricus,vries,joanna,heijtvelt,285369,henricus,vries,...,0.0,8,8,False,True,True,True,0.0,True,True
6,698560,1899-03-21 00:00:00,0,henricus,vries,joanna,heijtvelt,537391,henricus,vries,...,14.0,22,22,False,True,True,True,14.0,True,True
7,698560,1900-07-13,0,henricus,vries,joanna,heijtvelt,438361,henricus,vries,...,15.0,38,38,False,True,True,True,29.0,True,True
8,698560,1907-03-11,3,henricus,vries,joanna,heijtvelt,596573,henricus,vries,...,79.0,118,118,False,True,True,True,108.0,True,True
9,698560,1911-08-30,0,henricus,vries,joanna,heijtvelt,529099,henricus,vries,...,53.0,171,171,False,True,True,True,161.0,True,True


In [20]:
# Count the rows for every observed combination of the individual check flags.
flag_columns = [
    'MAX_BIRTH_INTERVAL',
    'MAX_MAR_DIFF',
    'MIN_BIRTH_INTERVAL',
    'MIN_MAR_INTERVAL',
    'birth_before_mar',
]
df.groupby(flag_columns).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,id_1,event_date,dist,target_name_1,target_name_2,target_name_3,target_name_4,id_2,candidate_name_1,candidate_name_2,...,dist_name_1,dist_name_2,dist_name_3,dist_name_4,mar_date,date_diff_month,mar_diff,abs_mar_diff,months_since_first_birth,accepted
MAX_BIRTH_INTERVAL,MAX_MAR_DIFF,MIN_BIRTH_INTERVAL,MIN_MAR_INTERVAL,birth_before_mar,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
False,False,False,True,False,49,49,49,49,49,49,49,49,49,49,...,49,49,49,49,49,49,49,49,49,49
False,False,True,True,False,3927,3927,3927,3927,3927,3927,3927,3927,3927,3927,...,3927,3927,3927,3927,3927,3927,3927,3927,3927,3927
False,True,False,False,True,11134,11134,11134,11134,11134,11134,11134,11134,11134,11134,...,11134,11134,11134,11134,11134,11134,11134,11134,11134,11134
False,True,False,True,False,112,112,112,112,112,112,112,112,112,112,...,112,112,112,112,112,112,112,112,112,112
False,True,True,False,True,167,167,167,167,167,167,167,167,167,167,...,167,167,167,167,167,167,167,167,167,167
False,True,True,True,False,35857,35857,35857,35857,35857,35857,35857,35857,35857,35857,...,35857,35857,35857,35857,35857,35857,35857,35857,35857,35857
True,False,False,True,False,11,11,11,11,11,11,11,11,11,11,...,11,11,11,11,11,11,11,11,11,11
True,False,True,True,False,472,472,472,472,472,472,472,472,472,472,...,472,472,472,472,472,472,472,472,472,472
True,True,False,True,False,1374,1374,1374,1374,1374,1374,1374,1374,1374,1374,...,1374,1374,1374,1374,1374,1374,1374,1374,1374,1374
True,True,True,False,True,8732,8732,8732,8732,8732,8732,8732,8732,8732,8732,...,8732,8732,8732,8732,8732,8732,8732,8732,8732,8732


In [269]:
# Persist the extended frame (pipe-separated, without the index) when saving
# is enabled. PREFIX and NUMYEARS are embedded in the output file name.
if SAVEFIGS:
    extended_csv = ''.join([
        'datafiles/dataframes/working/huwelijk_geboortes_familie_maanden_',
        PREFIX,
        '_extended_',
        str(NUMYEARS),
        'years.csv',
    ])
    df.to_csv(extended_csv, sep="|", index=False)