MASHUP DATASET B - PREGNANCY
<br>
This dataset aggregates the observed values in Dataset 5 (live births), Dataset 6 (miscarriages) and Dataset 7 (induced abortions) in order to retrieve the absolute number of pregnancies within our age class of interest, divided by region of event, regardless of how they terminated.
<br><br>
Age values will be aggregated to create an age span that goes from 15 years old to 24 years old. 
<br><br>
<b>N.B.</b> <span style="color: blue;">Dataset 5</span> does not provide values for the 15-24 age class as the other two datasets do; instead, it gives the number of live births aggregated for women <span style="color: blue;">up to 17 years old</span>. However, we considered it valid to aggregate the numbers from this dataset without further distinction of age, since the values for pregnancies under 15 years old are very close to 0 and hence irrelevant.
<br>
For this reason, since in <span style="color: blue;">Dataset 7</span> it was also possible to keep the values for individuals under 15 years old, we have decided to keep them too.

In [82]:
from pandas import *

# Load the three cleaned 2017 source tables (live births, miscarriages,
# induced abortions).
#
# keep_default_na=False stops pandas from turning its default NA markers
# (empty strings, "NA", ...) into NaN, so text columns keep their literal
# content. The value columns are parsed as int64, the others as pandas
# "string" dtype.
#
# NOTE(review): the notebook introduction refers to Datasets 5, 6 and 7,
# while the paths below point to D4, D5 and D6 - confirm which numbering
# is the current one.

live_births = read_csv(
    "../data/srcDS/D4Pregnancy/cleanedDS/cleanedD4-2017-Pregnancy.csv",
    dtype={
        "RESIDENCE_TERR": "string",
        "CITIZENSHIP_MOTHER": "string",
        "MOTHER_AGE": "string",
        "OBS_VALUE": "int64",
    },
    keep_default_na=False,
)

miscarriages = read_csv(
    "../data/srcDS/D5Pregnancy/cleanedDS/cleanedD5-2017-Pregnancy.csv",
    dtype={
        "Territorio": "string",
        "Classe di età": "string",
        "Value": "int64",
    },
    keep_default_na=False,
)

abortions = read_csv(
    "../data/srcDS/D6Pregnancy/cleanedDS/cleanedD6-2017-Pregnancy.csv",
    dtype={
        "Territorio dell'evento": "string",
        "Età e classe di età": "string",
        "Value": "int64",
    },
    keep_default_na=False,
)

# Aggregate LIVE BIRTHS over the remaining age classes: one total per region.
#
# Steps, in order:
#   1. drop the rows whose MOTHER_AGE code is 'Y25';
#   2. drop the territory codes 'ITD1' and 'ITD2', which have no entry in
#      ID_name_dict (presumably the two autonomous provinces already covered
#      by the regional code 'ITDA' - TODO confirm);
#   3. keep only the region code and the observed value, so that the sum
#      does not touch the age/citizenship columns;
#   4. sum OBS_VALUE per region code.
live_births = (
    live_births
    .query("MOTHER_AGE != 'Y25'")
    .query("RESIDENCE_TERR != 'ITD1'")
    .query("RESIDENCE_TERR != 'ITD2'")
    .loc[:, ["RESIDENCE_TERR", "OBS_VALUE"]]
    .groupby("RESIDENCE_TERR", as_index=False)
    .sum()
)


# Lookup table: region code (as used in the RESIDENCE_TERR / ITTER107
# columns) -> region name (as spelled in the "Territorio" columns).
# The insertion order is significant: the code/name lookup frames built
# from this dict inherit it, so the entries are kept in their original order.
ID_name_dict = {
    "ITC1": "Piemonte",
    "ITC2": "Valle d'Aosta / Vallée d'Aoste",
    "ITC3": "Liguria",
    "ITC4": "Lombardia",
    "ITDA": "Trentino Alto Adige / Südtirol",
    "ITD3": "Veneto",
    "ITD4": "Friuli-Venezia Giulia",
    "ITD5": "Emilia-Romagna",
    "ITE1": "Toscana",
    "ITE2": "Umbria",
    "ITE3": "Marche",
    "ITE4": "Lazio",
    "ITF1": "Abruzzo",
    "ITF2": "Molise",
    "ITF3": "Campania",
    "ITF4": "Puglia",
    "ITF5": "Basilicata",
    "ITF6": "Calabria",
    "ITG1": "Sicilia",
    "ITG2": "Sardegna",
}


# Add the natural-language region name to the dataset.
#
# Each region code is looked up directly in ID_name_dict. A plain dict
# lookup (rather than Series.map) is used on purpose: a code that is missing
# from ID_name_dict raises KeyError instead of silently producing an empty
# region name.
region_name = [ID_name_dict[code] for code in live_births["RESIDENCE_TERR"]]

# "Region" is appended as the last column; the final column order is fixed
# later, when the merged table selects its columns explicitly.
live_births["Region"] = region_name
live_births.rename(
    columns={"RESIDENCE_TERR": "ITTER107", "OBS_VALUE": "Live_births"},
    inplace=True,
)

live_births


Unnamed: 0,ITTER107,Live_births,Region
0,ITC1,5448,Piemonte
1,ITC2,142,Valle d'Aosta / Vallée d'Aoste
2,ITC3,1794,Liguria
3,ITC4,13468,Lombardia
4,ITD3,5944,Veneto
5,ITD4,1342,Friuli-Venezia Giulia
6,ITD5,6016,Emilia-Romagna
7,ITDA,1776,Trentino Alto Adige / Südtirol
8,ITE1,4392,Toscana
9,ITE2,1056,Umbria


In [83]:
# Aggregate MISCARRIAGES over all age classes present in the table:
# keep only region name and value, then sum the values per region.
miscarriages = (
    miscarriages[["Territorio", "Value"]]
    .groupby("Territorio", as_index=False)
    .sum()
)

# Code <-> name lookup frame, in ID_name_dict insertion order.
region_code = list(ID_name_dict.keys())
region_name = list(ID_name_dict.values())

DF_ID_name = DataFrame(
    {
        "ITTER107": Series(region_code, dtype="string"),
        "Region": Series(region_name, dtype="string"),
    }
)

# Attach the region code by matching on the region name. merge() defaults to
# an inner join, so only rows whose "Territorio" exactly matches a name in
# the lookup frame are kept. The redundant "Territorio" column is then
# dropped and the value column gets its final name.
miscarriages = (
    merge(DF_ID_name, miscarriages, left_on="Region", right_on="Territorio")
    .drop(columns=["Territorio"])
    .rename(columns={"Value": "Miscarriages"})
)

miscarriages


Unnamed: 0,ITTER107,Region,Miscarriages
0,ITC1,Piemonte,309
1,ITC2,Valle d'Aosta / Vallée d'Aoste,9
2,ITC3,Liguria,92
3,ITC4,Lombardia,599
4,ITDA,Trentino Alto Adige / Südtirol,50
5,ITD3,Veneto,287
6,ITD4,Friuli-Venezia Giulia,75
7,ITD5,Emilia-Romagna,234
8,ITE1,Toscana,221
9,ITE2,Umbria,41


In [84]:
# Aggregate ABORTIONS over all age classes present in the table:
# keep only region name and value, then sum the values per region.
abortions = (
    abortions[["Territorio dell'evento", "Value"]]
    .groupby("Territorio dell'evento", as_index=False)
    .sum()
)

# Code <-> name lookup frame, in ID_name_dict insertion order.
region_code = list(ID_name_dict.keys())
region_name = list(ID_name_dict.values())

DF_ID_name = DataFrame(
    {
        "ITTER107": Series(region_code, dtype="string"),
        "Region": Series(region_name, dtype="string"),
    }
)

# Attach the region code by matching on the region name. merge() defaults to
# an inner join, so only rows whose "Territorio dell'evento" exactly matches
# a name in the lookup frame are kept. The redundant name column is then
# dropped and the value column gets its final name.
abortions = (
    merge(DF_ID_name, abortions, left_on="Region", right_on="Territorio dell'evento")
    .drop(columns=["Territorio dell'evento"])
    .rename(columns={"Value": "Abortions"})
)

abortions


Unnamed: 0,ITTER107,Region,Abortions
0,ITC1,Piemonte,1670
1,ITC2,Valle d'Aosta / Vallée d'Aoste,46
2,ITC3,Liguria,664
3,ITC4,Lombardia,3381
4,ITDA,Trentino Alto Adige / Südtirol,326
5,ITD3,Veneto,1116
6,ITD4,Friuli-Venezia Giulia,343
7,ITD5,Emilia-Romagna,1644
8,ITE1,Toscana,1310
9,ITE2,Umbria,275


Now that the age classes of all my datasets have been aggregated, I proceed to merge them on the region code and calculate the total number of pregnancies from the value columns.

In [85]:
# Join the three per-region tables on the region code (ITTER107).
#
# All three inputs carry a "Region" column. Merging them as they are makes
# pandas suffix the duplicates (Region_x / Region_y), and the un-suffixed
# "Region" ends up coming from `abortions`, the left-joined table, so a
# region missing from `abortions` would get an empty name. The region name is
# therefore taken from `live_births` only, and dropped from the other two.
PregnancyDS = merge(live_births, miscarriages.drop(columns="Region"), on="ITTER107")

# how="left": keep regions that have no abortion figures (Lazio's value was
# not available when this was written); their "Abortions" cell is NaN.
PregnancyDS = merge(PregnancyDS, abortions.drop(columns="Region"), on="ITTER107", how="left")

# Total pregnancies per region. The three outcome columns are named
# explicitly so that no other numeric column can leak into the sum; missing
# values are skipped (pandas' default skipna=True).
total = PregnancyDS[["Live_births", "Miscarriages", "Abortions"]].sum(axis=1)
PregnancyDS["Total"] = total

PregnancyDS = PregnancyDS[["ITTER107", "Region", "Live_births", "Miscarriages", "Abortions", "Total"]]
PregnancyDS.to_csv("../data/mashupDS/PregnancyDS.csv", index=False)
PregnancyDS

Unnamed: 0,ITTER107,Region,Live_births,Miscarriages,Abortions,Total
0,ITC1,Piemonte,5448,309,1670,7427
1,ITC2,Valle d'Aosta / Vallée d'Aoste,142,9,46,197
2,ITC3,Liguria,1794,92,664,2550
3,ITC4,Lombardia,13468,599,3381,17448
4,ITD3,Veneto,5944,287,1116,7347
5,ITD4,Friuli-Venezia Giulia,1342,75,343,1760
6,ITD5,Emilia-Romagna,6016,234,1644,7894
7,ITDA,Trentino Alto Adige / Südtirol,1776,50,326,2152
8,ITE1,Toscana,4392,221,1310,5923
9,ITE2,Umbria,1056,41,275,1372
