MASHUP DATASET B - PREGNANCY
<br>
This dataset aggregates observed values in Dataset 5 (live births), Dataset 6 (miscarriages) and Dataset 7 (induced abortions) in order to retreive absolute values of pregnancies whithin our class-age of interest, divided by region of event, regardless of how they terminated.
<br><br>
Age values will be aggregated to create an age span that goes from 15 years old to 24 years old. 
<br><br>
<b>N.B.</b> <span style="color: blue;">Dataset 5</span> does not have values for a class age of 15 -24 as the other two datasets have, instead, it gives values of live births aggregated for a class of women <span style="color: blue;">until 17 years old</span>. However, we considered valid to aggregate numbers from this dataset without further distinction of age, since values for pregnancies under 15 years old are very close to 0, hence irrelevant.
<br>
For this reason, since in <span style="color: blue;">Dataset 7</span> was also possible to keep values for individuals under 15 years old, we have decided to keep them too.

In [1]:
from pandas import *

# reading the datasets 

live_births = read_csv("../data/srcDS/D4Pregnancy/cleanedDS/cleanedD4-2017-Pregnancy.csv", keep_default_na=False,
            dtype= {
                "RESIDENCE_TERR":"string",
                "CITIZENSHIP_MOTHER": "string",
                "MOTHER_AGE" :"string",
                "OBS_VALUE" : "int64"
            })
miscarriages = read_csv("../data/srcDS/D5Pregnancy/cleanedDS/cleanedD5-2017-Pregnancy.csv", keep_default_na=False,
             dtype= {
                "Territorio":"string",
                "Classe di età": "string",
                "Value" : "int64"
            })
abortions = read_csv("../data/srcDS/D6Pregnancy/cleanedDS/cleanedD6-2017-Pregnancy.csv", keep_default_na=False,
             dtype= {
                "Territorio dell'evento":"string",
                "Età e classe di età": "string",
                "Value" : "int64"
            })

# aggregating age groups for LIVE BIRTHS

live_births = live_births.query("MOTHER_AGE != 'Y25'") 
live_births = live_births.query("RESIDENCE_TERR != 'ITD1'")
live_births = live_births.query("RESIDENCE_TERR != 'ITD2'") # create a df without data for Y25
live_births = live_births[["RESIDENCE_TERR", "OBS_VALUE"]] #take out the column with mother_age so I can sum all of the values per region
live_births = live_births.groupby("RESIDENCE_TERR", as_index=False).sum()


# create a dictionary to map every region ID with the Region's name
ID_name_dict = { "ITC1" : "Piemonte",
                "ITC2" : "Valle d'Aosta / Vallée d'Aoste",
                "ITC3" : "Liguria",
                "ITC4" : "Lombardia",
                "ITDA" : "Trentino Alto Adige / Südtirol",
                "ITD3" : "Veneto",
                "ITD4" : "Friuli-Venezia Giulia",
                "ITD5" : "Emilia-Romagna",
                "ITE1" : "Toscana",
                "ITE2" : "Umbria",
                "ITE3" : "Marche",
                "ITE4" : "Lazio",
                "ITF1" : "Abruzzo",
                "ITF2" : "Molise",
                "ITF3" : "Campania",
                "ITF4" : "Puglia",
                "ITF5" : "Basilicata",
                "ITF6" : "Calabria",
                "ITG1" : "Sicilia",
                "ITG2" : "Sardegna"
                }


i = 0

# substitute every region ID with the Region's name and change col name

while i < len(live_births):
    live_births.at[i,'RESIDENCE_TERR'] = ID_name_dict[live_births.at[i, "RESIDENCE_TERR"]]
    i += 1

live_births.rename(columns = {"RESIDENCE_TERR" : "Territorio", "OBS_VALUE" : "live_births"}, inplace=True)

live_births


Unnamed: 0,Territorio,live_births
0,Piemonte,5448
1,Valle d'Aosta / Vallée d'Aoste,142
2,Liguria,1794
3,Lombardia,13468
4,Veneto,5944
5,Friuli-Venezia Giulia,1342
6,Emilia-Romagna,6016
7,Trentino Alto Adige / Südtirol,1776
8,Toscana,4392
9,Umbria,1056


In [2]:
# aggregating age groups for MISCARRIAGES
# this is the only dataset for which we don't have data below 15 years old

miscarriages = miscarriages[["Territorio", "Value"]]
miscarriages = miscarriages.groupby("Territorio", as_index=False).sum()

miscarriages.rename(columns = {"Value" : "Miscarriages"}, inplace=True)

miscarriages


Unnamed: 0,Territorio,Miscarriages
0,Abruzzo,61
1,Basilicata,42
2,Calabria,154
3,Campania,306
4,Emilia-Romagna,234
5,Friuli-Venezia Giulia,75
6,Lazio,440
7,Liguria,92
8,Lombardia,599
9,Marche,96


In [3]:
# aggregating age groups for MISCARRIAGES
# MI SONO DIMENTICATA IL LAZIO PORCODDIO

for idx, row in abortions.iterrows():  # clean from unwanted values
    if abortions.at[idx, "Età e classe di età"] == "non indicato":
        abortions.drop(idx)
    elif abortions.at[idx, "Età e classe di età"] == "25-29anni":
        abortions.drop(idx)

abortions = abortions[["Territorio dell'evento", "Value"]]
abortions = abortions.groupby("Territorio dell'evento", as_index=False).sum()

abortions.rename(columns = {"Territorio dell'evento" : "Territorio", "Value" : "Abortions"}, inplace=True)

abortions


Unnamed: 0,Territorio,Abortions
0,Abruzzo,697
1,Basilicata,221
2,Calabria,891
3,Campania,3296
4,Emilia-Romagna,3151
5,Friuli-Venezia Giulia,596
6,Liguria,1176
7,Lombardia,6250
8,Marche,678
9,Molise,172


Now that all my datataset's age class has been aggregated I proceed to merge them based on region's value and I calculate the total pregnancies from the value column

In [4]:
PregnancyDS = merge(live_births, miscarriages, left_on= "Territorio", right_on="Territorio")
PregnancyDS = merge(PregnancyDS, abortions, left_on= "Territorio", right_on="Territorio", how="left") #how="left" is because I don'7 have lazio's value rn

total = PregnancyDS.sum(axis=1, numeric_only=True) # sums all pregnancies value per region
PregnancyDS["Total"] = total

PregnancyDS.to_csv("../data/mashupDS/PregnancyDS.csv", index=False)

PregnancyDS

# N.B. rn Abortions dtype is float because it has a NaN value, which is missing Lazio, I don't bother correcting it since I'll download a new correct database

Unnamed: 0,Territorio,live_births,Miscarriages,Abortions,Total
0,Piemonte,5448,309,3024.0,8781.0
1,Valle d'Aosta / Vallée d'Aoste,142,9,80.0,231.0
2,Liguria,1794,92,1176.0,3062.0
3,Lombardia,13468,599,6250.0,20317.0
4,Veneto,5944,287,2109.0,8340.0
5,Friuli-Venezia Giulia,1342,75,596.0,2013.0
6,Emilia-Romagna,6016,234,3151.0,9401.0
7,Trentino Alto Adige / Südtirol,1776,50,594.0,2420.0
8,Toscana,4392,221,2463.0,7076.0
9,Umbria,1056,41,539.0,1636.0
