MASHUP DATASET B - PREGNANCY
<br>
This dataset aggregates observed values in Dataset 5 (live births), Dataset 6 (miscarriages) and Dataset 7 (induced abortions) in order to retrieve absolute values of pregnancies within our age class of interest, divided by region of event, regardless of how they terminated.
<br><br>
Age values will be aggregated to create an age span that goes from 15 years old to 24 years old. 
<br><br>
<b>N.B.</b> <span style="color: blue;">Dataset 5</span> does not have values for an age class of 15-24 as the other two datasets have; instead, it gives values of live births aggregated for a class of women <span style="color: blue;">until 17 years old</span>. However, we considered it valid to aggregate numbers from this dataset without further distinction of age, since values for pregnancies under 15 years old are very close to 0, hence irrelevant.
<br>
For this reason, since in <span style="color: blue;">Dataset 7</span> it was also possible to keep values for individuals under 15 years old, we have decided to keep them too.

In [26]:
from pandas import *
import re

# Load the three cleaned 2019 source tables (live births, miscarriages,
# induced abortions). The paths are kept in variables because the year is
# later extracted from them.
pathD4 = "../data/srcDS/D4Pregnancy/cleanedDS/cleanedD4-2019-Pregnancy.csv"
pathD5 = "../data/srcDS/D5Pregnancy/cleanedDS/cleanedD5-2019-Pregnancy.csv"
pathD6 = "../data/srcDS/D6Pregnancy/cleanedDS/cleanedD6-2019-Pregnancy.csv"

# keep_default_na=False: pandas' default NA markers are not converted to NaN,
# so every cell is read with the explicit dtype given below.
live_births = read_csv(
    pathD4,
    keep_default_na=False,
    dtype={
        "RESIDENCE_TERR": "string",
        "CITIZENSHIP_MOTHER": "string",
        "MOTHER_AGE": "string",
        "OBS_VALUE": "int64",
    },
)

miscarriages = read_csv(
    pathD5,
    keep_default_na=False,
    dtype={
        "Territorio": "string",
        "Classe di età": "string",
        "Value": "int64",
    },
)

abortions = read_csv(
    pathD6,
    keep_default_na=False,
    dtype={
        "Territorio dell'evento": "string",
        "Età e classe di età": "string",
        "Value": "int64",
    },
)

# Aggregate LIVE BIRTHS into one value per territory.
#
# Rows are discarded when the mother's age class is 'Y25' or when the
# territory code is ITD1 / ITD2 (these two codes have no entry in the region
# lookup used afterwards -- presumably sub-regional codes; confirm against
# the source dataset).
rows_to_keep = (
    (live_births["MOTHER_AGE"] != "Y25")
    & ~live_births["RESIDENCE_TERR"].isin(["ITD1", "ITD2"])
)
live_births = live_births[rows_to_keep]

# Only the territory code and the observed value are needed; dropping the age
# column lets the remaining values be summed per territory.
live_births = (
    live_births.loc[:, ["RESIDENCE_TERR", "OBS_VALUE"]]
    .groupby("RESIDENCE_TERR", as_index=False)
    .sum()
)


# Lookup table: territory code -> region name.
# Insertion order matters: the lookup frames built from this dict (and hence
# the merged tables) follow it.
ID_name_dict = {
    # ITC* codes
    "ITC1": "Piemonte",
    "ITC2": "Valle d'Aosta / Vallée d'Aoste",
    "ITC3": "Liguria",
    "ITC4": "Lombardia",
    # ITD* codes
    "ITDA": "Trentino Alto Adige / Südtirol",
    "ITD3": "Veneto",
    "ITD4": "Friuli-Venezia Giulia",
    "ITD5": "Emilia-Romagna",
    # ITE* codes
    "ITE1": "Toscana",
    "ITE2": "Umbria",
    "ITE3": "Marche",
    "ITE4": "Lazio",
    # ITF* codes
    "ITF1": "Abruzzo",
    "ITF2": "Molise",
    "ITF3": "Campania",
    "ITF4": "Puglia",
    "ITF5": "Basilicata",
    "ITF6": "Calabria",
    # ITG* codes
    "ITG1": "Sicilia",
    "ITG2": "Sardegna",
}


i = 0  # not referenced by any of the cells visible here; kept as-is

# Attach the natural-language region name to every territory code.
# A code missing from ID_name_dict raises KeyError rather than being
# silently left blank.
region_name = [ID_name_dict[code] for code in live_births["RESIDENCE_TERR"]]
live_births["Region"] = region_name

# Align the column names with the other tables of the mashup.
live_births = live_births.rename(
    columns={"RESIDENCE_TERR": "ITTER107", "OBS_VALUE": "Live_births"}
)

# function to add year

def addyear(pathyear, df, years=("2017", "2018", "2019")):
    """Stamp *df* with the year found in *pathyear* and return that year.

    The year is the first entry of *years* (checked in the given order) that
    occurs as a substring of *pathyear*.

    Args:
        pathyear: Path (or any string) expected to contain the year.
        df: DataFrame to annotate. It is modified in place: a "Time" column
            holding the year string is added unless a column named "Time"
            already exists, in which case the existing column is left alone.
        years: Candidate year strings, in order of precedence. Defaults to the
            three years this function has always recognised.

    Returns:
        The matched year as a string, or ``False`` when none of *years*
        occurs in *pathyear* (in which case *df* is not modified).
    """
    yr = next((candidate for candidate in years if candidate in pathyear), None)
    if yr is None:
        return False

    # Check the whole column index, not just the first label, and always fall
    # through to `return yr`: callers concatenate the result into a file name,
    # so returning None for an already-stamped frame would break them.
    if "Time" not in df.columns:
        df["Time"] = yr
    return yr

# Stamp the live-births table with the year found in its source path; the
# returned year is kept in `yr`.
yr = addyear(pathD4, live_births)
# Bare expression so the notebook renders the resulting table.
live_births


Unnamed: 0,ITTER107,Live_births,Region,Time
0,ITC1,4758,Piemonte,2019
1,ITC2,136,Valle d'Aosta / Vallée d'Aoste,2019
2,ITC3,1600,Liguria,2019
3,ITC4,12100,Lombardia,2019
4,ITD3,5300,Veneto,2019
5,ITD4,1224,Friuli-Venezia Giulia,2019
6,ITD5,5296,Emilia-Romagna,2019
7,ITDA,1642,Trentino Alto Adige / Südtirol,2019
8,ITE1,3762,Toscana,2019
9,ITE2,908,Umbria,2019


In [27]:
# Aggregate MISCARRIAGES into one value per territory name.
miscarriages = (
    miscarriages.loc[:, ["Territorio", "Value"]]
    .groupby("Territorio", as_index=False)
    .sum()
)

# Two-column lookup frame (code, name) built from the region dictionary.
region_code = list(ID_name_dict)
region_name = list(ID_name_dict.values())
DF_ID_name = DataFrame(
    {
        "ITTER107": Series(region_code, dtype="string"),
        "Region": Series(region_name, dtype="string"),
    }
)

# Inner join on the region name: this attaches the territory code and keeps
# only the rows whose "Territorio" matches a name in the lookup.
miscarriages = DF_ID_name.merge(miscarriages, left_on="Region", right_on="Territorio")

# "Territorio" duplicates "Region" after the join; the value column gets its
# final name.
miscarriages = miscarriages.drop(columns=["Territorio"])
miscarriages = miscarriages.rename(columns={"Value": "Miscarriages"})

yr = addyear(pathD5, miscarriages)
miscarriages


Unnamed: 0,ITTER107,Region,Miscarriages,Time
0,ITC1,Piemonte,564,2019
1,ITC2,Valle d'Aosta / Vallée d'Aoste,11,2019
2,ITC3,Liguria,240,2019
3,ITC4,Lombardia,1349,2019
4,ITDA,Trentino Alto Adige / Südtirol,199,2019
5,ITD3,Veneto,839,2019
6,ITD4,Friuli-Venezia Giulia,250,2019
7,ITD5,Emilia-Romagna,665,2019
8,ITE1,Toscana,624,2019
9,ITE2,Umbria,109,2019


In [28]:
# Aggregate ABORTIONS into one value per territory name.
abortions = (
    abortions.loc[:, ["Territorio dell'evento", "Value"]]
    .groupby("Territorio dell'evento", as_index=False)
    .sum()
)

# Two-column lookup frame (code, name) built from the region dictionary.
region_code = list(ID_name_dict)
region_name = list(ID_name_dict.values())
DF_ID_name = DataFrame(
    {
        "ITTER107": Series(region_code, dtype="string"),
        "Region": Series(region_name, dtype="string"),
    }
)

# Inner join on the region name: this attaches the territory code and keeps
# only the rows whose territory matches a name in the lookup.
abortions = DF_ID_name.merge(
    abortions, left_on="Region", right_on="Territorio dell'evento"
)

# The territory column duplicates "Region" after the join; the value column
# gets its final name.
abortions = abortions.drop(columns=["Territorio dell'evento"])
abortions = abortions.rename(columns={"Value": "Abortions"})
yr = addyear(pathD6, abortions)

abortions


Unnamed: 0,ITTER107,Region,Abortions,Time
0,ITC1,Piemonte,1503,2019
1,ITC2,Valle d'Aosta / Vallée d'Aoste,27,2019
2,ITC3,Liguria,597,2019
3,ITC4,Lombardia,3059,2019
4,ITDA,Trentino Alto Adige / Südtirol,268,2019
5,ITD3,Veneto,1010,2019
6,ITD4,Friuli-Venezia Giulia,353,2019
7,ITD5,Emilia-Romagna,1441,2019
8,ITE1,Toscana,1197,2019
9,ITE2,Umbria,205,2019


Now that the age classes of all my datasets have been aggregated, I proceed to merge them on the region code and calculate the total pregnancies per region by summing the value columns.

In [29]:
# Merge the three per-region tables on the region code.
#
# Only the value column is taken from miscarriages and abortions, so "Region"
# and "Time" always come from live_births. Merging the full frames would
# instead leave the unsuffixed "Region"/"Time" columns coming from the
# left-joined abortions table, i.e. NaN for any region it does not cover.
PregnancyDS = merge(
    live_births,
    miscarriages[["ITTER107", "Miscarriages"]],
    on="ITTER107",
)
PregnancyDS = merge(
    PregnancyDS,
    abortions[["ITTER107", "Abortions"]],
    on="ITTER107",
    how="left",  # keep regions with no abortion figure (Lazio's value is not available yet)
)

# Total pregnancies per region: sum exactly the three value columns rather
# than "whatever is numeric". A missing abortion value (NaN) is skipped by
# sum(), so the total then covers live births and miscarriages only.
value_columns = ["Live_births", "Miscarriages", "Abortions"]
total = PregnancyDS[value_columns].sum(axis=1)
PregnancyDS["Total"] = total

PregnancyDS = PregnancyDS[["ITTER107", "Region", "Live_births", "Miscarriages", "Abortions", "Total", "Time"]]
# `yr` is the year string returned by the most recent addyear() call.
PregnancyDS.to_csv("../data/mashupDS/MD2-ASS-" + yr + ".csv", index=False)
PregnancyDS

Unnamed: 0,ITTER107,Region,Live_births,Miscarriages,Abortions,Total,Time
0,ITC1,Piemonte,4758,564,1503,6825,2019
1,ITC2,Valle d'Aosta / Vallée d'Aoste,136,11,27,174,2019
2,ITC3,Liguria,1600,240,597,2437,2019
3,ITC4,Lombardia,12100,1349,3059,16508,2019
4,ITD3,Veneto,5300,839,1010,7149,2019
5,ITD4,Friuli-Venezia Giulia,1224,250,353,1827,2019
6,ITD5,Emilia-Romagna,5296,665,1441,7402,2019
7,ITDA,Trentino Alto Adige / Südtirol,1642,199,268,2109,2019
8,ITE1,Toscana,3762,624,1197,5583,2019
9,ITE2,Umbria,908,109,205,1222,2019
