In [77]:
import pandas as pd
from ETLPipeline.extract.get_data_from_kaggle import download_crime_dataset, download_mental_health_dataset
from ETLPipeline.extract.get_data_from_file import get_gapminder_data_from_file
from ETLPipeline.transform.transform_crime_mental_health_dataframes import transform_crime_mental_health_dataframes
from ETLPipeline.load.mysql import load_dataframe_to_mysql

In [78]:
# Load the Gapminder data through the project's extract helper.
# NOTE(review): the source file location and the returned schema are defined
# inside `get_gapminder_data_from_file` and are not visible in this notebook.
gapminder_df = get_gapminder_data_from_file()

In [None]:
# Remove the "rownames" column from the loaded frame.
# `errors="ignore"` keeps this cell idempotent: it rebinds `gapminder_df`, so a
# second run would otherwise raise KeyError because the column is already gone.
gapminder_df = gapminder_df.drop(columns=["rownames"], errors="ignore")

In [80]:
# Country names to keep; rows of `gapminder_df` whose "country" value is not in
# this set are discarded by the filtering cell further down.
# NOTE(review): presumably the countries covered by the crime / mental-health
# datasets — confirm where this list was derived from.
entities = {'Grenada', 'Barbados', 'Kenya', 'Sweden', 'Bosnia and Herzegovina', 'Lithuania', 'Mongolia', 'Norway', 'Serbia', 'Montenegro', 'Mexico', 'New Zealand', 'Nicaragua', 'Austria', 'Solomon Islands', 'Belize', 'Chile', 'Slovakia', 'Bahrain', 'Switzerland', 'Saint Vincent and the Grenadines', 'Hungary', 'Burundi', 'Kuwait', 'India', 'Guyana', 'Australia', 'Sierra Leone', 'Colombia', 'Poland', 'Iceland', 'Germany', 'Portugal', 'Ukraine', 'Paraguay', 'Trinidad and Tobago', 'Oman', 'Algeria', 'Morocco', 'South Korea', 'Canada', 'Netherlands', 'Philippines', 'Finland', 'Moldova', 'Turkey', 'Spain', 'Argentina', 'Czech Republic', 'Brazil', 'Guatemala', 'Kyrgyzstan', 'Egypt', 'Mozambique', 'Mauritius', 'Russia', 'Armenia', 'Yemen', 'Albania', 'Honduras', 'Sao Tome and Principe', 'Panama', 'Romania', 'Azerbaijan', 'Belarus', 'Latvia', 'Uganda', 'Bulgaria', 'Ireland', 'Greece', 'Slovenia', 'Costa Rica', 'Georgia', 'Malta', "Cote d'Ivoire", 'Belgium', 'Cyprus', 'Syria', 'Maldives', 'France', 'Guinea', 'Kazakhstan', 'Estonia', 'Bahamas', 'Croatia', 'Israel', 'Luxembourg', 'Bolivia', 'El Salvador', 'Japan', 'Italy', 'Jamaica', 'Singapore', 'Tajikistan'}

# Mapping of country name -> set of years to keep for that country.
# The (country, year) filtering cell further down keeps a row only when its
# "year" is in the set stored under its "country"; a country that has no entry
# here therefore keeps no rows at all.
# NOTE(review): the year sets are not always contiguous (e.g. Belgium has no
# 2004, Israel no 2009) — presumably mirrors gaps in the source data; confirm.
dates = {'Albania': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Algeria': {2006, 2007, 2008, 2009, 2010, 2011},
             'Argentina': {2007, 2008},
             'Armenia': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Australia': {2008, 2009, 2010, 2011},
             'Austria': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Azerbaijan': {2007, 2008, 2009, 2010},
             'Bahamas': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Bahrain': {2007, 2008},
             'Barbados': {2009},
             'Belarus': {2004, 2005, 2006, 2007, 2008, 2009},
             'Belgium': {2003, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Belize': {2009, 2010, 2011},
             'Bolivia': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Bosnia and Herzegovina': {2007, 2008, 2009, 2010, 2011},
             'Brazil': {2006, 2007, 2008, 2009, 2010, 2011},
             'Bulgaria': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Burundi': {2008, 2009, 2010, 2011},
             'Canada': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Chile': {2006, 2007, 2008, 2009, 2010, 2011},
             'Colombia': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Costa Rica': {2006, 2007, 2008, 2009, 2010, 2011},
             "Cote d'Ivoire": {2007, 2008},
             'Croatia': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Cyprus': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Czech Republic': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Egypt': {2006, 2007, 2008, 2009, 2010, 2011},
             'El Salvador': {2007, 2008, 2009, 2010, 2011},
             'Estonia': {2003, 2004, 2005, 2006, 2007, 2008, 2009},
             'Finland': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'France': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Georgia': {2004, 2005, 2006, 2007, 2008, 2009, 2010},
             'Germany': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Greece': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Grenada': {2005, 2006, 2007, 2008, 2009, 2010},
             'Guatemala': {2004, 2005, 2006, 2007, 2008, 2009},
             'Guinea': {2007},
             'Guyana': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Honduras': {2011},
             'Hungary': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Iceland': {2005, 2006, 2007, 2008},
             'India': {2004, 2005, 2006, 2007, 2008, 2009, 2010},
             'Ireland': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Israel': {2004, 2005, 2006, 2007, 2008, 2010, 2011},
             'Italy': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Jamaica': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Japan': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Kazakhstan': {2006, 2007, 2008, 2009, 2010, 2011},
             'Kenya': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Kuwait': {2004, 2005, 2006, 2007, 2008, 2009},
             'Kyrgyzstan': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2011},
             'Latvia': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010},
             'Lithuania': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Luxembourg': {2006, 2007, 2008, 2009, 2010, 2011},
             'Maldives': {2007, 2008},
             'Malta': {2007, 2008, 2009, 2010, 2011},
             'Mauritius': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Mexico': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Moldova': {2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Mongolia': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Montenegro': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Morocco': {2003, 2004, 2005, 2006, 2007, 2008, 2009},
             'Mozambique': {2004, 2005, 2006, 2007, 2008, 2009},
             'Netherlands': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'New Zealand': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Nicaragua': {2010},
             'Norway': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Oman': {2007, 2008},
             'Panama': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Paraguay': {2006, 2007, 2008, 2009, 2010, 2011},
             'Philippines': {2006, 2007, 2008, 2009, 2010, 2011},
             'Poland': {2007, 2008, 2009, 2010, 2011},
             'Portugal': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Romania': {2007, 2008},
             'Russia': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Saint Vincent and the Grenadines': {2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Sao Tome and Principe': {2006, 2007, 2008, 2009, 2010, 2011},
             'Serbia': {2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Sierra Leone': {2007, 2008},
             'Singapore': {2006, 2007, 2008, 2009, 2010, 2011},
             'Slovakia': {2006, 2007, 2008, 2009, 2010, 2011},
             'Slovenia': {2003,
              2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Solomon Islands': {2004, 2005, 2006, 2007, 2008},
             'South Korea': {2006, 2007, 2008, 2009, 2010, 2011},
             'Spain': {2007, 2008, 2009, 2010, 2011},
             'Sweden': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011},
             'Switzerland': {2009, 2010, 2011},
             'Syria': {2007, 2008},
             'Tajikistan': {2006, 2007, 2008, 2009, 2010, 2011},
             'Trinidad and Tobago': {2004,
              2005,
              2006,
              2007,
              2008,
              2009,
              2010,
              2011},
             'Turkey': {2003, 2004, 2005, 2006, 2007, 2008},
             'Uganda': {2005, 2006, 2007, 2008, 2009, 2010},
             'Ukraine': {2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010},
             'Yemen': {2005, 2006, 2007, 2008, 2009}}

In [81]:
# Restrict the frame to rows whose "country" is one of the selected `entities`.
gapminder_df = gapminder_df.loc[gapminder_df["country"].isin(entities)]

In [82]:
# Keep only the rows whose (country, year) pair is listed in `dates`; a country
# with no entry in `dates` matches no year and is dropped entirely.
# The mask is built by walking the two columns side by side instead of calling
# `DataFrame.apply(..., axis=1)`, which constructs a full Series object for
# every row just to read two fields. Membership is still tested with the plain
# Python `in` operator, so the selection is unchanged.
# Wrapping the flags in a boolean Series aligned to the frame's index keeps the
# indexing well defined even when the frame is empty.
gapminder_df = gapminder_df[
    pd.Series(
        [
            year in dates.get(country, set())
            for country, year in zip(gapminder_df["country"], gapminder_df["year"])
        ],
        index=gapminder_df.index,
        dtype=bool,
    )
].reset_index(drop=True)  # renumber rows 0..n-1 after filtering

In [96]:
# Display the filtered frame as the cell output to inspect the result of the
# country and (country, year) filters.
gapminder_df

Unnamed: 0,country,year,infant_mortality,life_expectancy,fertility,population,gdp,continent,region
0,Belgium,2003,4.2,78.5,1.70,10426169.0,2.396593e+11,Europe,Western Europe
1,Bulgaria,2003,15.5,72.3,1.25,7805041.0,1.483846e+10,Europe,Eastern Europe
2,Canada,2003,5.2,79.8,1.54,31596593.0,7.737140e+11,Americas,Northern America
3,Croatia,2003,6.3,75.1,1.38,4390291.0,2.464898e+10,Europe,Southern Europe
4,Czech Republic,2003,4.8,75.6,1.20,10211846.0,6.426426e+10,Europe,Eastern Europe
...,...,...,...,...,...,...,...,...,...
558,Spain,2011,3.8,82.0,1.47,46708366.0,7.171940e+11,Europe,Southern Europe
559,Sweden,2011,2.4,81.7,1.90,9462352.0,3.167986e+11,Europe,Northern Europe
560,Switzerland,2011,3.8,82.6,1.51,7925813.0,3.009384e+11,Europe,Western Europe
561,Tajikistan,2011,43.3,70.1,3.81,7753925.0,2.060715e+09,Asia,Central Asia
