In [56]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import sqlalchemy as sf
import psycopg2

In [57]:
from sql_functions import get_engine
from sql_functions import get_dataframe
from sql_functions import get_sql_config

# Info
<span style="color:orange">Population data for Great Britain for 2021 was not available on the Eurostat website, so it had to be downloaded from a British public database instead. Only data for 2020 was available there.</span>

<span style="color:orange">In this notebook the data is cleaned and concatenated.</span>

Step 1: Import file and create dataframe with individual variable

In [58]:

# Raw inputs: the population table for ages 18-70 and the Great Britain
# age-group table for 2020.
population_csv = "data/population18_70.csv"
great_britain_csv = "data/greatbritain_age_groups_2020.csv"

population18_70 = pd.read_csv(population_csv)
population_gb = pd.read_csv(great_britain_csv)


# First dataframe

Step 2: Inspect dataframe with head(), info(), shape, columns, tail(), describe()

In [59]:
# Preview the first rows of both raw tables side by side.
display(population18_70.head(), population_gb.head())

Unnamed: 0,TIME,GEO,AGE,SEX,UNIT,Value,Flag and Footnotes
0,2021,European Union - 27 countries (from 2020),18 years,Total,Number,4598262,ep
1,2021,European Union - 27 countries (from 2020),19 years,Total,Number,4676349,ep
2,2021,European Union - 27 countries (from 2020),20 years,Total,Number,4820916,ep
3,2021,European Union - 27 countries (from 2020),21 years,Total,Number,4771810,ep
4,2021,European Union - 27 countries (from 2020),22 years,Total,Number,4789687,ep


Unnamed: 0,Ethnicity,Age 18 to 19,Age 20 to 24,Age 25 to 29,Age 30 to 34,Age 35 to 39,Age 40 to 44,Age 45 to 49,Age 50 to 54,Age 55 to 59,Age 60 to 64,Age 65 to 69,Total
0,All Ethnic groups,1460156,3807245,3836609,3683915,3732161,4099089,4100526,3601694,3183915,3377162,2674161,37556633
1,White: Total,1204800,3086824,3076127,2931879,3083346,3529082,3637280,3241624,2919608,3196687,2543742,32450999
2,English/Welsh/Scottish/Northern Irish/British,1151265,2843471,2639988,2505636,2778339,3295988,3448763,3088536,2793918,3081271,2450117,30077292
3,Irish,6124,22386,30133,31422,32529,39539,42240,39414,42029,47417,46745,379978
4,Gypsy or Irish Traveller,1847,4711,4391,4009,3947,4030,3728,2947,2165,1833,1281,34889


Step 3: Delete unnecessary columns and rows

In [60]:
# For further research we only need the country label and the population count,
# restricted to the countries under study.
selected_countries = [
    'Austria',
    'Denmark',
    'France',
    'Germany including former GDR',
    'Italy',
    'Netherlands',
    'Poland',
    'Romania',
    'Spain',
]

population18_70 = (
    population18_70
    .loc[population18_70['GEO'].isin(selected_countries), ['GEO', 'Value']]
    # Take an independent copy: later cells assign into columns of this frame,
    # which would otherwise write to a slice of the raw table and trigger
    # pandas' SettingWithCopyWarning.
    .copy()
)


Step 4: Rename columns lower case, snake case, spaces, delimiters  

In [61]:
# Check which country labels survived the filter.
geo_labels = population18_70['GEO'].unique()
geo_labels

array(['Denmark', 'Germany including former GDR', 'Spain', 'France',
       'Italy', 'Netherlands', 'Austria', 'Poland', 'Romania'],
      dtype=object)

In [62]:
# Trim surrounding whitespace from the country labels and shorten the long
# label used for Germany. The stripped result has to be assigned back
# (str.strip() returns a new Series), and the replacement is an exact match
# on the whole label, so no regex is needed.
population18_70['GEO'] = (
    population18_70['GEO']
    .str.strip()
    .replace({'Germany including former GDR': 'Germany'})
)

# Lower-case the column names: 'GEO' -> 'geo', 'Value' -> 'value'.
population18_70.columns = population18_70.columns.str.lower()


Convert value to integer

In [63]:
# Remove the comma separators from the string values and convert only the
# 'value' column to integers. Casting the whole frame with errors='ignore'
# would also attempt the text column 'geo' and silently swallow any failure,
# which can leave 'value' as strings; here a non-numeric entry raises
# instead of passing through unnoticed.
population18_70['value'] = (
    population18_70['value']
    .str.replace(',', '', regex=False)
    .astype('int64')
)

In [64]:
# Collapse the per-age rows into one total per country, keeping 'geo' as a
# regular column rather than the index.
population18_70 = population18_70.groupby('geo', as_index=False).sum()

In [65]:

# Preview the aggregated totals per country.
population18_70.head(n=5)

Unnamed: 0,geo,value
0,Austria,6218160
1,Denmark,3892210
2,France,43841505
3,Germany,56960325
4,Italy,40104428


# Second dataframe

In [66]:
# Preview the raw Great Britain age-group table.
population_gb.head(n=5)

Unnamed: 0,Ethnicity,Age 18 to 19,Age 20 to 24,Age 25 to 29,Age 30 to 34,Age 35 to 39,Age 40 to 44,Age 45 to 49,Age 50 to 54,Age 55 to 59,Age 60 to 64,Age 65 to 69,Total
0,All Ethnic groups,1460156,3807245,3836609,3683915,3732161,4099089,4100526,3601694,3183915,3377162,2674161,37556633
1,White: Total,1204800,3086824,3076127,2931879,3083346,3529082,3637280,3241624,2919608,3196687,2543742,32450999
2,English/Welsh/Scottish/Northern Irish/British,1151265,2843471,2639988,2505636,2778339,3295988,3448763,3088536,2793918,3081271,2450117,30077292
3,Irish,6124,22386,30133,31422,32529,39539,42240,39414,42029,47417,46745,379978
4,Gypsy or Irish Traveller,1847,4711,4391,4009,3947,4030,3728,2947,2165,1833,1281,34889


In [67]:
# From the Great Britain table we only need the row covering all ethnic
# groups, and from it only the group label and the 'Total' column
# (the sum over the 18-69 age bands).
population_gb = (
    population_gb
    .loc[population_gb['Ethnicity'] == 'All Ethnic groups', ['Ethnicity', 'Total']]
    # Independent copy: the next cell modifies this frame, which would
    # otherwise be a slice of the raw table (SettingWithCopyWarning).
    .copy()
)

population_gb.head()

Unnamed: 0,Ethnicity,Total
0,All Ethnic groups,37556633


In [68]:
# Convert the table into a format that can be concatenated with
# population18_70: same column names ('geo', 'value') and a country label
# in place of the ethnicity label.
# rename() returns a new frame, so the column assignment below does not
# write into a slice of the raw table and no inplace mutation is needed.
population_gb = population_gb.rename(columns={'Ethnicity': 'geo', 'Total': 'value'})

# Exact-match replacement of the single remaining label.
# NOTE(review): the source file is described as Great Britain data but the
# row is labelled 'United Kingdom' - confirm which coverage is intended.
population_gb['geo'] = population_gb['geo'].replace({'All Ethnic groups': 'United Kingdom'})

# Concat the two dataframes

In [69]:
# Append the Great Britain row to the per-country totals. ignore_index=True
# renumbers the rows 0..n-1; without it the appended row keeps its original
# label 0 and the result would contain a duplicate index label.
# NOTE: re-running this cell on its own appends the row again, because the
# result overwrites population18_70 - rerun from the top instead.
population18_70 = pd.concat([population18_70, population_gb], ignore_index=True)

In [70]:
# get_engine is provided by the project-local sql_functions module.
from sql_functions import get_engine

# Create the database engine once; it is reused for the upload below.
engine = get_engine()

# Target schema and table name in the database.
schema = 'capstone_fish_are_friends'
table_name = 'population18_70'

# Write the records stored in the dataframe to the SQL database.
if engine is not None:
    try:
        population18_70.to_sql(
            name=table_name,      # name of the SQL table
            con=engine,           # reuse the engine created above instead of building a second one
            schema=schema,        # target schema
            if_exists='replace',  # drop the table before inserting new values
            index=False,          # do NOT write the DataFrame index as a column
            chunksize=5000,       # number of rows written per batch
            method='multi',       # pass multiple values in a single INSERT clause
        )
        print(f"The {table_name} table was imported successfully.")
    except Exception as error:
        # psycopg2.DatabaseError is a subclass of Exception, so this single
        # clause catches everything the original tuple did. The error is
        # reported and the engine is cleared, as before.
        print(error)
        engine = None
else:
    print('No engine')

The population18_70 table was imported successfully.
