In [24]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import sqlalchemy as sf
import psycopg2

In [25]:
from sql_functions import get_engine
from sql_functions import get_dataframe
from sql_functions import get_sql_config

Step 1: Import file and create dataframe with individual variable

In [26]:

denmark = pd.read_excel("data/Seafood_Consumption_data/eumofa/denmark_yearly_consumption_volume_value_price.xlsx")
france = pd.read_excel("data/Seafood_Consumption_data/eumofa/france_yearly_consumption_volume_value_price.xlsx")
germany = pd.read_excel("data/Seafood_Consumption_data/eumofa/germany_yearly_consumption_volume_value_price.xlsx")
italy = pd.read_excel("data/Seafood_Consumption_data/eumofa/italy_yearly_consumption_volume_value_price.xlsx")
netherlands = pd.read_excel("data/Seafood_Consumption_data/eumofa/netherlands_yearly_consumption_volume_value_price.xlsx")
portugal = pd.read_excel("data/Seafood_Consumption_data/eumofa/portugal_yearly_consumption_volume_value_price.xlsx")
spain = pd.read_excel("data/Seafood_Consumption_data/eumofa/spain_yearly_consumption_volume_value_price.xlsx")
sweden = pd.read_excel("data/Seafood_Consumption_data/eumofa/sweden_yearly_consumption_volume_value_price.xlsx")

In [27]:

all_countries = [denmark,france,germany,italy,netherlands,portugal,spain,sweden]

countries = pd.concat(all_countries)

In [28]:
countries.tail()

Unnamed: 0,country,commodity_group,species,year,value_1000_eur,volume_t,eur_kg
5,sweden,Salmonids,Salmon,2017.0,4753.38,66042.9,13.8939
6,sweden,Salmonids,Salmon,2018.0,4888.38,59411.4,12.1536
7,sweden,Salmonids,Salmon,2019.0,5687.85,68064.1,11.9666
8,sweden,Salmonids,Salmon,2020.0,8335.81,91509.1,10.9778
9,sweden,Salmonids,Salmon,2021.0,7415.03,80969.4,10.9196


In [29]:
countries = countries.rename(columns={'value_1000_eur': 'volume_t', 'volume_t': 'value_1000_eur'})
countries.head()

Unnamed: 0,country,commodity_group,species,year,volume_t,value_1000_eur,eur_kg
0,Denmark,Salmonids,Salmon,2010.0,2355.43,46318.2,19.6645
1,Denmark,Salmonids,Salmon,2011.0,2446.35,49184.6,20.1053
2,Denmark,Salmonids,Salmon,2012.0,3206.73,58682.7,18.2999
3,Denmark,Salmonids,Salmon,2013.0,2994.76,56390.7,18.8298
4,Denmark,Salmonids,Salmon,2014.0,3148.2,64577.2,20.5125


Step 5: Rename row value names. First letter upper case e.g. EUR = Eur

In [30]:
countries['country'] = countries['country'].str.capitalize()
countries['commodity_group'] = countries['commodity_group'].str.capitalize()
countries['species'] = countries['species'].str.capitalize()

Step 7: Inspect Null Values / NaNs and datatypes with info()  

In [31]:
# no null values detected
countries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126 entries, 0 to 9
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          126 non-null    object 
 1   commodity_group  126 non-null    object 
 2   species          126 non-null    object 
 3   year             126 non-null    float64
 4   volume_t         126 non-null    float64
 5   value_1000_eur   126 non-null    float64
 6   eur_kg           126 non-null    float64
dtypes: float64(4), object(3)
memory usage: 7.9+ KB


Step 9: Change data types if necessary:   
* year = integer
* volume = float, round to one decimal. Convert kilograms into tonnes (1t = 1000 kg)
* value = float, round to two decimals

In [32]:
# convert 'year' into integer
countries = countries.astype({'year':'int'})
print(countries.dtypes)

country             object
commodity_group     object
species             object
year                 int64
volume_t           float64
value_1000_eur     float64
eur_kg             float64
dtype: object


In [33]:
# convert 'value_1000_eur' into 'value_eur'
countries['value_1000_eur'] = countries['value_1000_eur'].multiply(1000)
countries = countries.rename(columns={'value_1000_eur': 'value_eur'})
countries.value_eur = countries.value_eur.round(2)

In [34]:
# round eur_kg to two decimals
countries.eur_kg = countries.eur_kg.round(2)

Step 10: Species cleaning  
* aggregate species = salmon, tuna, lobster, shrimp  
* sum up species in categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon

In [35]:
# aggregate species
countries['species'] = countries['species'].replace(['Shrimp, miscellaneous','Shrimp crangon spp','Tuna, miscellaneous'],['Shrimp','Shrimp','Tuna'])

In [36]:
# aggregate category_species
countries['commodity_group'] = countries['commodity_group'].replace(['Tuna and tuna-like species','Salmonids'],['Pelagic fish','Pelagic fish'])

Step 12: Punctuation = decimals separator: comma, thousands separator: dot 

In [37]:
#countries['value_eur'] = [x.replace('.', ',') for x in countries['value_eur']]

In [38]:
print(countries)

    country commodity_group species  year  volume_t   value_eur  eur_kg
0   Denmark    Pelagic fish  Salmon  2010   2355.43  46318200.0   19.66
1   Denmark    Pelagic fish  Salmon  2011   2446.35  49184600.0   20.11
2   Denmark    Pelagic fish  Salmon  2012   3206.73  58682700.0   18.30
3   Denmark    Pelagic fish  Salmon  2013   2994.76  56390700.0   18.83
4   Denmark    Pelagic fish  Salmon  2014   3148.20  64577200.0   20.51
..      ...             ...     ...   ...       ...         ...     ...
5    Sweden    Pelagic fish  Salmon  2017   4753.38  66042900.0   13.89
6    Sweden    Pelagic fish  Salmon  2018   4888.38  59411400.0   12.15
7    Sweden    Pelagic fish  Salmon  2019   5687.85  68064100.0   11.97
8    Sweden    Pelagic fish  Salmon  2020   8335.81  91509100.0   10.98
9    Sweden    Pelagic fish  Salmon  2021   7415.03  80969400.0   10.92

[126 rows x 7 columns]


In [39]:
countries.country.unique() 

array(['Denmark', 'France', 'Germany', 'Italy', 'Netherlands', 'Portugal',
       'Spain', 'Sweden'], dtype=object)

In [40]:
##eumofa_countries_fishconsumption

# Import get_engine from sql_functions.py. You will need to restart your kernel and rerun at this point since we changed the module since we first imported it.
from sql_functions import get_engine
# create a variable called engine using the get_engine function
engine = get_engine()

# Set the schema to your capstone schema and the table_name variable to 'fish_catch' etc + your initials/group number

schema = 'capstone_fish_are_friends' # capstone schema example capstone_fish_are_friends
table_name = 'eumofa_countries_fishconsumption' # Example: 'carriers_pw' for Philipp Wendt


# Write records stored in a dataframe to SQL database using to_sql() function
if engine!=None:
   try:
      countries.to_sql(name=table_name, # Name of SQL table variable
                        con=get_engine(), # Engine or connection
                        schema=schema, # your class schema variable
                        if_exists='replace', # Drop the table before inserting new values 
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
      print(f"The {table_name} table was imported successfully.")
    # Error handling
   except (Exception, psycopg2.DatabaseError) as error:
      print(error)
      engine = None
else:
   print('No engine')

(psycopg2.OperationalError) could not translate host name "host" to address: nodename nor servname provided, or not known

(Background on this error at: https://sqlalche.me/e/14/e3q8)
