### <span style='color:lightblue'> Import all necessary packages</span> 

In [2]:
# Import all necessary packages

import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np
import psycopg2
import sqlalchemy

from sql_functions import get_dataframe
from sql_functions import get_engine
from sql_functions import get_sql_config

# Convert float Formats to 2 digits
pd.options.display.float_format = '{:,.2f}'.format

## <span style='color:green'> Total export by species in EU countries per year</span> 

### Cleaning Step 1: Import file and create dataframe with individual variable

In [3]:
# import csv file
trading_total_export = pd.read_excel("/Users/nina/neuefische/capstone_project/data/seafood_trading/total_export_by_species_countries_eu_yearly.xlsx")
trading_total_export.head()

Unnamed: 0,COUNTRY,MAIN COMMERCIAL SPECIES,FLOW TYPE,Measures,2004,2005,2006,2007,2008,2009,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,Totals
0,Austria,Lobster Homarus spp,Export,VOLUME (KG),0.0,200.0,100.0,200.0,100.0,100.0,...,5881.0,3228.0,1606.0,2254.0,1997.0,1224.0,660.0,0.0,0.0,36643
1,,,,VALUE (EUR),2141.0,9582.0,15595.0,29985.0,19845.0,14870.0,...,74475.0,62215.0,46932.0,70405.0,60527.0,44103.0,30194.0,48000.0,4000.0,772532
2,,,Totals,VOLUME (KG),0.0,200.0,100.0,200.0,100.0,100.0,...,5881.0,3228.0,1606.0,2254.0,1997.0,1224.0,660.0,0.0,0.0,36643
3,,,,VALUE (EUR),2141.0,9582.0,15595.0,29985.0,19845.0,14870.0,...,74475.0,62215.0,46932.0,70405.0,60527.0,44103.0,30194.0,48000.0,4000.0,772532
4,,Lobster Norway,Export,VOLUME (KG),,,300.0,300.0,300.0,0.0,...,443.0,354.0,1124.0,1467.0,743.0,1264.0,961.0,2000.0,0.0,13844


### Step 2: Inspect dataframe with head(), info(), shape, columns, tail(), describe()

In [4]:
# inspect the dataframe
trading_total_export.info()
#trading_total_export.head()
#trading_total_export.shape
#trading_total_export.columns
#trading_total_export.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1678 entries, 0 to 1677
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   COUNTRY                  29 non-null     object 
 1   MAIN COMMERCIAL SPECIES  434 non-null    object 
 2   FLOW TYPE                839 non-null    object 
 3   Measures                 1678 non-null   object 
 4   2004                     1085 non-null   float64
 5   2005                     1152 non-null   float64
 6   2006                     1128 non-null   float64
 7   2007                     1236 non-null   float64
 8   2008                     1212 non-null   float64
 9   2009                     1240 non-null   float64
 10  2010                     1300 non-null   float64
 11  2011                     1264 non-null   float64
 12  2012                     1332 non-null   float64
 13  2013                     1342 non-null   float64
 14  2014                    

### Step 3 Rename columns lower case, snake case, spaces, delimiters &
### Step 4 Autofill Null Values when merged cell in excel/csv file is empty    

In [5]:
# Autofill NaN Values from merged cells with correct value 
trading_total_export[['COUNTRY','MAIN COMMERCIAL SPECIES','FLOW TYPE']] = trading_total_export[['COUNTRY','MAIN COMMERCIAL SPECIES','FLOW TYPE']].fillna(method='ffill', axis=0)
trading_total_export.head()

#Removing whitespace & rename columns with lower cases
cols = trading_total_export.columns.tolist()                                                                                                                           
cols = [str(col).replace(' ', '_').lower() for col in cols]
trading_total_export.columns = cols
trading_total_export.head()

Unnamed: 0,country,main_commercial_species,flow_type,measures,2004,2005,2006,2007,2008,2009,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,totals
0,Austria,Lobster Homarus spp,Export,VOLUME (KG),0.0,200.0,100.0,200.0,100.0,100.0,...,5881.0,3228.0,1606.0,2254.0,1997.0,1224.0,660.0,0.0,0.0,36643
1,Austria,Lobster Homarus spp,Export,VALUE (EUR),2141.0,9582.0,15595.0,29985.0,19845.0,14870.0,...,74475.0,62215.0,46932.0,70405.0,60527.0,44103.0,30194.0,48000.0,4000.0,772532
2,Austria,Lobster Homarus spp,Totals,VOLUME (KG),0.0,200.0,100.0,200.0,100.0,100.0,...,5881.0,3228.0,1606.0,2254.0,1997.0,1224.0,660.0,0.0,0.0,36643
3,Austria,Lobster Homarus spp,Totals,VALUE (EUR),2141.0,9582.0,15595.0,29985.0,19845.0,14870.0,...,74475.0,62215.0,46932.0,70405.0,60527.0,44103.0,30194.0,48000.0,4000.0,772532
4,Austria,Lobster Norway,Export,VOLUME (KG),,,300.0,300.0,300.0,0.0,...,443.0,354.0,1124.0,1467.0,743.0,1264.0,961.0,2000.0,0.0,13844


### Step 5 Delete non necessary columns and rows

In [6]:
# Delete unnecessary rows with "Totals" Values because they are duplicates

trading_total_export= trading_total_export.drop("2022", axis=1)
trading_total_export= trading_total_export.drop("totals", axis=1)

#Set "Main Commercial Species" as new index to be able to delete rows with "Total" Values
trading_total_export= trading_total_export.set_index("main_commercial_species")
trading_total_export= trading_total_export.drop("Totals", axis=0)

# Reset index and inplace it in dataframe to get back the old index scheme
trading_total_export.reset_index(inplace=True)

#Repeat with flow_types
trading_total_export= trading_total_export.set_index("flow_type")
trading_total_export= trading_total_export.drop("Totals", axis=0)
trading_total_export.reset_index(inplace=True)

### Step 6 Change data types if necessary:   
* year = integer
* volume = float, round to one decimal. Convert kilograms into tonnes (1t = 1000 kg)
* value = float, round to two decimals 

In [7]:
# Convert column "totals" to float

#trading_total_export['totals'] = trading_total_export['totals'].astype(float)
trading_total_export.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   flow_type                810 non-null    object 
 1   main_commercial_species  810 non-null    object 
 2   country                  810 non-null    object 
 3   measures                 810 non-null    object 
 4   2004                     517 non-null    float64
 5   2005                     550 non-null    float64
 6   2006                     538 non-null    float64
 7   2007                     590 non-null    float64
 8   2008                     578 non-null    float64
 9   2009                     592 non-null    float64
 10  2010                     622 non-null    float64
 11  2011                     604 non-null    float64
 12  2012                     638 non-null    float64
 13  2013                     642 non-null    float64
 14  2014                     6

###  Step 7 Rename row value names. First letter upper case e.g. EUR = Eur

In [8]:
# Fill NaN Values in dataframe with 0
trading_total_export.fillna(0, inplace=True)
trading_total_export.head()

Unnamed: 0,flow_type,main_commercial_species,country,measures,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Export,Lobster Homarus spp,Austria,VOLUME (KG),0.0,200.0,100.0,200.0,100.0,100.0,...,7577.0,9069.0,5881.0,3228.0,1606.0,2254.0,1997.0,1224.0,660.0,0.0
1,Export,Lobster Homarus spp,Austria,VALUE (EUR),2141.0,9582.0,15595.0,29985.0,19845.0,14870.0,...,85565.0,101996.0,74475.0,62215.0,46932.0,70405.0,60527.0,44103.0,30194.0,48000.0
2,Export,Lobster Norway,Austria,VOLUME (KG),0.0,0.0,300.0,300.0,300.0,0.0,...,511.0,457.0,443.0,354.0,1124.0,1467.0,743.0,1264.0,961.0,2000.0
3,Export,Lobster Norway,Austria,VALUE (EUR),2141.0,0.0,5735.0,10899.0,6627.0,2093.0,...,8524.0,7801.0,6544.0,4123.0,16228.0,22326.0,10400.0,15720.0,11230.0,62000.0
4,Export,Rock lobster and sea crawfish,Austria,VOLUME (KG),100.0,0.0,100.0,2400.0,3500.0,400.0,...,2865.0,429.0,272.0,7756.0,9355.0,17594.0,13082.0,10981.0,2380.0,0.0


In [9]:
#Rename measure Values
trading_total_export = trading_total_export.replace(["VOLUME (KG)","VALUE (EUR)"],["Volume (Kg)","Value (Eur)"])
trading_total_export.head(10)

Unnamed: 0,flow_type,main_commercial_species,country,measures,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Export,Lobster Homarus spp,Austria,Volume (Kg),0.0,200.0,100.0,200.0,100.0,100.0,...,7577.0,9069.0,5881.0,3228.0,1606.0,2254.0,1997.0,1224.0,660.0,0.0
1,Export,Lobster Homarus spp,Austria,Value (Eur),2141.0,9582.0,15595.0,29985.0,19845.0,14870.0,...,85565.0,101996.0,74475.0,62215.0,46932.0,70405.0,60527.0,44103.0,30194.0,48000.0
2,Export,Lobster Norway,Austria,Volume (Kg),0.0,0.0,300.0,300.0,300.0,0.0,...,511.0,457.0,443.0,354.0,1124.0,1467.0,743.0,1264.0,961.0,2000.0
3,Export,Lobster Norway,Austria,Value (Eur),2141.0,0.0,5735.0,10899.0,6627.0,2093.0,...,8524.0,7801.0,6544.0,4123.0,16228.0,22326.0,10400.0,15720.0,11230.0,62000.0
4,Export,Rock lobster and sea crawfish,Austria,Volume (Kg),100.0,0.0,100.0,2400.0,3500.0,400.0,...,2865.0,429.0,272.0,7756.0,9355.0,17594.0,13082.0,10981.0,2380.0,0.0
5,Export,Rock lobster and sea crawfish,Austria,Value (Eur),6359.0,18476.0,17868.0,26083.0,13922.0,19887.0,...,44246.0,12706.0,9072.0,53655.0,74718.0,119327.0,95278.0,78688.0,18064.0,32000.0
6,Export,Salmon,Austria,Volume (Kg),48100.0,71000.0,147800.0,293700.0,294100.0,358200.0,...,876437.0,833928.0,846027.0,855575.0,889833.0,936454.0,1159429.0,1235316.0,1224399.0,1109000.0
7,Export,Salmon,Austria,Value (Eur),466828.0,635092.0,971574.0,2071971.0,2283900.0,2834298.0,...,8290263.0,10286945.0,9941101.0,10495328.0,11913412.0,13272272.0,17237229.0,18899922.0,20959029.0,18351000.0
8,Export,Shrimp Crangon spp,Austria,Volume (Kg),0.0,0.0,0.0,0.0,0.0,0.0,...,2025.0,876.0,198.0,325.0,23988.0,138699.0,87285.0,55412.0,38697.0,19000.0
9,Export,Shrimp Crangon spp,Austria,Value (Eur),1433.0,966.0,1168.0,538.0,1159.0,553.0,...,20782.0,12018.0,3433.0,5105.0,123847.0,650350.0,550759.0,344305.0,213004.0,126000.0


In [10]:
# Create new column year and put quantity values in it
# Convert column "year" to integer

trading_total_export = trading_total_export.melt(id_vars=["country", "main_commercial_species", "flow_type", "measures"], # Spalten die gleich bleiben sollen
                                                           var_name="year", # neue column
                                                         value_name="quantity") # welche Werte sollen zugeordnet werden
trading_total_export.head(20)


# Convert column "totals" to float
trading_total_export['year'] = trading_total_export['year'].astype(int)
trading_total_export.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14580 entries, 0 to 14579
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   country                  14580 non-null  object 
 1   main_commercial_species  14580 non-null  object 
 2   flow_type                14580 non-null  object 
 3   measures                 14580 non-null  object 
 4   year                     14580 non-null  int64  
 5   quantity                 14580 non-null  float64
dtypes: float64(1), int64(1), object(4)
memory usage: 683.6+ KB


### Step 8 Replace (country/species) abbreviations with full names by using dictionaries 

### Step 9 Inspect Null Values / NaNs and datatypes with info()  

### Step 10 Species cleaning  

In [11]:
# Split Volume and Value data in two different columns

# Create two new dataframes to seperate element values
seafood_volume = trading_total_export[trading_total_export["measures"] == "Volume (Kg)"]
seafood_value = trading_total_export[trading_total_export["measures"] == "Value (Eur)"]

# Merge newly created dataframes 
vol_val_merge = seafood_volume.merge(seafood_value, how ="left", on = ["country","main_commercial_species","flow_type","year"])
vol_val_merge.head()

# Drop unnecessary columns
vol_val_merge.drop(['measures_x', 'measures_y'], axis=1, inplace=True)

# Rename columns quantity_x and quantity_y
vol_val_merge.rename(columns= {'quantity_x': 'volume_kg', 'quantity_y': 'value_eur'}, inplace=True)

#Overwrite new dataframe with old dataframe name
trading_total_export = vol_val_merge
trading_total_export.head(20)


Unnamed: 0,country,main_commercial_species,flow_type,year,volume_kg,value_eur
0,Austria,Lobster Homarus spp,Export,2004,0.0,2141.0
1,Austria,Lobster Norway,Export,2004,0.0,2141.0
2,Austria,Rock lobster and sea crawfish,Export,2004,100.0,6359.0
3,Austria,Salmon,Export,2004,48100.0,466828.0
4,Austria,Shrimp Crangon spp,Export,2004,0.0,1433.0
5,Austria,Shrimp coldwater,Export,2004,200.0,4972.0
6,Austria,Shrimp deep-water rose,Export,2004,0.0,0.0
7,Austria,Shrimp miscellaneous,Export,2004,5800.0,65879.0
8,Austria,Shrimp warmwater,Export,2004,100.0,588.0
9,Austria,Tuna albacore,Export,2004,5800.0,59056.0


In [12]:
#Calculate Kg to tonnes
trading_total_export["volume_t"] = (trading_total_export["volume_kg"] / 1000)


# Drop column "volume_kg"
trading_total_export.drop(columns= {'volume_kg'}, inplace=True)
trading_total_export

Unnamed: 0,country,main_commercial_species,flow_type,year,value_eur,volume_t
0,Austria,Lobster Homarus spp,Export,2004,2141.00,0.00
1,Austria,Lobster Norway,Export,2004,2141.00,0.00
2,Austria,Rock lobster and sea crawfish,Export,2004,6359.00,0.10
3,Austria,Salmon,Export,2004,466828.00,48.10
4,Austria,Shrimp Crangon spp,Export,2004,1433.00,0.00
...,...,...,...,...,...,...
7285,United Kingdom,Tuna bigeye,Export,2021,0.00,0.00
7286,United Kingdom,Tuna bluefin,Export,2021,0.00,0.00
7287,United Kingdom,Tuna miscellaneous,Export,2021,0.00,0.00
7288,United Kingdom,Tuna skipjack,Export,2021,0.00,0.00


In [13]:
# Display unique species
display(trading_total_export["main_commercial_species"].unique())

# Rename sub-species to main species
trading_total_export["main_commercial_species"]= trading_total_export["main_commercial_species"].replace(["Lobster Homarus spp", "Lobster Norway","Rock lobster and sea crawfish"],"Lobster")
trading_total_export["main_commercial_species"]= trading_total_export["main_commercial_species"].replace(["Shrimp Crangon spp", "Shrimp coldwater", "Shrimp deep-water rose", "Shrimp miscellaneous", "Shrimp warmwater"],"Shrimp")
trading_total_export["main_commercial_species"]= trading_total_export["main_commercial_species"].replace(["Tuna albacore", "Tuna bigeye", "Tuna bluefin", "Tuna miscellaneous", "Tuna skipjack", "Tuna skipjack", "Tuna yellowfin"],"Tuna")

display(trading_total_export["main_commercial_species"].unique())

array(['Lobster Homarus spp', 'Lobster Norway',
       'Rock lobster and sea crawfish', 'Salmon', 'Shrimp Crangon spp',
       'Shrimp coldwater', 'Shrimp deep-water rose',
       'Shrimp miscellaneous', 'Shrimp warmwater', 'Tuna albacore',
       'Tuna bigeye', 'Tuna bluefin', 'Tuna miscellaneous',
       'Tuna skipjack', 'Tuna yellowfin'], dtype=object)

array(['Lobster', 'Salmon', 'Shrimp', 'Tuna'], dtype=object)

In [14]:
# aggregate species to categories - categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon
# create list of conditions
conditions = [(trading_total_export["main_commercial_species"].str.contains("Salmon",case=False)),
            (trading_total_export["main_commercial_species"].str.contains("Tuna",case=False)), 
            (trading_total_export["main_commercial_species"].str.contains("Lobster",case=False)), 
            (trading_total_export["main_commercial_species"].str.contains("Shrimp",case=False))]
#create list of values we want to assign for each condition
values = ["Pelagic fish", "Pelagic fish", "Crustaceans", "Crustaceans"]

trading_total_export["categories_species"] = np.select(conditions, values)
trading_total_export.head()

Unnamed: 0,country,main_commercial_species,flow_type,year,value_eur,volume_t,categories_species
0,Austria,Lobster,Export,2004,2141.0,0.0,Crustaceans
1,Austria,Lobster,Export,2004,2141.0,0.0,Crustaceans
2,Austria,Lobster,Export,2004,6359.0,0.1,Crustaceans
3,Austria,Salmon,Export,2004,466828.0,48.1,Pelagic fish
4,Austria,Shrimp,Export,2004,1433.0,0.0,Crustaceans


In [15]:
#Drop column flow type because it is always export
trading_total_export.drop("flow_type",inplace=True,axis=1)
trading_total_export

Unnamed: 0,country,main_commercial_species,year,value_eur,volume_t,categories_species
0,Austria,Lobster,2004,2141.00,0.00,Crustaceans
1,Austria,Lobster,2004,2141.00,0.00,Crustaceans
2,Austria,Lobster,2004,6359.00,0.10,Crustaceans
3,Austria,Salmon,2004,466828.00,48.10,Pelagic fish
4,Austria,Shrimp,2004,1433.00,0.00,Crustaceans
...,...,...,...,...,...,...
7285,United Kingdom,Tuna,2021,0.00,0.00,Pelagic fish
7286,United Kingdom,Tuna,2021,0.00,0.00,Pelagic fish
7287,United Kingdom,Tuna,2021,0.00,0.00,Pelagic fish
7288,United Kingdom,Tuna,2021,0.00,0.00,Pelagic fish


### Step 11 Decoding data

### Step 12 Punctuation = decimals separator: comma, thousands separator: dot    

### Step 13 language = english  

### Last step Merging dataframes: tbd after Steps 1 - 13 are done

In [16]:
#Rearrange order of columns in dataframe

trading_total_export = trading_total_export[['year', 'country', 'main_commercial_species','categories_species','volume_t','value_eur']]
trading_total_export.head()

Unnamed: 0,year,country,main_commercial_species,categories_species,volume_t,value_eur
0,2004,Austria,Lobster,Crustaceans,0.0,2141.0
1,2004,Austria,Lobster,Crustaceans,0.0,2141.0
2,2004,Austria,Lobster,Crustaceans,0.1,6359.0
3,2004,Austria,Salmon,Pelagic fish,48.1,466828.0
4,2004,Austria,Shrimp,Crustaceans,0.0,1433.0


In [17]:
#final dataframes
trading_total_export

Unnamed: 0,year,country,main_commercial_species,categories_species,volume_t,value_eur
0,2004,Austria,Lobster,Crustaceans,0.00,2141.00
1,2004,Austria,Lobster,Crustaceans,0.00,2141.00
2,2004,Austria,Lobster,Crustaceans,0.10,6359.00
3,2004,Austria,Salmon,Pelagic fish,48.10,466828.00
4,2004,Austria,Shrimp,Crustaceans,0.00,1433.00
...,...,...,...,...,...,...
7285,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00
7286,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00
7287,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00
7288,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00


### Upload to DBeaver

In [18]:
# upload dataframe in dbeaver

table_name = 'total_export_by_country_and_species'
engine = get_engine()
schema = 'capstone_fish_are_friends'
# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        trading_total_export.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The total_export_by_country_and_species table was imported successfully.


## <span style='color:green'> Total import by species in EU countries per year</span> 

### Cleaning Step 1: Import file and create dataframe with individual variable


In [19]:
trading_total_import = pd.read_excel("/Users/nina/neuefische/capstone_project/data/seafood_trading/total_import_by_species_countries_eu_yearly.xlsx")
#trading_total_import

### Step 2: Inspect dataframe with head(), info(), shape, columns, tail(), describe()

In [20]:
# inspect the dataframe
trading_total_import.info()
#trading_total_import.head()
#trading_total_import.shape
#trading_total_import.columns
#trading_total_import.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1738 entries, 0 to 1737
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   COUNTRY                  29 non-null     object 
 1   MAIN COMMERCIAL SPECIES  449 non-null    object 
 2   FLOW TYPE                869 non-null    object 
 3   Measures                 1738 non-null   object 
 4   2004                     1372 non-null   float64
 5   2005                     1380 non-null   float64
 6   2006                     1388 non-null   float64
 7   2007                     1536 non-null   float64
 8   2008                     1540 non-null   float64
 9   2009                     1516 non-null   float64
 10  2010                     1584 non-null   float64
 11  2011                     1560 non-null   float64
 12  2012                     1608 non-null   float64
 13  2013                     1634 non-null   float64
 14  2014                    

### Step 3 Rename columns lower case, snake case, spaces, delimiters &
### Step 4 Autofill Null Values when merged cell in excel/csv file is empty    

In [21]:
# Autofill NaN Values from merged cells with correct value 
trading_total_import[['COUNTRY','MAIN COMMERCIAL SPECIES','FLOW TYPE']] = trading_total_import[['COUNTRY','MAIN COMMERCIAL SPECIES','FLOW TYPE']].fillna(method='ffill', axis=0)

#Removing whitespace & rename columns with lower cases
cols = trading_total_import.columns.tolist()                                                                                                                           
cols = [str(col).replace(' ', '_').lower() for col in cols]
trading_total_import.columns = cols

### Step 5 Delete non necessary columns and rows

In [22]:
# Delete unnecessary rows with "Totals" Values because they are duplicates

trading_total_import= trading_total_import.drop("2022", axis=1)
trading_total_import= trading_total_import.drop("totals", axis=1)

#Set "Main Commercial Species" as new index to be able to delete rows with "Total" Values
trading_total_import= trading_total_import.set_index("main_commercial_species")
trading_total_import= trading_total_import.drop("Totals", axis=0)

# Reset index and inplace it in dataframe to get back the old index scheme
trading_total_import.reset_index(inplace=True)

#Repeat with flow_types
trading_total_import= trading_total_import.set_index("flow_type")
trading_total_import= trading_total_import.drop("Totals", axis=0)
trading_total_import.reset_index(inplace=True)

#Drop column flow type because it is always export
trading_total_import.drop("flow_type",inplace=True,axis=1)
trading_total_import

Unnamed: 0,main_commercial_species,country,measures,2004,2005,2006,2007,2008,2009,2010,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Lobster Homarus spp,Austria,VOLUME (KG),68900.00,68100.00,78200.00,83200.00,89200.00,54400.00,89010.00,...,57727.00,58255.00,58024.00,54223.00,60992.00,63813.00,59107.00,78180.00,51169.00,39000.00
1,Lobster Homarus spp,Austria,VALUE (EUR),1165703.00,1343879.00,1532138.00,1719851.00,1402819.00,1125765.00,1410187.00,...,1120481.00,1165723.00,1133924.00,1176778.00,1449729.00,1668106.00,1398331.00,1692097.00,1177118.00,1282000.00
2,"Lobster, Norway",Austria,VOLUME (KG),13700.00,10900.00,22600.00,16300.00,15300.00,16700.00,28384.00,...,33898.00,18439.00,16639.00,21255.00,31264.00,20960.00,21657.00,27845.00,22338.00,21000.00
3,"Lobster, Norway",Austria,VALUE (EUR),166185.00,169256.00,257190.00,275827.00,240109.00,252468.00,352791.00,...,391203.00,238328.00,212879.00,248974.00,383086.00,320660.00,326180.00,518081.00,326046.00,515000.00
4,Rock lobster and sea crawfish,Austria,VOLUME (KG),19600.00,19800.00,21200.00,38800.00,33600.00,24500.00,29991.00,...,28383.00,32451.00,38742.00,39225.00,69016.00,71215.00,58365.00,59795.00,37338.00,63000.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,"Tuna, miscellaneous",United Kingdom,VALUE (EUR),280284972.00,296871966.00,306318987.00,333519246.00,424186163.00,330603043.00,328005757.00,...,449237946.00,516312611.00,436170229.00,69254044.00,41071748.00,28020230.00,22973926.00,35806564.00,2408584.00,
836,"Tuna, skipjack",United Kingdom,VOLUME (KG),438300.00,505500.00,586700.00,375600.00,226400.00,389400.00,219370.00,...,25992.00,23.00,40009.00,86611801.00,108448222.00,102577306.00,100946868.00,93836170.00,7494844.00,
837,"Tuna, skipjack",United Kingdom,VALUE (EUR),1991907.00,2290942.00,2729450.00,1619993.00,1181997.00,1644100.00,750356.00,...,38645.00,224.00,79398.00,333282699.00,385513803.00,421716341.00,447151758.00,416314917.00,31314733.00,
838,"Tuna, yellowfin",United Kingdom,VOLUME (KG),250800.00,419500.00,561600.00,590100.00,247300.00,223500.00,211222.00,...,386390.00,370416.00,488169.00,13345155.00,5493195.00,3639996.00,2641474.00,8746171.00,680765.00,


### Step 6 Change data types if necessary   

* year = integer
* volume = float, round to one decimal. Convert kilograms into tonnes (1t = 1000 kg)
* value = float, round to two decimals 

In [23]:
# Convert column "totals" to float
#trading_total_import['totals'] = trading_total_import['totals'].astype(float)

###  Step 7 Rename row value names. First letter upper case e.g. EUR = Eur

In [24]:
#Rename measure Values
trading_total_import = trading_total_import.replace(["VOLUME (KG)","VALUE (EUR)"],["Volume (Kg)","Value (Eur)"])
trading_total_import.head()

Unnamed: 0,main_commercial_species,country,measures,2004,2005,2006,2007,2008,2009,2010,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Lobster Homarus spp,Austria,Volume (Kg),68900.0,68100.0,78200.0,83200.0,89200.0,54400.0,89010.0,...,57727.0,58255.0,58024.0,54223.0,60992.0,63813.0,59107.0,78180.0,51169.0,39000.0
1,Lobster Homarus spp,Austria,Value (Eur),1165703.0,1343879.0,1532138.0,1719851.0,1402819.0,1125765.0,1410187.0,...,1120481.0,1165723.0,1133924.0,1176778.0,1449729.0,1668106.0,1398331.0,1692097.0,1177118.0,1282000.0
2,"Lobster, Norway",Austria,Volume (Kg),13700.0,10900.0,22600.0,16300.0,15300.0,16700.0,28384.0,...,33898.0,18439.0,16639.0,21255.0,31264.0,20960.0,21657.0,27845.0,22338.0,21000.0
3,"Lobster, Norway",Austria,Value (Eur),166185.0,169256.0,257190.0,275827.0,240109.0,252468.0,352791.0,...,391203.0,238328.0,212879.0,248974.0,383086.0,320660.0,326180.0,518081.0,326046.0,515000.0
4,Rock lobster and sea crawfish,Austria,Volume (Kg),19600.0,19800.0,21200.0,38800.0,33600.0,24500.0,29991.0,...,28383.0,32451.0,38742.0,39225.0,69016.0,71215.0,58365.0,59795.0,37338.0,63000.0


In [25]:
trading_total_import = trading_total_import.melt(id_vars=["country", "main_commercial_species","measures"], # Spalten die gleich bleiben sollen
                                                           var_name="year", # neue column
                                                         value_name="quantity") # welche Werte sollen zugeordnet werden
trading_total_import

# Convert column "totals" to float
trading_total_import ['year'] = trading_total_import ['year'].astype(int)
trading_total_import.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   country                  15120 non-null  object 
 1   main_commercial_species  15120 non-null  object 
 2   measures                 15120 non-null  object 
 3   year                     15120 non-null  int64  
 4   quantity                 13704 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 590.8+ KB


In [26]:
# Split Volume and Value data in two different columns

# Create two new dataframes to seperate element values
seafood_volume1 = trading_total_import[trading_total_import["measures"] == "Volume (Kg)"]
seafood_value2 = trading_total_import[trading_total_import["measures"] == "Value (Eur)"]

# Merge newly created dataframes 
vol_val_merge1 = seafood_volume1.merge(seafood_value2, how ="left", on = ["country","main_commercial_species","year"])
vol_val_merge1.head()

# Drop unnecessary columns
vol_val_merge1.drop(['measures_x', 'measures_y'], axis=1, inplace=True)

# Rename columns quantity_x and quantity_y
vol_val_merge1.rename(columns= {'quantity_x': 'volume_kg', 'quantity_y': 'value_eur'}, inplace=True)

#Overwrite new dataframe with old dataframe name
trading_total_import = vol_val_merge1
trading_total_import.head(20)


Unnamed: 0,country,main_commercial_species,year,volume_kg,value_eur
0,Austria,Lobster Homarus spp,2004,68900.0,1165703.0
1,Austria,"Lobster, Norway",2004,13700.0,166185.0
2,Austria,Rock lobster and sea crawfish,2004,19600.0,198320.0
3,Austria,Salmon,2004,6342600.0,27836717.0
4,Austria,Shrimp Crangon spp,2004,54600.0,616845.0
5,Austria,"Shrimp, coldwater",2004,137500.0,1344032.0
6,Austria,"Shrimp, deep-water rose",2004,27000.0,355896.0
7,Austria,"Shrimp, miscellaneous",2004,2110900.0,11700756.0
8,Austria,"Shrimp, warmwater",2004,358800.0,3714873.0
9,Austria,"Tuna, albacore",2004,53300.0,200423.0


In [27]:
#Calculate Kg to tonnes
trading_total_import["volume_t"] = (trading_total_import["volume_kg"] / 1000)


# Drop column "volume_kg"
trading_total_import.drop(columns= {'volume_kg'}, inplace=True)
trading_total_import

Unnamed: 0,country,main_commercial_species,year,value_eur,volume_t
0,Austria,Lobster Homarus spp,2004,1165703.00,68.90
1,Austria,"Lobster, Norway",2004,166185.00,13.70
2,Austria,Rock lobster and sea crawfish,2004,198320.00,19.60
3,Austria,Salmon,2004,27836717.00,6342.60
4,Austria,Shrimp Crangon spp,2004,616845.00,54.60
...,...,...,...,...,...
7555,United Kingdom,"Tuna, bigeye",2021,,
7556,United Kingdom,"Tuna, bluefin",2021,,
7557,United Kingdom,"Tuna, miscellaneous",2021,,
7558,United Kingdom,"Tuna, skipjack",2021,,


### Step 8 Replace (country/species) abbreviations with full names by using dictionaries 

### Step 9 Inspect Null Values / NaNs and datatypes with info()  

In [28]:
# Fill NaN Values in dataframe with 0
trading_total_import.fillna(0, inplace=True)
trading_total_import.head()

Unnamed: 0,country,main_commercial_species,year,value_eur,volume_t
0,Austria,Lobster Homarus spp,2004,1165703.0,68.9
1,Austria,"Lobster, Norway",2004,166185.0,13.7
2,Austria,Rock lobster and sea crawfish,2004,198320.0,19.6
3,Austria,Salmon,2004,27836717.0,6342.6
4,Austria,Shrimp Crangon spp,2004,616845.0,54.6


### Step 10 Species cleaning  

* aggregate species = salmon, tuna, lobster, shrimp  
* sum up species in categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon

In [29]:
# Rename sub-species to main species
trading_total_import["main_commercial_species"]= trading_total_import["main_commercial_species"].replace(["Lobster Homarus spp", "Lobster, Norway","Rock lobster and sea crawfish"],"Lobster")
trading_total_import["main_commercial_species"]= trading_total_import["main_commercial_species"].replace(["Shrimp Crangon spp", "Shrimp, coldwater", "Shrimp, deep-water rose", "Shrimp, miscellaneous", "Shrimp, warmwater"],"Shrimp")
trading_total_import["main_commercial_species"]= trading_total_import["main_commercial_species"].replace(["Tuna, albacore", "Tuna, bigeye", "Tuna, bluefin", "Tuna, miscellaneous", "Tuna, skipjack", "Tuna, skipjack", "Tuna, yellowfin"],"Tuna")

display(trading_total_import["main_commercial_species"].unique())

array(['Lobster', 'Salmon', 'Shrimp', 'Tuna'], dtype=object)

In [30]:
# aggregate species to categories - categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon
# create list of conditions
conditions = [(trading_total_import["main_commercial_species"].str.contains("Salmon",case=False)),
            (trading_total_import["main_commercial_species"].str.contains("Tuna",case=False)), 
            (trading_total_import["main_commercial_species"].str.contains("Lobster",case=False)), 
            (trading_total_import["main_commercial_species"].str.contains("Shrimp",case=False))]
#create list of values we want to assign for each condition
values = ["Pelagic fish", "Pelagic fish", "Crustaceans", "Crustaceans"]

trading_total_import["categories_species"] = np.select(conditions, values)
trading_total_import.head()

Unnamed: 0,country,main_commercial_species,year,value_eur,volume_t,categories_species
0,Austria,Lobster,2004,1165703.0,68.9,Crustaceans
1,Austria,Lobster,2004,166185.0,13.7,Crustaceans
2,Austria,Lobster,2004,198320.0,19.6,Crustaceans
3,Austria,Salmon,2004,27836717.0,6342.6,Pelagic fish
4,Austria,Shrimp,2004,616845.0,54.6,Crustaceans


### Step 11 Decoding data

### Step 12 Punctuation = decimals separator: comma, thousands separator: dot    

### Step 13 language = english  

### Last step Merging dataframes: tbd after Steps 1 - 13 are done

In [31]:
#Rearrange order of columns in dataframe
trading_total_import = trading_total_import[['year', 'country', 'main_commercial_species','categories_species','volume_t','value_eur']]

In [32]:
#final dataframe
trading_total_import

Unnamed: 0,year,country,main_commercial_species,categories_species,volume_t,value_eur
0,2004,Austria,Lobster,Crustaceans,68.90,1165703.00
1,2004,Austria,Lobster,Crustaceans,13.70,166185.00
2,2004,Austria,Lobster,Crustaceans,19.60,198320.00
3,2004,Austria,Salmon,Pelagic fish,6342.60,27836717.00
4,2004,Austria,Shrimp,Crustaceans,54.60,616845.00
...,...,...,...,...,...,...
7555,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00
7556,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00
7557,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00
7558,2021,United Kingdom,Tuna,Pelagic fish,0.00,0.00


### Upload in DBeaver

In [33]:
# upload dataframe in dbeaver

table_name = 'total_import_by_country_and_species'
engine = get_engine()
schema = 'capstone_fish_are_friends'
# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        trading_total_import.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The total_import_by_country_and_species table was imported successfully.


## <span style='color:green'> Development Values Export Import Species</span> 

### Cleaning Step 1: Import file and create dataframe with individual variable


In [34]:
development_values_species = pd.read_csv("/Users/nina/neuefische/capstone_project/data/seafood_trading/development_values_export_import_species_yearly.csv", delimiter=";")
development_values_species.head()

Unnamed: 0,YEAR,MAIN COMMERCIAL SPECIES,Measures,Unnamed: 3,Export,Import,Totals
0,2004.0,Lobster Homarus spp,VALUE (EUR),,70028763.0,245211167.0,315239930.0
1,,"Lobster, Norway",VALUE (EUR),,249149550.0,267286524.0,516436074.0
2,,Rock lobster and sea crawfish,VALUE (EUR),,67001294.0,162084904.0,229086198.0
3,,Salmon,VALUE (EUR),,1641170874.0,2628682085.0,4269852959.0
4,,Shrimp Crangon spp,VALUE (EUR),,152284605.0,142118882.0,294403487.0


### Step 2: Inspect dataframe with head(), info(), shape, columns, tail(), describe()

In [35]:
# inspect the dataframe
development_values_species.info()
#development_values_species.head()
#development_values_species.shape
#development_values_species.columns
#development_values_species.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   YEAR                     20 non-null     object
 1   MAIN COMMERCIAL SPECIES  305 non-null    object
 2   Measures                 305 non-null    object
 3   Unnamed: 3               254 non-null    object
 4   Export                   305 non-null    object
 5   Import                   305 non-null    object
 6   Totals                   305 non-null    object
dtypes: object(7)
memory usage: 16.8+ KB


### Step 3 Delete non necessary columns and rows

In [36]:
# Drop unnecessary columns
development_values_species.drop(['Unnamed: 3', 'Totals'], axis=1, inplace=True)

# drop unnecessary rows
development_values_species= development_values_species.set_index("MAIN COMMERCIAL SPECIES")
development_values_species= development_values_species.drop("Totals", axis=0)
development_values_species.reset_index(inplace=True)

development_values_species= development_values_species.set_index("YEAR")
development_values_species.reset_index(inplace=True)

development_values_species.tail()

Unnamed: 0,YEAR,MAIN COMMERCIAL SPECIES,Measures,Export,Import
280,,"Tuna, bigeye",VALUE (EUR),9770000.0,9612000.0
281,,"Tuna, bluefin",VALUE (EUR),78229000.0,6594000.0
282,,"Tuna, miscellaneous",VALUE (EUR),237086000.0,288184000.0
283,,"Tuna, skipjack",VALUE (EUR),381358000.0,747437000.0
284,,"Tuna, yellowfin",VALUE (EUR),115738000.0,387920000.0


### Step 4 Rename columns lower case, snake case, spaces, delimiters &
### Step 5 Autofill Null Values when merged cell in excel/csv file is empty    

In [37]:
# Autofill NaN Values from merged cells with correct value 
development_values_species[['YEAR','MAIN COMMERCIAL SPECIES']] = development_values_species[['YEAR','MAIN COMMERCIAL SPECIES']].fillna(method='ffill', axis=0)
development_values_species.head(20)

#Removing whitespace & rename columns with lower cases
cols = development_values_species.columns.tolist()                                                                                                                           
cols = [str(col).replace(' ', '_').lower() for col in cols]
development_values_species.columns = cols
development_values_species.head()

#Rename measure Values
development_values_species = development_values_species.replace("VALUE (EUR)","Value (Eur)")
development_values_species.head()

Unnamed: 0,year,main_commercial_species,measures,export,import
0,2004,Lobster Homarus spp,Value (Eur),70028763.0,245211167.0
1,2004,"Lobster, Norway",Value (Eur),249149550.0,267286524.0
2,2004,Rock lobster and sea crawfish,Value (Eur),67001294.0,162084904.0
3,2004,Salmon,Value (Eur),1641170874.0,2628682085.0
4,2004,Shrimp Crangon spp,Value (Eur),152284605.0,142118882.0


### Step 6 Change data types if necessary   

* year = integer
* volume = float, round to one decimal. Convert kilograms into tonnes (1t = 1000 kg)
* value = float, round to two decimals 

In [38]:
# Delete punctuation to change dtypes to floats
development_values_species.export = development_values_species.export.map(lambda x : x.replace(',', ''))
development_values_species["import"] = development_values_species["import"].map(lambda x : x.replace(',', ''))

#Inhalte von Value mit lambda ersetzen!!!
#development_values_species.main_commercial_species = development_values_species.main_commercial_species.map(lambda x : x.replace('spp', 'SPP'))

# Change datatype to floats
development_values_species.export = development_values_species.export.astype(float)
development_values_species["import"] = development_values_species["import"].astype(float)
development_values_species.year = development_values_species.year.astype(int)
development_values_species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285 entries, 0 to 284
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   year                     285 non-null    int64  
 1   main_commercial_species  285 non-null    object 
 2   measures                 285 non-null    object 
 3   export                   285 non-null    float64
 4   import                   285 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 11.3+ KB


###  Step 7 Rename row value names. First letter upper case e.g. EUR = Eur

In [39]:
#Rename columns
development_values_species.rename(columns= { 'measures': 'value_eur'}, inplace=True)
development_values_species

Unnamed: 0,year,main_commercial_species,value_eur,export,import
0,2004,Lobster Homarus spp,Value (Eur),70028763.00,245211167.00
1,2004,"Lobster, Norway",Value (Eur),249149550.00,267286524.00
2,2004,Rock lobster and sea crawfish,Value (Eur),67001294.00,162084904.00
3,2004,Salmon,Value (Eur),1641170874.00,2628682085.00
4,2004,Shrimp Crangon spp,Value (Eur),152284605.00,142118882.00
...,...,...,...,...,...
280,2022,"Tuna, bigeye",Value (Eur),9770000.00,9612000.00
281,2022,"Tuna, bluefin",Value (Eur),78229000.00,6594000.00
282,2022,"Tuna, miscellaneous",Value (Eur),237086000.00,288184000.00
283,2022,"Tuna, skipjack",Value (Eur),381358000.00,747437000.00


### Step 8 Replace (country/species) abbreviations with full names by using dictionaries 

### Step 9 Inspect Null Values / NaNs and datatypes with info()  

In [40]:
# Fill NaN Values in dataframe with 0
development_values_species.fillna(0, inplace=True)
development_values_species.head()

Unnamed: 0,year,main_commercial_species,value_eur,export,import
0,2004,Lobster Homarus spp,Value (Eur),70028763.0,245211167.0
1,2004,"Lobster, Norway",Value (Eur),249149550.0,267286524.0
2,2004,Rock lobster and sea crawfish,Value (Eur),67001294.0,162084904.0
3,2004,Salmon,Value (Eur),1641170874.0,2628682085.0
4,2004,Shrimp Crangon spp,Value (Eur),152284605.0,142118882.0


### Step 10 Species cleaning  

* aggregate species = salmon, tuna, lobster, shrimp  
* sum up species in categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon

In [41]:
# Rename sub-species to main species
development_values_species["main_commercial_species"]= development_values_species["main_commercial_species"].replace(["Lobster Homarus spp", "Lobster, Norway","Rock lobster and sea crawfish"],"Lobster")
development_values_species["main_commercial_species"]= development_values_species["main_commercial_species"].replace(["Shrimp Crangon spp", "Shrimp, coldwater", "Shrimp, deep-water rose", "Shrimp, miscellaneous", "Shrimp, warmwater"],"Shrimp")
development_values_species["main_commercial_species"]= development_values_species["main_commercial_species"].replace(["Tuna, albacore", "Tuna, bigeye", "Tuna, bluefin", "Tuna, miscellaneous", "Tuna, skipjack", "Tuna, skipjack", "Tuna, yellowfin"],"Tuna")

display(development_values_species["main_commercial_species"].unique())

array(['Lobster', 'Salmon', 'Shrimp', 'Tuna'], dtype=object)

In [42]:
# aggregate species to categories - categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon
# create list of conditions
conditions = [(development_values_species["main_commercial_species"].str.contains("Salmon",case=False)),
            (development_values_species["main_commercial_species"].str.contains("Tuna",case=False)), 
            (development_values_species["main_commercial_species"].str.contains("Lobster",case=False)), 
            (development_values_species["main_commercial_species"].str.contains("Shrimp",case=False))]
#create list of values we want to assign for each condition
values = ["Pelagic fish", "Pelagic fish", "Crustaceans", "Crustaceans"]

development_values_species["categories_species"] = np.select(conditions, values)
development_values_species.head()

Unnamed: 0,year,main_commercial_species,value_eur,export,import,categories_species
0,2004,Lobster,Value (Eur),70028763.0,245211167.0,Crustaceans
1,2004,Lobster,Value (Eur),249149550.0,267286524.0,Crustaceans
2,2004,Lobster,Value (Eur),67001294.0,162084904.0,Crustaceans
3,2004,Salmon,Value (Eur),1641170874.0,2628682085.0,Pelagic fish
4,2004,Shrimp,Value (Eur),152284605.0,142118882.0,Crustaceans


### Step 11 Decoding data

### Step 12 Punctuation = decimals separator: comma, thousands separator: dot    

### Step 13 language = english  

### Last step Merging dataframes: tbd after Steps 1 - 13 are done

In [43]:
#Rearrange order of columns in dataframe
development_values_species = development_values_species[['year', 'main_commercial_species','categories_species','value_eur','export', 'import']]

In [44]:
#final dataframe
development_values_species

Unnamed: 0,year,main_commercial_species,categories_species,value_eur,export,import
0,2004,Lobster,Crustaceans,Value (Eur),70028763.00,245211167.00
1,2004,Lobster,Crustaceans,Value (Eur),249149550.00,267286524.00
2,2004,Lobster,Crustaceans,Value (Eur),67001294.00,162084904.00
3,2004,Salmon,Pelagic fish,Value (Eur),1641170874.00,2628682085.00
4,2004,Shrimp,Crustaceans,Value (Eur),152284605.00,142118882.00
...,...,...,...,...,...,...
280,2022,Tuna,Pelagic fish,Value (Eur),9770000.00,9612000.00
281,2022,Tuna,Pelagic fish,Value (Eur),78229000.00,6594000.00
282,2022,Tuna,Pelagic fish,Value (Eur),237086000.00,288184000.00
283,2022,Tuna,Pelagic fish,Value (Eur),381358000.00,747437000.00


### Upload to DBeaver

In [45]:
# upload dataframe in dbeaver

table_name = 'development_values_species_exp_imp'
engine = get_engine()
schema = 'capstone_fish_are_friends'
# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        development_values_species.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The development_values_species_exp_imp table was imported successfully.


## <span style='color:green'> Development Volume Export Import Species</span> 

### Cleaning Step 1: Import file and create dataframe with individual variable


In [46]:
development_volumes_species = pd.read_csv("/Users/nina/neuefische/capstone_project/data/seafood_trading/development_volume_export_import_species_yearly.csv", delimiter=";")
development_volumes_species.head()

Unnamed: 0,YEAR,MAIN COMMERCIAL SPECIES,Measures,Unnamed: 3,Export,Import,Totals
0,2004.0,Lobster Homarus spp,VOLUME (KG),,4863500.0,20418700.0,25282200.0
1,,"Lobster, Norway",VOLUME (KG),,36240800.0,40461100.0,76701900.0
2,,Rock lobster and sea crawfish,VOLUME (KG),,4873000.0,10965400.0,15838400.0
3,,Salmon,VOLUME (KG),,407684300.0,752082700.0,1159767000.0
4,,Shrimp Crangon spp,VOLUME (KG),,48259100.0,29864800.0,78123900.0


### Step 2: Inspect dataframe with head(), info(), shape, columns, tail(), describe()

In [47]:
# inspect the dataframe
development_volumes_species.info()
#development_volumes_species.head()
#development_volumes_species.shape
#development_volumes_species.columns
#development_volumes_species.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   YEAR                     20 non-null     object
 1   MAIN COMMERCIAL SPECIES  305 non-null    object
 2   Measures                 305 non-null    object
 3   Unnamed: 3               254 non-null    object
 4   Export                   305 non-null    object
 5   Import                   305 non-null    object
 6   Totals                   305 non-null    object
dtypes: object(7)
memory usage: 16.8+ KB


### Step 3 Delete non necessary columns and rows

In [48]:
# Drop unnecessary columns
development_volumes_species.drop(['Unnamed: 3', 'Totals'], axis=1, inplace=True)

# drop unnecessary rows
development_volumes_species= development_volumes_species.set_index("MAIN COMMERCIAL SPECIES")
development_volumes_species= development_volumes_species.drop("Totals", axis=0)
development_volumes_species.reset_index(inplace=True)

development_volumes_species= development_volumes_species.set_index("YEAR")
development_volumes_species.reset_index(inplace=True)

development_volumes_species.tail()

Unnamed: 0,YEAR,MAIN COMMERCIAL SPECIES,Measures,Export,Import
280,,"Tuna, bigeye",VOLUME (KG),5561000.0,3725000.0
281,,"Tuna, bluefin",VOLUME (KG),5875000.0,585000.0
282,,"Tuna, miscellaneous",VOLUME (KG),34231000.0,46820000.0
283,,"Tuna, skipjack",VOLUME (KG),106578000.0,172950000.0
284,,"Tuna, yellowfin",VOLUME (KG),25812000.0,77110000.0


### Step 4 Rename columns lower case, snake case, spaces, delimiters &
### Step 5 Autofill Null Values when merged cell in excel/csv file is empty    

In [49]:
# Autofill NaN Values from merged cells with correct value 
development_volumes_species[['YEAR','MAIN COMMERCIAL SPECIES']] = development_volumes_species[['YEAR','MAIN COMMERCIAL SPECIES']].fillna(method='ffill', axis=0)
development_volumes_species.head(20)

#Removing whitespace & rename columns with lower cases
cols = development_volumes_species.columns.tolist()                                                                                                                           
cols = [str(col).replace(' ', '_').lower() for col in cols]
development_volumes_species.columns = cols
development_volumes_species.head()

#Rename measure Values
development_volumes_species = development_volumes_species.replace("VOLUME (KG)","Volume(t)")
development_volumes_species.head()

Unnamed: 0,year,main_commercial_species,measures,export,import
0,2004,Lobster Homarus spp,Volume(t),4863500.0,20418700.0
1,2004,"Lobster, Norway",Volume(t),36240800.0,40461100.0
2,2004,Rock lobster and sea crawfish,Volume(t),4873000.0,10965400.0
3,2004,Salmon,Volume(t),407684300.0,752082700.0
4,2004,Shrimp Crangon spp,Volume(t),48259100.0,29864800.0


### Step 6 Change data types if necessary   

* year = integer
* volume = float, round to one decimal. Convert kilograms into tonnes (1t = 1000 kg)
* value = float, round to two decimals 

In [50]:
# Delete punctuation to change dtypes to floats
development_volumes_species.export = development_volumes_species.export.map(lambda x : x.replace(',', ''))
development_volumes_species["import"] = development_volumes_species["import"].map(lambda x : x.replace(',', ''))

#Inhalte von Value mit lambda ersetzen!!!
#development_volumes_species.main_commercial_species = development_volumes_species.main_commercial_species.map(lambda x : x.replace('spp', 'SPP'))

# Change datatype to floats
development_volumes_species.export = development_volumes_species.export.astype(float)
development_volumes_species["import"] = development_volumes_species["import"].astype(float)
development_volumes_species.year = development_volumes_species.year.astype(int)
development_volumes_species.head()

Unnamed: 0,year,main_commercial_species,measures,export,import
0,2004,Lobster Homarus spp,Volume(t),4863500.0,20418700.0
1,2004,"Lobster, Norway",Volume(t),36240800.0,40461100.0
2,2004,Rock lobster and sea crawfish,Volume(t),4873000.0,10965400.0
3,2004,Salmon,Volume(t),407684300.0,752082700.0
4,2004,Shrimp Crangon spp,Volume(t),48259100.0,29864800.0


In [51]:
#Calculate Kg to tonnes
development_volumes_species["import_tonnes"] = (development_volumes_species["import"] / 1000)
development_volumes_species["export_tonnes"] = (development_volumes_species["export"] / 1000)

#drop unnecessary columns
development_volumes_species.drop(["import", "export"],axis=1, inplace=True)

#rename columns
development_volumes_species.rename(columns={"import_tonnes": "import","measures": "volume_t", "export_tonnes": "export"}, inplace=True)

development_volumes_species.head()

Unnamed: 0,year,main_commercial_species,volume_t,import,export
0,2004,Lobster Homarus spp,Volume(t),20418.7,4863.5
1,2004,"Lobster, Norway",Volume(t),40461.1,36240.8
2,2004,Rock lobster and sea crawfish,Volume(t),10965.4,4873.0
3,2004,Salmon,Volume(t),752082.7,407684.3
4,2004,Shrimp Crangon spp,Volume(t),29864.8,48259.1


###  Step 7 Rename row value names. First letter upper case e.g. EUR = Eur

### Step 8 Replace (country/species) abbreviations with full names by using dictionaries 

### Step 9 Inspect Null Values / NaNs and datatypes with info()  

In [52]:
# Fill NaN Values in dataframe with 0
development_volumes_species.fillna(0, inplace=True)
development_volumes_species.head()

Unnamed: 0,year,main_commercial_species,volume_t,import,export
0,2004,Lobster Homarus spp,Volume(t),20418.7,4863.5
1,2004,"Lobster, Norway",Volume(t),40461.1,36240.8
2,2004,Rock lobster and sea crawfish,Volume(t),10965.4,4873.0
3,2004,Salmon,Volume(t),752082.7,407684.3
4,2004,Shrimp Crangon spp,Volume(t),29864.8,48259.1


### Step 10 Species cleaning  

* aggregate species = salmon, tuna, lobster, shrimp  
* sum up species in categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon

In [53]:
# Rename sub-species to main species
development_volumes_species["main_commercial_species"]= development_volumes_species["main_commercial_species"].replace(["Lobster Homarus spp", "Lobster, Norway","Rock lobster and sea crawfish"],"Lobster")
development_volumes_species["main_commercial_species"]= development_volumes_species["main_commercial_species"].replace(["Shrimp Crangon spp", "Shrimp, coldwater", "Shrimp, deep-water rose", "Shrimp, miscellaneous", "Shrimp, warmwater"],"Shrimp")
development_volumes_species["main_commercial_species"]= development_volumes_species["main_commercial_species"].replace(["Tuna, albacore", "Tuna, bigeye", "Tuna, bluefin", "Tuna, miscellaneous", "Tuna, skipjack", "Tuna, skipjack", "Tuna, yellowfin"],"Tuna")

display(development_volumes_species["main_commercial_species"].unique())

array(['Lobster', 'Salmon', 'Shrimp', 'Tuna'], dtype=object)

In [54]:
# aggregate species to categories - categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon
# create list of conditions
conditions = [(development_volumes_species["main_commercial_species"].str.contains("Salmon",case=False)),
            (development_volumes_species["main_commercial_species"].str.contains("Tuna",case=False)), 
            (development_volumes_species["main_commercial_species"].str.contains("Lobster",case=False)), 
            (development_volumes_species["main_commercial_species"].str.contains("Shrimp",case=False))]
#create list of values we want to assign for each condition
values = ["Pelagic fish", "Pelagic fish", "Crustaceans", "Crustaceans"]

development_volumes_species["categories_species"] = np.select(conditions, values)
development_volumes_species.head()

Unnamed: 0,year,main_commercial_species,volume_t,import,export,categories_species
0,2004,Lobster,Volume(t),20418.7,4863.5,Crustaceans
1,2004,Lobster,Volume(t),40461.1,36240.8,Crustaceans
2,2004,Lobster,Volume(t),10965.4,4873.0,Crustaceans
3,2004,Salmon,Volume(t),752082.7,407684.3,Pelagic fish
4,2004,Shrimp,Volume(t),29864.8,48259.1,Crustaceans


### Step 11 Decoding data

### Step 12 Punctuation = decimals separator: comma, thousands separator: dot    

### Step 13 language = english  

### Last step Merging dataframes: tbd after Steps 1 - 13 are done

In [55]:
#Rearrange order of columns in dataframe
development_volumes_species = development_volumes_species[['year', 'main_commercial_species','categories_species','volume_t','import','export']]

In [56]:
#final dataframe
development_volumes_species

Unnamed: 0,year,main_commercial_species,categories_species,volume_t,import,export
0,2004,Lobster,Crustaceans,Volume(t),20418.70,4863.50
1,2004,Lobster,Crustaceans,Volume(t),40461.10,36240.80
2,2004,Lobster,Crustaceans,Volume(t),10965.40,4873.00
3,2004,Salmon,Pelagic fish,Volume(t),752082.70,407684.30
4,2004,Shrimp,Crustaceans,Volume(t),29864.80,48259.10
...,...,...,...,...,...,...
280,2022,Tuna,Pelagic fish,Volume(t),3725.00,5561.00
281,2022,Tuna,Pelagic fish,Volume(t),585.00,5875.00
282,2022,Tuna,Pelagic fish,Volume(t),46820.00,34231.00
283,2022,Tuna,Pelagic fish,Volume(t),172950.00,106578.00


### Upload to DBeaver

In [57]:
# upload dataframe in dbeaver

table_name = 'development_volumes_species'
engine = get_engine()
schema = 'capstone_fish_are_friends'
# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        development_volumes_species.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The development_volumes_species table was imported successfully.


## <span style='color:green'> Seafood Import Export Quantity EU</span> 

### Cleaning Step 1: Import file and create dataframe with individual variable


In [58]:
seafood_imp_exp_quantity= pd.read_excel("/Users/nina/neuefische/capstone_project/data/seafood_trading/seafood_import_export_quantity_eu_countries_yearly_fao.xlsx")
seafood_imp_exp_quantity.tail(20)

Unnamed: 0,Domain Code,Area Code (FAO),Area,Element Code,Element,Item Code,Item,Year,Volume in tonnes
1100,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2010,84800.0
1101,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2011,81540.0
1102,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2012,80600.0
1103,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2013,86140.0
1104,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2014,85540.0
1105,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2015,74740.0
1106,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2016,78220.0
1107,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2017,85080.0
1108,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2018,85080.0
1109,FBS,210,Sweden,5611,Import Quantity,2765,Crustaceans,2019,85080.0


### Step 2: Inspect dataframe with head(), info(), shape, columns, tail(), describe()

In [59]:
# inspect the dataframe
seafood_imp_exp_quantity.info()
#seafood_imp_exp_quantity.head()
#seafood_imp_exp_quantity.shape
#seafood_imp_exp_quantity.columns
#seafood_imp_exp_quantity.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1120 entries, 0 to 1119
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Domain Code       1120 non-null   object 
 1   Area Code (FAO)   1120 non-null   int64  
 2   Area              1120 non-null   object 
 3   Element Code      1120 non-null   int64  
 4   Element           1120 non-null   object 
 5   Item Code         1120 non-null   int64  
 6   Item              1120 non-null   object 
 7   Year              1120 non-null   int64  
 8   Volume in tonnes  1104 non-null   float64
dtypes: float64(1), int64(4), object(4)
memory usage: 78.9+ KB


### Step 3 Delete non necessary columns and rows

In [60]:
# Drop unnecessary columns
seafood_imp_exp_quantity.drop(['Domain Code', 'Area Code (FAO)', 'Element Code', 'Item Code' ], axis=1, inplace=True)
seafood_imp_exp_quantity

Unnamed: 0,Area,Element,Item,Year,Volume in tonnes
0,Austria,Import Quantity,Pelagic Fish,2010,66580.00
1,Austria,Import Quantity,Pelagic Fish,2011,76520.00
2,Austria,Import Quantity,Pelagic Fish,2012,71340.00
3,Austria,Import Quantity,Pelagic Fish,2013,72420.00
4,Austria,Import Quantity,Pelagic Fish,2014,66330.00
...,...,...,...,...,...
1115,Sweden,Export Quantity,Crustaceans,2015,7100.00
1116,Sweden,Export Quantity,Crustaceans,2016,7260.00
1117,Sweden,Export Quantity,Crustaceans,2017,14260.00
1118,Sweden,Export Quantity,Crustaceans,2018,14260.00


### Step 4 Rename columns lower case, snake case, spaces, delimiters &
### Step 5 Autofill Null Values when merged cell in excel/csv file is empty    

In [61]:
#Removing whitespace & rename columns with lower cases
cols = seafood_imp_exp_quantity.columns.tolist()                                                                                                                           
cols = [str(col).replace(' ', '_').lower() for col in cols]
seafood_imp_exp_quantity.columns = cols
seafood_imp_exp_quantity.head(20)

Unnamed: 0,area,element,item,year,volume_in_tonnes
0,Austria,Import Quantity,Pelagic Fish,2010,66580.0
1,Austria,Import Quantity,Pelagic Fish,2011,76520.0
2,Austria,Import Quantity,Pelagic Fish,2012,71340.0
3,Austria,Import Quantity,Pelagic Fish,2013,72420.0
4,Austria,Import Quantity,Pelagic Fish,2014,66330.0
5,Austria,Import Quantity,Pelagic Fish,2015,63820.0
6,Austria,Import Quantity,Pelagic Fish,2016,60440.0
7,Austria,Import Quantity,Pelagic Fish,2017,63770.0
8,Austria,Import Quantity,Pelagic Fish,2018,63770.0
9,Austria,Import Quantity,Pelagic Fish,2019,63770.0


In [62]:
# Split Import and Export data in two different columns

#create two new dataframes to seperate eleemnt values in two new columns
seafood_import = seafood_imp_exp_quantity[seafood_imp_exp_quantity["element"] == "Import Quantity"]
seafood_export = seafood_imp_exp_quantity[seafood_imp_exp_quantity["element"] == "Export Quantity"]

# Merge newly created dataframes 
seafood_merge = seafood_import.merge(seafood_export, how ="left", on = ["area","item","year"])
seafood_merge.head()

# Drop unnecessary columns
seafood_merge.drop(columns=["element_x", "element_y"], inplace=True)
seafood_merge.head()

# Rename columns quantity_x and quantity_y
seafood_merge.rename(columns= {'volume_in_tonnes_x': 'import_t', 'volume_in_tonnes_y': 'export_t'}, inplace=True)

#Overwrite new dataframe with old dataframe name
seafood_imp_exp_quantity = seafood_merge
seafood_imp_exp_quantity.tail(20)


Unnamed: 0,area,item,year,import_t,export_t
540,Sweden,Pelagic Fish,2010,130410.0,200800.0
541,Sweden,Pelagic Fish,2011,124140.0,163710.0
542,Sweden,Pelagic Fish,2012,103130.0,139530.0
543,Sweden,Pelagic Fish,2013,111170.0,149240.0
544,Sweden,Pelagic Fish,2014,106650.0,146990.0
545,Sweden,Pelagic Fish,2015,109280.0,157640.0
546,Sweden,Pelagic Fish,2016,129030.0,180090.0
547,Sweden,Pelagic Fish,2017,119060.0,200090.0
548,Sweden,Pelagic Fish,2018,119060.0,200090.0
549,Sweden,Pelagic Fish,2019,119060.0,200090.0


### Step 6 Change data types if necessary   

* year = integer
* volume = float, round to one decimal. Convert kilograms into tonnes (1t = 1000 kg)
* value = float, round to two decimals 

In [63]:
seafood_imp_exp_quantity.info()
# not necessary

<class 'pandas.core.frame.DataFrame'>
Int64Index: 560 entries, 0 to 559
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   area      560 non-null    object 
 1   item      560 non-null    object 
 2   year      560 non-null    int64  
 3   import_t  553 non-null    float64
 4   export_t  551 non-null    float64
dtypes: float64(2), int64(1), object(2)
memory usage: 26.2+ KB


###  Step 7 Rename row value names. First letter upper case e.g. EUR = Eur

In [64]:
#Rename columns
seafood_imp_exp_quantity.rename(columns= { 'area': 'country', 'item':'categories_species'}, inplace=True)
seafood_imp_exp_quantity

#Rename rows
seafood_imp_exp_quantity["categories_species"]= seafood_imp_exp_quantity["categories_species"].map(lambda x: x.replace("Pelagic Fish", "Pelagic fish"))
seafood_imp_exp_quantity.head()

#Rearrange order of columns in dataframe
seafood_imp_exp_quantity = seafood_imp_exp_quantity[['year', 'country','categories_species','import_t', 'export_t']]
seafood_imp_exp_quantity.head()

Unnamed: 0,year,country,categories_species,import_t,export_t
0,2010,Austria,Pelagic fish,66580.0,4950.0
1,2011,Austria,Pelagic fish,76520.0,5520.0
2,2012,Austria,Pelagic fish,71340.0,7880.0
3,2013,Austria,Pelagic fish,72420.0,11020.0
4,2014,Austria,Pelagic fish,66330.0,3880.0


### Step 8 Replace (country/species) abbreviations with full names by using dictionaries 

### Step 9 Inspect Null Values / NaNs and datatypes with info()  

### Step 10 Species cleaning  

* aggregate species = salmon, tuna, lobster, shrimp  
* sum up species in categories_species = crustaceans: shrimps and lobster; pelagic fish: tuna and salmon

### Step 11 Decoding data

### Step 12 Punctuation = decimals separator: comma, thousands separator: dot    

### Step 13 language = english  

### Last step Merging dataframes: tbd after Steps 1 - 13 are done

In [65]:
#finale datei
seafood_imp_exp_quantity

Unnamed: 0,year,country,categories_species,import_t,export_t
0,2010,Austria,Pelagic fish,66580.00,4950.00
1,2011,Austria,Pelagic fish,76520.00,5520.00
2,2012,Austria,Pelagic fish,71340.00,7880.00
3,2013,Austria,Pelagic fish,72420.00,11020.00
4,2014,Austria,Pelagic fish,66330.00,3880.00
...,...,...,...,...,...
555,2015,Sweden,Crustaceans,74740.00,7100.00
556,2016,Sweden,Crustaceans,78220.00,7260.00
557,2017,Sweden,Crustaceans,85080.00,14260.00
558,2018,Sweden,Crustaceans,85080.00,14260.00


### Upload to DBeaver

In [66]:
# upload dataframe in dbeaver

table_name = 'seafood_imp_exp_quantity'
engine = get_engine()
schema = 'capstone_fish_are_friends'
# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine!=None:
    try:
        seafood_imp_exp_quantity.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values 
                        schema=schema, # Use schmea that was defined earlier
                        index=False, # Write DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        engine = None

The seafood_imp_exp_quantity table was imported successfully.
