# MA
## Creation of a process for scientific visualization development based on the example of the new ZHAW protein source database
## Christina Köck
## July 2023
### Link to the GitHub repo: https://github.com/TinyTen/MA

Creation of visualizations for study result comparison for iteration 2.

In [1]:
import warnings

# Install a blanket "ignore" filter so that no warning of any category is shown.
warnings.simplefilter("ignore")

### Libraries and data

In [2]:
from cmcrameri import cm
import math as math
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import plotly.express as px

import sys

sys.path.insert(1, '../Data')
from colors_cameri import davos, oslo

import missingno as msno
import sparql_dataframe



## scientific colormaps (see http://www.fabiocrameri.ch/visualisation.php)

In [3]:
from colors_cameri import bilbao

In [4]:
# Take element 1 of every colormap entry (presumably the colour value of an
# (offset, colour) pair — confirm in colors_cameri).
davos_rgb, oslo_rgb = (
    [stop[1] for stop in colormap] for colormap in (davos, oslo)
)

In [None]:
# colors =  dict(zip(df_food["Category"].unique(), tofino_rgb))

In [None]:
# Three-component colour tuple with values between 0 and 1
# (presumably the ZHAW corporate blue as RGB — confirm).
zhaw_color = (0.0, 0.39216, 0.65098)

#### Data

In [None]:
# Data source: ZHAW database mix from the end of 2022.
# The column that pandas reads as "Unnamed: 0" is renamed to "source".
df_studies = (
    pd.read_excel("../Data/study_data.xlsx")
    .rename(columns={"Unnamed: 0": "source"})
)

In [None]:
# Use the protein source as the row label of the study table.
df_studies = df_studies.set_index('source')

In [None]:
# Convert the study-date column to pandas datetimes.
# NOTE(review): if this column holds plain integer years (e.g. 2019),
# pd.to_datetime reads them as nanoseconds since the epoch — confirm the dtype
# that read_excel produced for this column.
df_studies['Datum der Studie (Jahr)']= pd.to_datetime(df_studies['Datum der Studie (Jahr)'])

### Short EDA

In [None]:
df_studies

In [None]:
# Keep the current row labels (the protein sources) of the study table.
# NOTE(review): this is taken before the later in-place index rename of
# 'protein from mealworm'; presumably it keeps the un-renamed labels — verify
# before using it to label other frames.
sources = df_studies.index

In [None]:
# Restrict the table to its integer and float columns and swap the source
# labels for a plain positional index.
numerics = [
    f'{kind}{bits}'
    for kind in ('int', 'float')
    for bits in (16, 32, 64)
]

df_numeric_studies = (
    df_studies
    .select_dtypes(include=numerics)
    .reset_index(drop=True)
)

In [None]:
df_numeric_studies

In [None]:
# Divide every numeric value by the 'Protein [%]' value of the same row.
# Many protein values are missing; those rows come out as NaN in all columns.
protein_share = df_numeric_studies['Protein [%]']
df_studies_per_g_protein = df_numeric_studies.div(protein_share, axis='index')

In [None]:
# Number the studies consecutively (0, 1, 2, ...) so that the long study links
# can be replaced by a short number; the dictionary maps number -> study.
studies_dict = dict(enumerate(df_studies['Studie']))
df_studies['StudieNr'] = list(studies_dict)

In [None]:
studies_dict

In [None]:
# For simplicity, rows labelled 'protein from mealworm' are counted as plain
# 'mealworm'.
df_studies.rename({'protein from mealworm': 'mealworm'}, axis='index', inplace=True)

### Countplot of all protein sources in plotly

In [None]:
# Histogram over the row labels: one bar per protein source, showing how many
# rows (studies) carry that label. Bars are ordered by ascending count.
fig = px.histogram(
    df_studies,
    x=df_studies.index,
    color_discrete_sequence=oslo_rgb,
    title='Count of Protein Sources, that are currently contained in the Studies in the Database.',
)
fig.update_layout(xaxis_categoryorder='total ascending')
fig.show()
# Optional export:
# fig.write_html("CountStudySources3.html")

## Show missing data as heatmap (plotly)

In [None]:
import plotly

# Choose the studies to display (the first 50 rows); all parameters are used
# in this example.
choice = df_studies.index[:50]

# Select the rows by position: the source labels repeat (several studies per
# source), and a label-based .loc lookup would return every matching row once
# per occurrence of its label. The result is True where a value is missing.
df_plot = df_studies.iloc[:len(choice)].isna()

title_text = 'Count of unknown values in the database. Dark color signifies <br> known values, bright color signifies unknown value.'

# Draw the boolean mask as a heatmap; choose size and colour scale.
# NOTE(review): the row labels are passed on as y categories; repeated source
# names may be drawn on the same row — confirm the rendering is as intended.
fig = px.imshow(
    df_plot,
    text_auto=False,
    aspect="auto",
    width=1200,
    height=800,
    color_continuous_scale=oslo,
)

# Put the parameter labels on top of the heatmap.
fig.update_xaxes(side="top")
fig.update_layout(title_text=title_text, title_y=0.95)
fig.show()
# fig.write_html("MissingDataStudyResults3.html")

### Create lists of the parameters to select them more quickly in the plots (currently only used as the default selection in `filter_data`)

In [None]:
# Column names of the nutritional parameters: composition, energy and minerals.
nutrition = [
    'Water [%]',
    'Protein [%]',
    'Fat [%]',
    'Fiber [%]',
    'Carbohydrates [%]',
    'Energy [kcal/100g]',
    'Ash [%]',
    'Magnesium (mg/100g)',
    'Zinc (mg/100g)',
    'Iron (mg/100g)',
    'Copper (mg/100g)',
    'Manganese (mg/100g)',
    'Calcium (mg/100g)',
    'Chloride (mg/100g)',
    'Potassium (mg/100g)',
    'Sodium (mg/100g)',
    'Phosphorus (mg/100g)',
    'Selenium (mg/100g)',
]

In [None]:
# Column names of the ecological parameters. Some names end with a trailing
# space; they are kept exactly as written because they are used as column keys.
# 'Bemerkung zu WF' is presumably a free-text remark on the water footprint.
ecology = [
    'GWP, kg CO2-EQ/kg product',
    'EU, Fossil energy use, MJ/kg product',
    'LU, Land use, m2/Jahr',
    'TAP, terrestrial acidification potential, g SO2-eq ',
    'FEP, freshwater eutrophication potential, g P-eq ',
    'Water Footprint (WF), m3/kg',
    'Bemerkung zu WF',
    'Marine eutrophication (ME), kg N eq /kg',
    'Ozone depletion (OD), kg CFC-11 eq/ kg',
    'Photochemical oxidant formation (POF), kg NMVOC eq/ kg',
    'Particulate matter formation (PMF), kg PM10 eq/ kg',
    'Fossil depletion (FD), kg oil eq/ kg',
]

In [None]:
# Every selectable parameter: the nutritional ones first, then the ecological ones.
all_aspects = [*nutrition, *ecology]

In [None]:
def filter_data(df=None, choice_aspects=None):
    """Return the study identifier columns plus the selected aspect columns.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Table containing the columns 'Studie', 'StudieNr' and every column
        named in ``choice_aspects``. When omitted, the notebook-level
        ``df_studies`` is used as it exists at call time.
    choice_aspects : str or iterable of str, optional
        Column name(s) to keep. When omitted, the notebook-level
        ``all_aspects`` is used as it exists at call time.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to 'Studie', 'StudieNr' and the chosen columns,
        in that order.

    Raises
    ------
    KeyError
        If one of the requested columns is not present in ``df``.
    """
    # Resolve the defaults at call time so that a rebound df_studies or
    # all_aspects is picked up, instead of the objects that happened to exist
    # when the function was defined.
    if df is None:
        df = df_studies
    if choice_aspects is None:
        choice_aspects = all_aspects
    # A single column name would otherwise be split into its characters.
    if isinstance(choice_aspects, str):
        choice_aspects = [choice_aspects]

    return df[['Studie', 'StudieNr', *choice_aspects]]

In [None]:
# Table used by the plots below: study identifiers plus the default aspect columns.
df_show = filter_data(df_studies)

## Barplots for comparing several parameters for chosen studies in Plotly

In [None]:
# https://community.plotly.com/t/how-to-set-different-x-and-y-axis-for-each-subplot/57417

import plotly.express as px
import pandas as pd

# Choose the parameters (one subplot each) and the protein sources to compare.
column = ['Fat [%]',  'Protein [%]', 'GWP, kg CO2-EQ/kg product']
ncols = len(column)
source = [ 'grasshopper', 'snail']

# Build the frame to plot: rows of the chosen sources, labelled by their
# 'StudieNr' so that the legend numbers match the keys of studies_dict, then
# transposed so that the parameters become the rows (= subplots).
# rename_axis(None) drops the axis name so the wide-form variable keeps its
# default name.
df_plot = (
    df_show.loc[source]
    .set_index('StudieNr')[column]
    .rename_axis(None)
    .T
)

# Initiate the plot: one subplot per parameter (index), grouped bars per study;
# choose spacing, colours and title.
fig = px.bar(
    df_plot,
    barmode="group",
    facet_col=df_plot.index,
    facet_col_spacing=0.06,
    color_discrete_sequence=oslo_rgb[:10],
    color_continuous_scale=None,
    orientation='v',
    title='Values for chosen parameter {}<br>for studies about {}.<br>The different studies are placed on the x-axis, encoded by numbers (see legend) because the names are<br>too long and sorted in ascending order for each subplot. See studies dictionary for the translation of <br>numbers into study titles.'.format(column, source),
)

# Give every subplot its own axis range and show its tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)

# Leave room above the plot for the multi-line title and name the legend.
fig.update_layout(
    margin={'t': 250},
    legend=dict(title='Study number'),
)

# Subplot labels have the form "<name>=<parameter>"; keep only the parameter.
# Splitting once and taking the last part also copes with parameter names that
# contain "=" and with labels that have no "=" at all.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=", 1)[-1]))

fig.show()
# fig.write_html("BarsStudyResultsGram3.html")

#### Same for data per protein

In [None]:
# https://community.plotly.com/t/how-to-set-different-x-and-y-axis-for-each-subplot/57417

import plotly.express as px
import pandas as pd


# Similar to the plot per product, using the frame divided by protein.
# First add the source labels and the study numbers as columns. The labels are
# read from df_studies at this point so that the 'mealworm' rename is included.
df_studies_per_g_protein['source'] = df_studies.index.to_numpy()
df_studies_per_g_protein['StudieNr'] = list(studies_dict)

# Choose the parameters (one subplot each) and the protein sources to compare.
column = ['Fat [%]', 'Water [%]', 'Protein [%]', 'Energy [kcal/100g]']
ncols = len(column)
source = ['mealworm', 'grasshopper']

# Build the frame to plot: rows of the chosen sources, labelled by their
# 'StudieNr' so that the legend numbers match the keys of studies_dict, then
# transposed so that the parameters become the rows (= subplots).
# rename_axis(None) drops the axis name so the legend keeps the default
# "variable" heading that the title refers to.
df_plot = (
    df_studies_per_g_protein.set_index('source')
    .loc[source]
    .set_index('StudieNr')[column]
    .rename_axis(None)
    .T
)

# Initiate the plot: one subplot per parameter (index), grouped bars per study.
fig = px.bar(
    df_plot,
    barmode="group",
    facet_col=df_plot.index,
    color_discrete_sequence=oslo_rgb,
    facet_col_spacing=0.06,
    color_continuous_scale=None,
    orientation='v',
    title='Values for chosen parameter {} PER 100g PROTEIN <br>for studies about {}.<br>The different studies are placed on the x-axis, encoded by numbers (see legend = variable) and sorted<br>in ascending order for each subplot. See studies dictionary for the numbers.'.format(column, source),
)

# Give every subplot its own axis range and show its tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)

# Leave room above the plot for the multi-line title.
fig.update_layout(margin={'t': 200})
fig.show()
# fig.write_html("BarsStudyResultsProtein3.html")