In [28]:
YEAR_START = 2013
YEAR_NOW = 2021


#NOW PLOT IT
#try using plotly
import yaml
import pandas as pd
import plotly.express as px
from plotly.validators.scatter.marker import SymbolValidator
import pandas as pd
import plotly.graph_objects as go
import numpy as np
import datetime
import math


### LOAD CONFIG ####
with open('config.yml', 'r') as file:
    config = yaml.safe_load(file)

config["reverted_merge_dict"] = dict()
for key, values in config["merge_dict"].items():
    for value in values:
        config["reverted_merge_dict"][value] = key


#Defining Plotting Function
def prep_data(ito, anchor, group=False, max_n_split=100, write_level_keys=False):

    #Read input
    input_file_name=f"data/sota_{ito}.csv"
    sota = pd.read_csv(input_file_name)

    sota["date"] = pd.to_datetime(sota["date"])
    sota = sota[sota["date"] >= datetime.datetime(YEAR_START,1,1)]
    sota = sota[sota["date"] <= datetime.datetime(YEAR_NOW,12,31)]
    sota = sota.sort_values(by="date")

    #metricName causing some problems. Removed!
    sota = sota[sota["metric"]!="Parameters"]


    def agg_(ex):

        # calculate ratio
        min_ = ex["result"].min()
        max_ = ex["result"].max()
        assert ex["result"].iloc[0] == min_
        assert ex["result"].iloc[-1] == max_

        # add previous result (offset 1)
        ex["prev_result"] = ex["result"].shift(periods=1, fill_value=-1)

        # calculate ratio using prev result
        ex["ratio"] = (ex["result"] - ex["prev_result"]) / (max_ - min_)

        # set all anchors to -1
        ex.loc[ex["prev_result"] == -1, "ratio"] = -1

        # remove prev result from df again
        ex.drop('prev_result', axis=1, inplace=True)
        
        if len(ex) <= 2:
            return ex.head(0)

        return ex
    
    # key for grouping
    grp = ["l1", "l2", "l3", "task", "dataset", "metric"]
    # aggregate groups
    sota = sota.groupby(grp, as_index=False, dropna=False).apply(agg_).reset_index(drop=True)

    

    if group:
        n_sota = len(sota)
        # determine level
        def det_lvl(ex, lvl):
            """
            if ex[f"l{lvl+1}"].all() == False:
                assert lvl != 1
                ex["level"] = lvl - 1
                ex["level_key"] = "-".join([x for x in range(1,lvl - 1)]
            """
            lvls = [f"l{lvl_}" for lvl_ in range(1,lvl+1)]

            lookup = set()  # a temporary lookup set
            level_key = " // ".join([x for x in list(ex[lvls].iloc[0]) if x not in lookup and lookup.add(x) is None])
            if (level_key not in config["expand_list"]) and ((lvl == 3) or (len(ex[lvls + ["date", "task", "dataset", "metric", "ratio"]].drop_duplicates()) < max_n_split) or (ex[f"l{lvl+1}"].all() == False)):
                ex["level"] = lvl
                ex["level_key"] = level_key
                return ex
            else:
                next_level = ex.groupby([f"l{lvl+1}"], as_index=False).apply(det_lvl, lvl+1).reset_index(drop=True)
                return next_level
        sota = sota.groupby(["l1"], as_index=False).apply(det_lvl, 1).reset_index(drop=True)
        assert len(sota) == n_sota
        sota = sota.drop_duplicates(subset=["level_key", "date", "task", "dataset", "metric", "ratio"])
    else:
        sota["level"] = 3
        lookup = set()  # a temporary lookup set
        sota["level_key"] = " // ".join([x for x in list(sota[["l1", "l2", "l3"]].iloc[0]) if x not in lookup and lookup.add(x) is None]) + ": " + sota["task"]

    #FILTER trajectories based on yml config
    # remove unwanted trajectories
    sota = sota[~sota["level_key"].isin(config["remove_list"])]
    # merge
    sota["level_key"] = sota["level_key"].apply(lambda x: config["reverted_merge_dict"].get(x, x))
    # shorten
    for key, value in config["shortening_dict"].items():
        sota["level_key"] = sota["level_key"].str.replace(value, key)

    sota = sota.sort_values('level_key')

    if write_level_keys:
        sota[["level_key", "l1", "l2", "l3", "task"]].drop_duplicates().to_csv(f"./data/global_map_level_keys_{ito}.csv", index=False) 
    
    # Filter datasets containing single entry
    # sota = filter_entries_count(sota, lambda x: sota["dataset"] + " - " + sota["task"] + " - " + sota["l3"] + " - " + sota["l2"] + " - " + sota["l1"])

    # Filter tasks containing single entry
    # sota = filter_entries_count(sota, lambda x: x["superclass"] if not group else x["superclass"] + ": " + x["task"])
      
    trajectories = sota[sota["ratio"] != -1]
    trajectories["ratio"] = trajectories["ratio"].apply(lambda x: round(x, 4))

    # filter pre aggregation
    # This block will take the values from average_summary_IN and delete those that have only one arrow per trajectories.
    # if(anchor==0):
    #    trajectories = filter_entries_count(trajectories, lambda x: x["superclass"] if not group else x["superclass"] + ": " + x["task"])
    def agg(ex):
      ex["dataset"] = "  " + ("" if not group else (ex["task"] + " - " )) + ex["dataset"] + ": " + ex["metric"] + "<BR>"
      return pd.Series({"dataset": "".join(ex["dataset"].sort_values().unique()), "ratio": ex["ratio"].max()})      # CHANGE BY MATTHIAS: .mean() ->  .max()
    trajectories = pd.DataFrame(trajectories.groupby(["level_key", "date"])["task", "dataset", "metric", "ratio"].apply(agg))
    trajectories.sort_values(by=["date"], ascending=True)
    trajectories.reset_index(inplace=True)
    
    
    if(anchor==0):
        trajectories = filter_entries_count(trajectories, lambda x: x["level_key"])
    
    anchors = sota[sota["ratio"] == -1]
    anchors = pd.DataFrame(anchors.groupby(["level_key", "date"])["task", "dataset", "metric", "ratio"].apply(agg))
    anchors.sort_values(by=["date"], ascending=True)
    anchors.reset_index(inplace=True)
    
    
        
    return anchors, trajectories

def filter_entries_count(df, fn, gt=1):
    df["foo"] = fn(df)
    count_df = pd.DataFrame(df["foo"].value_counts())
    df = df[df["foo"].isin(count_df[count_df["foo"] > gt].index)]
    df = df.drop("foo", axis=1)
    return df


def plot_global_map(level_keys, anchors, trajectories, class_label, ito, anchor, grp=False, page=None):

    anchors=anchors[anchors["level_key"].isin(level_keys)].copy()
    trajectories=trajectories[trajectories["level_key"].isin(level_keys)].copy()
    
    rows = len(trajectories["level_key"].unique())*25

    ## ADD Lines
    trajectories = trajectories.sort_values(by=["level_key"], ascending=False)
    fig_traj = px.line(
                       x=trajectories["date"], 
                       y=trajectories["level_key"], 
                       color=trajectories["level_key"], 
                       )
    trajectories = trajectories.sort_values(by=["level_key", "date"])
    
    ## ADD ANCHORS
    #select anchors that belong to selected trajectories
    fig_traj.add_trace(
        go.Scatter(
            x=anchors["date"],
            y=anchors["level_key"],
            #facet_row="task",
            #facet_row_spacing=0.009, 
            mode="markers",
            name=None,
            marker=dict(
                symbol=42, 
                size=20,
                line=dict(
                    width=2
                ),
                
            ),
            hovertemplate=
            "<BR>task: "
            + anchors["level_key"]
            + "<BR>date: "
            + anchors["date"].astype("string")
            + "<BR>Anchor."
            + "<BR>benchmarks:<BR>"
            + anchors["dataset"].astype("string"),
        )
    )

    ## ADD Trajectories
    fig_traj.add_trace(
        go.Scatter(
            x=trajectories["date"],
            y=trajectories["level_key"],
            #facet_row="task",
            #facet_row_spacing=0.009, 
            mode="markers",
            name=None,
            hovertemplate=
            "<BR>task: "
            + trajectories["level_key"]
            + "<BR>date: "
            + trajectories["date"].astype("string")
            + "<BR>ratio: "
            + trajectories["ratio"].astype("string")
            + "<BR>benchmarks:<BR>"
            + trajectories["dataset"].astype("string"),
            marker=dict(
                size=17,  
                symbol="diamond-tall",  # https://plotly.com/python/marker-style/
                opacity=0.7,  # alpha ratio
                color=trajectories["ratio"],  # set color equal to a variable
                colorscale="YlGn",  # one of plotly colorscales
                colorbar=dict(title="ratio", lenmode="pixels", len=500, thickness=10, x=-0.1),
                showscale=True,
            ),
        )
    )
   
    fig_traj.update_traces(
        marker=dict(line=dict(color="gray", width=1)),
        line  =dict(width=0, color="black")        
    )

    fig_traj.update_xaxes(showgrid=True, gridcolor="lightBlue", title="Year")
    #title=ito+": "+class_label
    fig_traj.update_yaxes(showgrid=True, gridcolor="lightBlue", title=None)
    
    # font_size 14, height=rows*1.5
    # cv, single traces --> height=rows*1.5
    fig_traj.update_layout(
        #title="Trajectory for ratio (task per year)",
        title_text=class_label,
        showlegend=False,
        font_size=14,
        plot_bgcolor="white",
        height=(rows*(1 if (anchor and ito == "ITO_00101") else 1)) if not grp else 1.35*rows,
        width=1500,
        xaxis=dict(
            tickmode="auto",
        ),
        yaxis={'side': 'right'}
        
    )  

    
    fig_traj.update_layout(

        title={
            'y':0.995,            
            })

    fig_traj.write_html(f"artefacts/{class_label.replace(' ', '_').lower()}{'_single_arrow' if anchor else ''}{'_grp' if grp else ''}{'_' + str(page) if page is not None else ''}.html", include_plotlyjs="cdn")
    fig_traj.write_image(f"artefacts/{class_label.replace(' ', '_').lower()}{'_single_arrow' if anchor else ''}{'_grp' if grp else ''}{'_' + str(page) if page is not None else ''}.svg", scale=2)
    fig_traj.write_image(f"artefacts/{class_label.replace(' ', '_').lower()}{'_single_arrow' if anchor else ''}{'_grp' if grp else ''}{'_' + str(page) if page is not None else ''}.png", scale=2)

    return fig_traj

#Define get statistics function
def get_statistics(traj_df):
  results=pd.DataFrame(columns=['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'])

  #traj_df = traj_df_bkp.copy()

  traj_df['date'] = pd.to_datetime(traj_df['date'])
  traj_df['date'] = traj_df['date'].dt.strftime('%Y')

  for date in traj_df["date"].unique():
    
    df = traj_df[traj_df["date"]==date].copy()
    ts = df.get(['date','ratio'])
    ts['date']= pd.to_datetime(ts['date'])
    ts['date'] = ts['date'].dt.strftime('%Y-%m')
    
    
    #print(results.index)
    year = ts.describe()
    year["ratio"]=year["ratio"].astype('float').round(3)

    year = year.T
    year.index=[date]
    results=results.append(year)

  #year["ratio"].values.round(3).astype('float')\
  results = results.sort_index()
  return results

#Define get boxplots
def get_boxplot(traj_df):

  traj_df['date'] = pd.to_datetime(traj_df['date'])
  traj_df['date'] = traj_df['date'].dt.strftime('%Y')

  c = ['hsl('+str(h)+',50%'+',50%)' for h in np.linspace(0, 360, 20)] #here controls the colors


  fig = go.Figure()
  # Use x instead of y argument for horizontal plot
  i=0
  for date in traj_df["date"].unique():
    i=i+1
    df = traj_df[traj_df["date"]==date].copy()
    ts = df.get(['date','ratio'])

    ts['date']= pd.to_datetime(ts['date'])
    ts['date'] = ts['date'].dt.strftime('%Y-%m')
    #fig.add_trace(go.Box(x=ts["ratio"], name=task))
    fig.add_trace(go.Box(y=ts["ratio"], 
              boxpoints='all',
              jitter=0.8,
              whiskerwidth=0.1,
              marker_size=3, 
              line_width=2,
              name=int(date), 
              marker_color="Blue"))
    
  fig.update_layout(height=400, width=1000, showlegend=False,
                    font_size=20,
                    xaxis=dict(tickmode='linear'))
  
  fig.update_xaxes(categoryorder='array', categoryarray=list(range(YEAR_START,YEAR_NOW)))
                  
  return fig

## Aggregated superclasses

In [29]:
for ito, class_label in [("ITO_00141", "Natural Language Processing"), ("ITO_00101", "Vision process")]:
    anchor = 0
    anchors, trajectories = prep_data(ito, anchor, True, 500, write_level_keys=True)

    max_rows_per_page = 35
    level_keys_pages = anchors["level_key"].sort_values().unique()
    level_keys_pages = [level_keys_pages[i:i+max_rows_per_page] for i in range(0,len(level_keys_pages), max_rows_per_page)]

    for page, level_keys in enumerate(level_keys_pages):
        global_plt = plot_global_map(level_keys, anchors, trajectories, class_label, ito, anchor, True, None if len(level_keys_pages) == 1 else page)

    #boxplots = get_boxplot(trajectories)
    #results = get_statistics(trajectories)

    print(f"Comparative yearly distribution of state-of-the-art (SOTA) averaged gain ratio values - {class_label}")
    # global_plt.show()
    # boxplots.show()
    # display(results.T.style)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Comparative yearly distribution of state-of-the-art (SOTA) averaged gain ratio values - Natural Language Processing




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Comparative yearly distribution of state-of-the-art (SOTA) averaged gain ratio values - Vision process
