In [None]:
# We start by importing pandas and creating a Path object that points to our files.
import pandas as pd
from pathlib import Path

# Relative location of the Kontali CSV files used below; being a relative
# path, it is resolved against the working directory when a file is opened.
dirpath = Path("../../datasets/kontali")

# Product

### Read the file `product.csv` into a pandas DataFrame

Hints:
1. the function to use here is `pd.read_csv`
1. we can specify the path to `product.csv` with `os.path.join` or `pathlib.Path` from the builtin modules
1. we must use the keyword argument `encoding="latin-1"`
1. we want to set the index to be the `Product_Code` column
1. we also want to set the `delimiter` keyword argument

In [None]:
# Parse the semicolon-separated, Latin-1 encoded product table, using the
# `Product_Code` column as the row index, then preview the first rows.
df_product = pd.read_csv(
    dirpath / "product.csv",
    delimiter=";",
    index_col="Product_Code",
    encoding="latin-1",
)
df_product.head()

### How many product categories are there?

In [None]:
# `len` gives the number of rows; `shape` repeats it together with the column count.
len(df_product), df_product.shape

### Make a selection dataframe that only contains trout products

In [None]:
# Boolean-mask selection: keep the rows whose species code equals "TRR".
df_product[df_product["Species_Code"].eq("TRR")]

### Sort dataframe by product code

In [None]:
# The index was set to `Product_Code` when the file was read, so sorting the
# index sorts by product code. A new frame is returned; `df_product` is unchanged.
df_product.sort_index()

### Sort dataframe by species code (lexicographically)

In [None]:
# Order the rows by species code; the sorted copy is displayed, not stored.
df_product.sort_values(by="Species_Code")

### Sort dataframe by species code, then presentation, then preservation (lexicographically)

In [None]:
# Multi-key sort: earlier columns take precedence, later ones only break ties.
df_product.sort_values(
    by=["Species_Code", "Presentation", "Preservation"],
)

### List all the trout product categories using only the "Product_Description_KA" column

Do not use the `Species_Code` column for this task.

Hint: it should be sufficient to check if "trout" is mentioned in the `Product_Description_KA` column.

In [None]:
# Boolean mask that is True where the description contains the substring "trout".
# NOTE(review): the match is case-sensitive, so a description spelling it
# "Trout" would be skipped — confirm against the data.
trout_idx = df_product.Product_Description_KA.str.contains("trout")

df_product.loc[trout_idx]

### Make a new column named "Head" with a category dtype. Possible values should be YES, NO and UNKNOWN.

In [None]:
# Work on a copy so the frame loaded from product.csv keeps its original columns.
df = df_product.copy()

# Substring matches on the description. `na=False` makes a missing description
# count as "no match" so the masks stay purely boolean and can be used with .loc.
head_on = df["Product_Description_KA"].str.contains("head on", na=False)
head_off = df["Product_Description_KA"].str.contains("head off", na=False)

# Start every row as UNKNOWN and overwrite only where the description is explicit.
# The labels are upper-case because the task asks for YES, NO and UNKNOWN.
df["Head"] = pd.Categorical(["UNKNOWN"] * len(df), categories=["YES", "NO", "UNKNOWN"])
df.loc[head_on, "Head"] = "YES"
df.loc[head_off, "Head"] = "NO"
df


### [Challenging] Can you recreate the "Preservation" column by using the "Product_Description_KA" column and the below dict named `keywords`? 

```python
keywords = {
    "PRS": ["brine", "canned", "smoked", "airtight"],
    "FRO": ["frozen"],
    "FRE": ["fresh"],
    "ALI": ["live"],
}
```

We don't have a solution suggestion for this one, you are on your own! 
__Our recommendation is to skip it for now.__

In [None]:
# Suggestion:
# iterate over keywords keys
# iterate over keyword value list
# look for keyword value element ("brine") with str.contains()

# Maps each code to the words that, when found in a product description,
# point to that code (to be matched with `str.contains`).
keywords = dict(
    PRS=["brine", "canned", "smoked", "airtight"],
    FRO=["frozen"],
    FRE=["fresh"],
    ALI=["live"],
)

# SSB

### Read the file `ssb_export.csv` into a pandas DataFrame

Hints:
1. the function to use here is `pd.read_csv`
1. we can specify the path to `ssb_export.csv` with `os.path.join` or `pathlib.Path` from the builtin modules
1. we must use the keyword argument `encoding="latin-1"`
1. we want to set the index to be the `ID` column
1. we also want to set the `delimiter` keyword argument

In [None]:
# Parse the semicolon-separated, Latin-1 encoded export table, using the `ID`
# column as the row index, then preview the last rows.
df_ssb = pd.read_csv(
    dirpath / "ssb_export.csv",
    delimiter=";",
    index_col="ID",
    encoding="latin-1",
)
df_ssb.tail()

### Use DataFrame.describe() to find the maximum `Mengde` value from all of the records (rows)

In [None]:
# Summary statistics per numeric column; the answer is the value in the `max`
# row under the `Mengde` column.
df_ssb.describe()

### How many transactions are there in total?

In [None]:
# Each row is one transaction, so the row count answers the question.
len(df_ssb), df_ssb.shape[0] # use either of the two

### How many columns are there?

In [None]:
# The index (`ID`) is not part of `columns`, so it is not counted here.
len(df_ssb.columns), df_ssb.shape[1] # use either of the two

### Which years do the transactions cover?

In [None]:
# Distinct years in order of first appearance, as a plain Python list.
df_ssb["År"].unique().tolist()

### How many transactions were there in 2020?

In [None]:
# Count the rows that survive the year filter.
len(df_ssb.loc[df_ssb["År"].eq(2020)])

### What was the largest single transaction in terms of value?

In [None]:
# Largest value found in the `Verdi` column.
df_ssb["Verdi"].max()

### What is the ID of this transaction?

In [None]:
# `idxmax` returns the index label (here the `ID`) of the row holding the maximum.
df_ssb["Verdi"].idxmax()

### What year was this transaction?

Try to not directly use the ID from previous answer.

In [None]:
# Look the row up by the label of the maximum `Verdi`, then read its year.
df_ssb.loc[df_ssb["Verdi"].idxmax()]["År"]

### Does the dataframe contain both import and export transactions?

In [None]:
# The distinct values of `Vareflyt` show which flow directions are present.
df_ssb["Vareflyt"].unique()

### Make a selection of Canadian transactions only

In [None]:
# Rows whose country code is "CA".
df_ssb_ca = df_ssb.loc[df_ssb["Landkode"].eq("CA")]

### Make a selection of Canadian transactions only, for year 2022

And bind the returned dataframe to this variable name: `df_ssb_ca`. It will be used in the succeeding cell.

In [None]:
# Both conditions must hold: country code "CA" and year 2022.
# This rebinds `df_ssb_ca`, which the next cell aggregates.
df_ssb_ca = df_ssb.loc[
    df_ssb["Landkode"].eq("CA") & df_ssb["År"].eq(2022)
]

### Calculate the total weight and value of the above selection for each product number ("Varenr")

Hint: You can use `df_ssb_ca.groupby()` to group the above selection by product number ("Varenr"). You can use the `sum()` aggregator on the resulting `GroupBy` object.

__If DataFrame.groupby has not yet been covered, you can skip this task__.

In [None]:
# Pick the two columns of interest *before* aggregating: only `Mengde` and
# `Verdi` are summed, instead of summing every column (including text columns
# such as `Landkode`) and discarding most of the result afterwards.
df_ssb_ca.groupby("Varenr")[["Mengde", "Verdi"]].sum()

### What was the total export in kg for Smoked salmon in 2020? (Are you able to find it with a single line of code?)

In [None]:
# Filter on product number 3054100 and year 2020, then total the `Mengde` column.
df_ssb.loc[df_ssb["Varenr"].eq(3054100) & df_ssb["År"].eq(2020), "Mengde"].sum()

### Calculate the average price (NOK/kg) for Fresh Pacific Salmon in 2019 

Hint: make a `price/kg` column using `df["Verdi"]` and `df["Mengde"]`

In [None]:
# Transactions for product number 3044100 in 2019.
df = df_ssb.loc[df_ssb["Varenr"].eq(3044100) & df_ssb["År"].eq(2019)]
# Unweighted mean of the per-transaction price (value divided by quantity).
(df["Verdi"] / df["Mengde"]).mean()

### Bonus: Make a bar chart of the average price for Fresh Pacific Salmon by year

Assuming that "Verdi" is all in the same currency.

Hint: make a `price/kg` column using `df["Verdi"]` and `df["Mengde"]`

In [None]:
# Copy the selection so that adding a column does not touch `df_ssb`.
df = df_ssb[df_ssb["Varenr"] == 3044100].copy()
df["price/kg"] = df["Verdi"] / df["Mengde"]
# Select `price/kg` before taking the mean: averaging the whole frame would also
# try to average the text columns (`Landkode`, `Vareflyt`), which raises a
# TypeError on pandas >= 2.0. The trailing `;` hides the Axes repr.
df.groupby("År")["price/kg"].mean().plot(kind="bar");

---
# Memory usage

https://pandas.pydata.org/docs/user_guide/gotchas.html#dataframe-memory-usage

### How much memory does this dataframe use?

For this question, you can use `DataFrame.info()` and read the memory usage from there. Alternatively you can use `DataFrame.memory_usage(deep=True).sum()`.

Try `DataFrame.memory_usage().sum()` (without `deep=True`). Why is the reported memory usage lower now?

Hint: look at the official pandas documentation for `DataFrame.memory_usage()`

In [None]:
# Per-column byte counts (index included) summed into one total. `deep=True`
# also measures the Python objects held by object-dtype columns.
df_ssb.memory_usage(deep=True).sum()

### The `Landkode` column has the `object` dtype. How much is the memory usage reduced (in percent) if `Landkode` dtype is changed to `Categorical`? 

In [None]:
# Bytes used by the column as loaded (object dtype) and after conversion to a
# categorical; both measurements include the index.
mem_obj = df_ssb["Landkode"].memory_usage(deep=True)
mem_cat = df_ssb["Landkode"].astype("category").memory_usage(deep=True)

# share of the original footprint that was saved, in percent
100 * (1 - mem_cat / mem_obj)

### The `Vareflyt` column has the `object` dtype. How much is the memory usage reduced (in percent) if `Vareflyt` dtype is changed to `StringDtype`? 

In [None]:
# Bytes used by the column as loaded (object dtype) and after conversion to
# pandas' dedicated string dtype; both measurements include the index.
mem_obj = df_ssb["Vareflyt"].memory_usage(deep=True)
mem_str = df_ssb["Vareflyt"].astype("string").memory_usage(deep=True)

# reduction in percent for "Vareflyt" column
# (compares against `mem_str`; using `mem_cat` from the previous cell would mix
# the Landkode categorical measurement into this result)
100 * (1 - mem_str / mem_obj)

---

### CHALLENGING: 

Assuming that `df_ssb["Verdi"]` is in NOK and all exports were traded in the local currency of the recipient country, what was the total export value, in the various local currencies, for Smoked salmon in 2020?

We don't have a solution suggestion for this one, you are on your own!

You probably will need to use the currency dataframe found in:
`course-kontali-2023/pandas notebooks/datasets/kontali/product.csv`.

In [None]:
# I would start with something like this:
# the smoked salmon transactions (product number 3054100) for 2020, the year the task asks about
df_ssb[(df_ssb["Varenr"] == 3054100) & (df_ssb["År"] == 2020)]