# Importing necessary packages

In [4]:
import pandas as pd  # For handling the data

import seaborn as sns  # For plotting
import matplotlib.pyplot as plt  # for subplots
import plotly_express as px  # for interactiv plots

# import warnings
# warnings.filterwarnings(action="ignore")

import plotly.io as pio
pio.templates.default = "gridon"  # Sets "gridon" as default for all Plotly graphs


# KPI 1

- amount of cases per each age group.
- amount of deaths in relation to cases per age group. intensive care patients
___
# Hypothesis

- I imagine that ages 20-40 will be the most likely to get sick but must likely to survive the virus

- There will be higher amounts of deaths as age goes up

- If the intensive care number is high. so should deaths.
___

In [5]:
df_age_group = pd.read_excel("../Lab_Data/Covid19.xlsx", sheet_name="Totalt antal per åldersgrupp")
#no nulls. "Ålder is object. rest are int."¨

df_age_group.drop([10][0], inplace = True) # Dropping "Uppgift saknas because it is negligible amount"

# Renaming columns to translate them into english
df_age_group.rename(
    columns={
        "Åldersgrupp": "Age group",
        "Totalt_antal_fall": "Total amount of cases",
        "Totalt_antal_intensivvårdade": "Total amount in intensive care",
        "Totalt_antal_avlidna": "Total deaths",
    },
    inplace=True,
)

# Translating Age group rows into english
df_age_group.replace(
    {
        "Ålder_0_9": "Ages 0-9",
        "Ålder_10_19": "Ages 10-19",
        "Ålder_20_29": "Ages 20-29",
        "Ålder_30_39": "Ages 30-39",
        "Ålder_40_49": "Ages 40-49",
        "Ålder_50_59": "Ages 50-59",
        "Ålder_60_69": "Ages 60-69",
        "Ålder_70_79": "Ages 70-79",
        "Ålder_80_89": "Ages 80-89",
        "Ålder_90_plus": "Age 90 plus",
    },
    inplace=True,
)


In [6]:
def percentage_func(df, column: str, lst, column2 = "NA",):
    """Function to get precentages out of a Dataframe and save it to a list.
    column2 can be used when you need percentages between two columns."""
    if column2 == "NA":
        col_sum = df[column].sum()
        for i in range(len(df[column])):
            lst.append(round(df[column][i] / col_sum*100))
    else:
        col_sum2 = df[column2].sum()
        for i in range(len(df[column])):
             lst.append(round(df[column][i] / col_sum2 *100))

In [15]:
cases_percent = [] # checking amount of cases procenteges by age group
intensive_care_percent = []
total_dead_percent = []

percentage_func(df_age_group, "Total amount of cases", cases_percent,"NA")
percentage_func(df_age_group, "Total amount in intensive care", intensive_care_percent)
percentage_func(df_age_group, "Total deaths", total_dead_percent)

cases_percent, intensive_care_percent, total_dead_percent

([5, 14, 16, 19, 18, 14, 7, 3, 2, 1],
 [1, 1, 3, 5, 10, 20, 27, 25, 6, 0],
 [0, 0, 0, 0, 1, 3, 7, 23, 40, 26])

In [22]:
fig = px.bar(
    x=df_age_group["Age group"],
    y=[cases_percent, intensive_care_percent, total_dead_percent],
    labels = {"x": "Age groups", "variable": "Datapoint(s)", "value": "Percentage of people"},
    text_auto=True, # Adds the values above the bars
    barmode="group", # groups the bars in plot
    title="Amount of cases, deaths and patients in intensive care, by percent",
    width = 1080, # Sets the size so that the graph is consistent for every user that runs the program
    height = 540,
    # pattern_shape = df_age_group["Åldersgrupp"],
    # pattern_shape_sequence=["/","/","/","/","/","/","/","/"],
)
bar_names = {
    "wide_variable_0": "Sick per age group per percentage",  # Todo fix names.
    "wide_variable_1": "Intensive care patients per percentage",
    "wide_variable_2": "Deaths per age group",
}
fig.update_traces( # rotates text and puts text outside of bar.
    textposition="outside",
)
fig.for_each_trace(
    lambda t: t.update(
        name=bar_names[t.name],
        legendgroup=bar_names[t.name],
        hovertemplate=t.hovertemplate.replace(t.name, bar_names[t.name]),
    )
)
fig.update_layout(  # https://plotly.com/python/setting-graph-size/
    margin=dict(l=40, r=50, t=60, b=70), # Changes the margins to make the graph look the for everyone who runs the program
    paper_bgcolor="white",
)
fig.add_annotation(  # adds an line and text to the barplot.
    text="age 80-89 is responsible for        %       of all deaths",
    x=8,
    y=38,  
)

fig.show()

# Findings
Very interesting to see that almost 0 percent of ages 90 > gets intensive care.

As I Thought, Ages 10-59 (Ages that are most active and meets with the most people on average) gets sick the most but there are almost no deaths compared to 59 and above.

## KPI 2
___
Gapminder style graph of each regional council, deaths and intensive care patients.
The point of this graph is to get an interesting, interactive and visual view of: 
- The time and where the pandemic was worst



In [23]:
df_lan = pd.read_excel("../Lab_Data/Covid19.xlsx", sheet_name="Veckodata Region")
df_lan.insert(0, "Vecka", df_lan["år"].astype(str) + "v" + df_lan["veckonummer"].astype(str)) # Cleaning data to get weeks and years combined again.
df_lan.pop("år") # Remove column year because it is not needed any more
df_lan.pop("veckonummer") # same as above

# RENAME

0        1
1        2
2        3
3        4
4        5
        ..
3082    38
3083    39
3084    40
3085    41
3086    42
Name: veckonummer, Length: 3087, dtype: int64


Cases per Län, = Size
x = "Kumulative cases"
y = Cumulative deaths
Color = Län
animation_frame = "year"
animation_group = "country"

In [177]:
df_lan.head()

Unnamed: 0,Vecka,Region,Antal_fall_vecka,Kum_antal_fall,Antal_intensivvårdade_vecka,Kum_antal_intensivvårdade,Antal_avlidna_vecka,Kum_antal_avlidna,Antal_fall_100000inv_vecka,Kum_fall_100000inv
0,2020v1,Blekinge,0,0,0,0,0,0,0,0
1,2020v2,Blekinge,0,0,0,0,0,0,0,0
2,2020v3,Blekinge,0,0,0,0,0,0,0,0
3,2020v4,Blekinge,0,0,0,0,0,0,0,0
4,2020v5,Blekinge,0,0,0,0,0,0,0,0


In [181]:
df_lan["Antal_avlidna_vecka"].max()

375

In [183]:
df_lan["Antal_intensivvårdade_vecka"].max()

124

In [201]:
df_lan.rename(
    columns={
        "Antal_avlidna_vecka": "Amount dead per week",
        "Antal_fall_vecka": "Amount of cases per week",
        "Kum_antal_fall": "Cumulative amount of cases",
        "Antal_intensivvårdade_vecka": "Amount of patients in intensive care",
    },
    inplace=True,
)

In [202]:
fig = px.scatter( # fix... 
    df_lan,
    x="Amount dead per week",
    y="Amount of patients in intensive care",
    size="Amount of cases per week",
    log_x=True,
    log_y=True,
    size_max=250,
    color="Region",
    animation_frame = "Vecka",
    animation_group = "Region",
    title="Deaths and patients in intensive care per week, Press the stop button and hover over the circles to get more information. ",
    range_x = [1,375],
    range_y = [1,124],
)

fig.show()
# I wish that the colors showed in the Region bar on the right without clicking on one of the regions, but it seems to be the result of a bug in plotly express.

# Findings
- Stockholm, Skåne and Västra Götaland have the most deahts, self explanatory because of the population
- year 2021 week 23 to week 30 was a period of almost no new cases, IVA patients or deaths.

## KPI 3
___
- Top 5 intensive care and deaths, to see which weeks were most deadly. 
- and if there is a correlation between intensive care patients and deaths.

In [None]:
# Read in data and clean it as I did in task 1

kingdom_all = pd.read_excel("../Lab_Data/Covid19.xlsx", sheet_name="Veckodata Riket")
kingdom_all.insert(0,"Week",kingdom_all["år"].astype(str)+"v"+kingdom_all["veckonummer"].astype(str))
kingdom_all.pop("år")
kingdom_all.pop("veckonummer")

In [162]:
kingdom_all.head(3)

Unnamed: 0,Week,Antal_fall_vecka,Antal_fall_100000inv_vecka,Antal_fall_100000inv_14dagar,Kum_antal_fall,Kum_fall_100000inv,Antal_nyaintensivvårdade_vecka,Kum_antal_intensivvårdade,Antal_avlidna_vecka,Antal_avlidna_milj_inv_vecka,Kum_antal_avlidna,Kum_antal_avlidna_milj_inv
0,2020v6,1,0,0,1,0,0,0,0,0.0,0,0.0
1,2020v7,0,0,0,1,0,0,0,0,0.0,0,0.0
2,2020v8,0,0,0,1,0,0,0,0,0.0,0,0.0


In [176]:
# Using method chaining to get the 5 indexes of max values in intensive care and deaths.
top_5_intensive = kingdom_all["Antal_nyaintensivvårdade_vecka"].sort_values(ascending=False).head().index 
top_5_deaths = kingdom_all["Antal_avlidna_vecka"].sort_values(ascending=False).head().index 


In [59]:
kingdom_all.loc[9]["Week"]

'2020v15'

In [104]:
# Creating a loop to get a new dataframe with the correct indexes.
temp_top_5_intensive = []
temp_top_5_deaths = []
for value in top_5_intensive:
    temp_top_5_intensive.append(kingdom_all.loc[value])

for value in top_5_deaths:
    temp_top_5_deaths.append(kingdom_all.loc[value])



In [170]:
df_top_intensive = pd.DataFrame(temp_top_5_intensive, columns= ["Week","Antal_nyaintensivvårdade_vecka"]) # Converting list into dataframe
df_top_deaths = pd.DataFrame(temp_top_5_deaths, columns= ["Week", "Antal_avlidna_vecka"])
df_top_intensive.sort_values(by="Week",inplace=True) # sorting after week
df_top_deaths.sort_values(by="Week",inplace=True)
#Renaming columns: 
df_top_intensive.rename(columns= { "Antal_nyaintensivvårdade_vecka": "Amount of intensive care patients per week"},inplace=True)
df_top_deaths.rename(columns= { "Antal_avlidna_vecka": "Amount of deaths per week"},inplace=True)


In [171]:
# Combining previous data sets. 
df_deaths_IVA = pd.concat([df_top_deaths,df_top_intensive],join="outer",ignore_index=False)
df_deaths_IVA.head(10)


Unnamed: 0,Week,Amount of deaths per week,Amount of intensive care patients per week
9,2020v15,664.0,
10,2020v16,657.0,
45,2020v51,645.0,
47,2020v53,663.0,
48,2021v1,644.0,
8,2020v14,,277.0
9,2020v15,,289.0
10,2020v16,,250.0
61,2021v14,,250.0
62,2021v15,,246.0


In [174]:
px.bar(df_top_intensive, x = "Week", y= "Amount of intensive care patients per week",title = "Top 5 weeks most people needed intensive care",text_auto=True)


In [175]:
px.bar(df_top_deaths, x = "Week", y= "Amount of deaths per week",title = "Top 5 weeks most people died",text_auto=True)


# Findings:
- The initial shock of the pandemic is clear looking at 2020v14 and 15. Most people got put into intensive care at this point.
- Most people also died one week later.
- 
- Intensive care patients increases when spring starts at the end of march beginning of april. 
- After the initial shock when Covid19 in 2020 there is no longer a clear correlation between intensive care patients and deaths.

## KPI 4
- Correlation between sexes, 

In [224]:
df_sex = pd.read_excel("../Lab_data/Covid19.xlsx", sheet_name="Totalt antal per kön")
df_sex.drop(df_sex.tail(1).index, inplace=True) # remove last row since its 'unknown' and statistically insignific

In [226]:
df_sex 

Unnamed: 0,Kön,Totalt_antal_fall,Totalt_antal_intensivvårdade,Totalt_antal_avlidna
0,Man,1210098,6550,11279
1,Kvinna,1381189,2947,9273


In [None]:
df_sex.rename(columns= { "Antal_nyaintensivvårdade_vecka": "Amount of intensive care patients per week"},inplace=True)

In [225]:
def log_plotly_bar_with_df_title_labels(df, y, x, title, labels, save_file):
    fig = px.bar(
        df,
        y=y,
        x=x,
        barmode="group",  # groups the bars next to eachother instead of stacking on eachother
        labels=labels,
        title=title,
        log_y=True,  # easier to read, makes y-axes logarithmic
    )

    fig.show()
    fig.write_html("../Labb_1/Visualiseringar/" + save_file + ".html")


labels_antal_fall_kön = {"value": "Befolkning", "variable": "Antal fall av Covid-19"}

log_plotly_bar_with_df_title_labels(
    df_sex,
    ["Totalt_antal_fall", "Totalt_antal_intensivvårdade", "Totalt_antal_avlidna"],
    "Kön",
    "Antal fall per kön",
    labels_antal_fall_kön,
    "KPI_1_Antal_fall_per_kön-",
)



FileNotFoundError: [Errno 2] No such file or directory: '..\\Labb_1\\Visualiseringar\\KPI_1_Antal_fall_per_kön-.html'