# YOUR PROJECT TITLE

> **Note the following:** 
> 1. This is *not* meant to be an example of an actual **data analysis project**, just an example of how to structure such a project.
> 1. Remember the general advice on structuring and commenting your code
> 1. The `dataproject.py` file includes a function which can be used multiple times in this notebook.

Imports and set magics:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import ipywidgets as widgets
import seaborn as sns

# autoreload modules when code is run
%load_ext autoreload
%autoreload 2

# user written modules
import dataproject


# Read and clean data

In [2]:
# importing data
alcohol = pd.read_csv('alcohol.csv')
life_exp_65 = pd.read_csv('lifeexp65.csv')
overobese = pd.read_csv('overobese.csv')
smokers = pd.read_csv('smokers.csv')
social_support = pd.read_csv('socsupport.csv')

In [3]:
# clean alcohol data
drop = ['INDICATOR','SUBJECT','MEASURE','FREQUENCY','Flag Codes']
alcohol.drop(drop, axis=1, inplace=True)

alcohol.rename(columns={'LOCATION':'country', 'TIME':'year', 'Value':'alcohol_sale'}, inplace=True)


# figure (alcohol)
def plot(df, country):
    I=df['country']==country
    ax=df.loc[I,:].plot(x='year', y='alcohol_sale', legend=False)

widgets.interact(plot, 
    df = widgets.fixed(alcohol),
    country = widgets.Dropdown(description='country', 
                                    options=alcohol.country.unique(), 
                                    value='AUS')
);

interactive(children=(Dropdown(description='country', options=('AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN'…

In [4]:
alcohol_wide = pd.pivot(alcohol,index='country',columns='year',values='alcohol_sale')

col_dict = {}
for i in range(1960,2022):
    col_dict[str(i)] = f'alc{i}'

col_dict = {str(i) : f'alc{i}' for i in range(1960,2022)}

alcohol_wide.rename(columns =col_dict, inplace=True)

#drop_list=[*range(1960,2000)]
#drop_list.append(2021)

#alcohol_wide.drop(drop_list, axis=1, inplace=True)



In [5]:
# clean life_exp_65 data
drop = ['INDICATOR','MEASURE','FREQUENCY','Flag Codes']
life_exp_65.drop(drop, axis=1, inplace=True)

life_exp_65.rename(columns={'LOCATION':'country', 'SUBJECT':'sex','TIME':'year', 'Value':'exp_years'}, inplace=True)

life_exp_65_wide = pd.pivot_table(life_exp_65,index='country',columns='year',values='exp_years')



In [6]:
# figure (life_exp_65)
def plot(df, country):
    I=df['country']==country
    ax=df.loc[I,:].plot(x='year', y='exp_years', legend=False)

widgets.interact(plot, 
    df = widgets.fixed(life_exp_65_wide),
    country = widgets.Dropdown(description='country', 
                                    options=life_exp_65_wide.country.unique(), 
                                    value='AUS')
);

#life_exp_65_long = pd.melt(life_exp_65_wide, id_vars='country',value_vars='year')

AttributeError: 'DataFrame' object has no attribute 'country'

In [7]:
# clean overobese data
drop = ['INDICATOR','MEASURE','FREQUENCY','Flag Codes']
overobese.drop(drop, axis=1, inplace=True)

overobese.rename(columns={'LOCATION':'country', 'TIME':'year', 'Value':'share_obese'}, inplace=True)

# lav funktion som vælger enten self-reported eller den anden

# figure (overobese)
def plot(df, country):
    I=df['country']==country
    ax=df.loc[I,:].plot(x='year', y='share_obese', legend=False)

widgets.interact(plot, 
    df = widgets.fixed(overobese),
    country = widgets.Dropdown(description='country', 
                                    options=overobese.country.unique(), 
                                    value='AUS')
);

interactive(children=(Dropdown(description='country', options=('AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'FIN', 'FRA'…

In [8]:
# clean smokers data
I = smokers.SUBJECT.str.contains('WOMEN')
I |= smokers.SUBJECT.str.contains('MEN')
smokers.loc[I,:]
smokers = smokers.loc[I == False]

drop = ['INDICATOR','SUBJECT','MEASURE','FREQUENCY','Flag Codes']
smokers.drop(drop, axis=1, inplace=True)

smokers.rename(columns={'LOCATION':'country', 'TIME':'year', 'Value':'smoking'}, inplace=True)


# figure (smokers)
def plot(df, country):
    I=df['country']==country
    ax=df.loc[I,:].plot(x='year', y='smoking', legend=False)

widgets.interact(plot, 
    df = widgets.fixed(smokers),
    country = widgets.Dropdown(description='country', 
                                    options=smokers.country.unique(), 
                                    value='AUS')
);

interactive(children=(Dropdown(description='country', options=('AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN'…

In [9]:
# clean social_support data
I = social_support.SUBJECT.str.contains('TOT')
social_support.loc[I,:]
social_support = social_support.loc[I == True]

drop = ['INDICATOR','SUBJECT','MEASURE','FREQUENCY','Flag Codes']
social_support.drop(drop, axis=1, inplace=True)

social_support.rename(columns={'LOCATION':'country', 'TIME':'year', 'Value':'share_support'}, inplace=True)


# figure (social_support)
def plot(df, country):
    I=df['country']==country
    ax=df.loc[I,:].plot(x='year', y='share_support', legend=False)

widgets.interact(plot, 
    df = widgets.fixed(social_support),
    country = widgets.Dropdown(description='country', 
                                    options=social_support.country.unique(), 
                                    value='AUS')
);

interactive(children=(Dropdown(description='country', options=('AUS', 'AUT', 'BEL', 'CAN', 'CZE', 'DNK', 'FIN'…

In [10]:
import numpy as np
import pandas as pd
import textwrap

import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import IPython.display
from IPython.display import display, clear_output

import plotly.graph_objects as go

In [11]:
# overview
alcohol_wide
#life_exp_65_wide
#overobese_wide
#smokers_wide
#social_support_wide

year,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ARG,,,,,,,,,,,...,8.0,8.3,7.9,8.5,8.4,8.3,8.0,8.0,,
AUS,9.3,9.4,9.5,9.8,10.1,10.0,10.3,10.8,11.1,11.6,...,10.0,9.9,9.7,9.8,9.5,9.5,,,,
AUT,8.8,9.4,9.4,10.1,10.7,11.2,13.0,12.7,13.4,13.2,...,12.1,11.9,12.2,11.2,11.4,11.3,11.3,11.6,11.3,
BEL,,,,9.9,10.5,10.8,10.3,10.7,11.2,11.5,...,10.1,10.3,10.6,10.4,9.4,9.3,9.2,9.2,,
BGR,,,,,,,,,,,...,10.9,10.3,10.3,10.9,11.1,10.9,11.5,11.2,,
BRA,,,,1.9,2.0,1.9,1.8,2.6,2.7,2.7,...,7.1,6.9,6.9,6.6,6.4,6.3,6.2,6.1,,
CAN,7.0,7.1,7.3,7.5,7.6,7.8,8.1,8.3,8.3,8.5,...,8.3,8.2,8.0,8.0,8.1,8.2,8.2,8.0,8.1,
CHE,12.1,12.7,12.9,13.1,13.8,13.7,13.6,13.6,13.6,14.0,...,9.9,9.8,9.5,9.5,9.3,9.2,9.1,9.3,8.9,
CHL,,,,11.1,11.6,9.7,11.3,11.4,11.6,9.1,...,8.3,7.2,7.9,7.9,6.4,5.7,6.3,7.1,,
CHN,,0.5,0.6,0.6,0.6,0.7,0.8,0.7,0.8,0.8,...,5.5,5.6,5.7,5.7,5.6,5.3,4.8,4.5,,


In [None]:
#alcohol_wide.reset_index(level='country', inplace=True)


## Explore each data set

In order to be able to **explore the raw data**, you may provide **static** and **interactive plots** to show important developments 

**Interactive plot** :

In [None]:
def plot_func():
    # Function that operates on data set
    pass

widgets.interact(plot_func, 
    # Let the widget interact with data through plot_func()    
); 


interactive(children=(Output(),), _dom_classes=('widget-interact',))

Explain what you see when moving elements of the interactive plot around. 

# Merge data sets

Now you create combinations of your loaded data sets. Remember the illustration of a (inner) **merge**:

In [12]:
#merging the data
smokeralc = pd.merge(smokers,alcohol,on=['country','year'],how='outer')


In [13]:
merged = pd.merge(smokeralc, social_support,on=['country','year'],how='outer')
merged


Unnamed: 0,country,year,smoking,alcohol_sale,share_support
0,AUS,1964,43.0,10.1,
1,AUS,1966,40.0,10.3,
2,AUS,1969,37.0,11.6,
3,AUS,1974,37.8,13.1,
4,AUS,1976,38.4,13.1,
...,...,...,...,...,...
2623,RUS,2020,,,88.545778
2624,RUS,2021,,,85.969644
2625,SVN,2021,,,95.398196
2626,SVN,2022,,,93.825366


In [28]:
merged.set_index(['country','year'])


Unnamed: 0_level_0,Unnamed: 1_level_0,smoking,alcohol_sale,share_support
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,1964,43.0,10.1,
AUS,1966,40.0,10.3,
AUS,1969,37.0,11.6,
AUS,1974,37.8,13.1,
AUS,1976,38.4,13.1,
...,...,...,...,...
RUS,2020,,,88.545778
RUS,2021,,,85.969644
SVN,2021,,,95.398196
SVN,2022,,,93.825366


In [20]:
def plot(df, country, smoking):
    I=df['country']== country
    ax=df.loc[I,:].plot(x='year', y='smoking', legend=False) 
    
def plot_timeseries(df):
    widgets.interact(plot, 
    df = widgets.fixed(df),
    var = widgets.Dropdown(
        description='variable', 
        options=['smoking','alcohol_sale','share_support'], 
        value='smoking'),
    con = widgets.Dropdown(
        description='country', 
        options=[social_support.country.unique(), alcohol.country.unique(),smokers.country.unique()], 
        value='AUS')          
);

In [21]:
plot_timeseries(merged)

TraitError: Invalid selection: value not found

Here we are dropping elements from both data set X and data set Y. A left join would keep all observations in data X intact and subset only from Y. 

Make sure that your resulting data sets have the correct number of rows and columns. That is, be clear about which observations are thrown away. 

**Note:** Don't make Venn diagrams in your own data project. It is just for exposition. 

# Analysis

To get a quick overview of the data, we show some **summary statistics** on a meaningful aggregation. 

MAKE FURTHER ANALYSIS. EXPLAIN THE CODE BRIEFLY AND SUMMARIZE THE RESULTS.

# Conclusion

ADD CONCISE CONLUSION.