In [1]:
from pathlib import Path
from typing import Optional, List, Dict, Tuple
import pandas as pd
import numpy as np
import collections
from who_member_states import WHO_MEMBER_STATES

In [2]:

# Names exported by `from <this module> import *`. Only `select_df` is listed,
# so `tobacco_layout_modified` (defined in a later cell) is not exported this way.
# NOTE(review): confirm that omission is intentional.
__all__ = ['select_df']


def select_df(df: pd.DataFrame,
              rename_mapping: Optional[Dict[str, str]] = None,
              column_drop: Optional[List[str]] = None,
              year: int = 2000,
              save_path: Optional[Path] = None,
              drop_na: Optional[List[str]] = None,
              member_states: Optional[List[str]] = None) -> pd.DataFrame:
    """Filter a frame to rows from ``year`` onwards whose country is an accepted entity.

    The steps run in this order: optional column rename, optional column drop,
    row filter on ``'Year'`` and ``'Country Name'``, optional NaN removal,
    optional Excel export. The input frame is never modified.

    Args:
        df: Source frame. After renaming/dropping it must contain the columns
            ``'Year'`` and ``'Country Name'``.
        rename_mapping: Old-name -> new-name mapping passed to
            ``DataFrame.rename(columns=...)``. Skipped when ``None``.
        column_drop: Column names to remove (applied after renaming).
            Skipped when ``None``.
        year: Rows are kept when ``df['Year'] >= year``.
        save_path: When given, the result is written there with
            ``DataFrame.to_excel``.
        drop_na: Column names; rows holding NaN in any of them are removed
            after filtering. Skipped when ``None``.
        member_states: Container of accepted ``'Country Name'`` values, tested
            with ``in`` (a list, set, dict, ... all work). Defaults to
            ``WHO_MEMBER_STATES``.

    Returns:
        The filtered frame. Its index is renumbered from 0 after the row
        filter; rows removed by ``drop_na`` leave gaps in that numbering.

    Raises:
        ValueError: If ``drop_na`` names a column that is not in the frame.
        KeyError: If ``'Year'`` or ``'Country Name'`` is missing.
    """
    if member_states is None:
        member_states = WHO_MEMBER_STATES

    # rename/drop/boolean-indexing each return a new object, so the caller's
    # frame stays untouched without an up-front copy.
    if rename_mapping is not None:
        df = df.rename(columns=rename_mapping)
    if column_drop is not None:
        df = df.drop(columns=column_drop)

    year_mask = df['Year'] >= year

    # Build the membership mask as a Series sharing df's index: combining a
    # Series with a bare Python list via `&` is deprecated in pandas.
    entity_mask = pd.Series(
        [country in member_states for country in df['Country Name']],
        index=df.index,
        dtype=bool,
    )

    modified_df = df[year_mask & entity_mask].reset_index(drop=True)

    if drop_na is not None:
        try:
            modified_df = modified_df.dropna(subset=drop_na)
        except KeyError as e:
            raise ValueError(
                f'{e} not in the dataframe, should be one of the {modified_df.columns.tolist()}'
            ) from e

    if save_path is not None:
        modified_df.to_excel(save_path)

    return modified_df


In [5]:
def tobacco_layout_modified(df: pd.DataFrame,
                            save_path: Optional[Path] = None) -> pd.DataFrame:
    """Reshape long-format tobacco prevalence data to one row per country and year.

    ``Prevalence`` is first averaged within every
    (``Country Name``, ``Year``, ``Indicator``, ``Sex``) group. Each of the six
    supported (indicator, sex) combinations then becomes its own column named
    ``<All|Male|Female>_Estimate_of_Current_Tobacco_<Use|Smoking>_Prevalence_age_standardized_rate``.
    Combinations absent for a country/year are left as NaN; country/year pairs
    that only carry other indicators or sexes still get a row (all NaN).

    Args:
        df: Long-format frame with the columns ``'Country Name'``, ``'Year'``,
            ``'Indicator'``, ``'Sex'`` and ``'Prevalence'``.
        save_path: When given, the reshaped frame is written there with
            ``DataFrame.to_excel``.

    Returns:
        Frame with ``'Country Name'``, ``'Year'`` and the six prevalence
        columns, sorted by country and year.
    """
    key_columns = ['Country Name', 'Year']
    # `Sex` value -> column-name prefix.
    sex_prefixes = {'Both sexes': 'All', 'Male': 'Male', 'Female': 'Female'}
    # `Indicator` value -> column-name suffix.
    indicator_suffixes = {
        'Estimate of current tobacco use prevalence (%) (age-standardized rate)':
            'Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate',
        'Estimate of current tobacco smoking prevalence (%) (age-standardized rate)':
            'Estimate_of_Current_Tobacco_Smoking_Prevalence_age_standardized_rate',
    }

    # One vectorised aggregation instead of a Python loop over get_group().
    means = (df.groupby(key_columns + ['Indicator', 'Sex'])['Prevalence']
             .mean()
             .reset_index())

    # Each entry is the Prevalence of the rows matching one (indicator, sex)
    # pair; it keeps those rows' index labels, so `assign` aligns it back onto
    # `means` and fills every other row with NaN. Iteration order fixes the
    # output column order: use (All, Male, Female), then smoking (same three).
    wide_columns = {
        f'{prefix}_{suffix}': means.loc[
            (means['Sex'] == sex) & (means['Indicator'] == indicator), 'Prevalence']
        for indicator, suffix in indicator_suffixes.items()
        for sex, prefix in sex_prefixes.items()
    }

    # `first()` takes the first non-null entry per column, collapsing the
    # sparse rows of a country/year into a single row.
    changed_df = (means[key_columns]
                  .assign(**wide_columns)
                  .groupby(key_columns)
                  .first()
                  .reset_index())

    # Kept on purpose: the notebook cell calling this discards the return
    # value, so this print is its only visible output.
    print(changed_df)
    if save_path is not None:
        changed_df.to_excel(save_path)
    return changed_df


In [6]:
if __name__ == '__main__':
    # Script entry: read the cleaned prevalence workbook, reshape it to one row
    # per country/year, and write the result next to the input.
    # NOTE(review): both locations are absolute paths on a single machine;
    # consider deriving them from a configurable data directory.
    save_path= '/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/Prevalence of Tobacco use_Changed_layout.xlsx'
    changed_df = pd.read_excel(
        '/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/Prevalence of Tobacco use_modified.xlsx'
    )
    # The return value is not captured; the function prints the reshaped frame itself.
    tobacco_layout_modified(df=changed_df, save_path=save_path)

     Country Name  Year  \
0     Afghanistan  2000   
1     Afghanistan  2005   
2     Afghanistan  2010   
3     Afghanistan  2015   
4     Afghanistan  2018   
...           ...   ...   
1355     Zimbabwe  2018   
1356     Zimbabwe  2019   
1357     Zimbabwe  2020   
1358     Zimbabwe  2023   
1359     Zimbabwe  2025   

      All_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate  \
0                                                  36.9                      
1                                                  32.7                      
2                                                  29.1                      
3                                                  26.0                      
4                                                  23.8                      
...                                                 ...                      
1355                                               12.1                      
1356                                               