In [1]:
from pathlib import Path  # pathlib: module, Path: class. Checking if a path exist
from typing import Optional, List, Dict, Tuple  # typing: support for type hint
import pandas as pd
import numpy as np
import collections  # This module contains different datatype to process the data: dict, list, set, and tuple.
from pprint import pprint  # pprint.pprint() can use when you need to examine the structure of a large or complex
# data structure. this output reveals more readable and structured way.
from who_member_states import WHO_MEMBER_STATES

In [2]:

__all__ = ['select_df']  # only import 'select_df'


# test data should <10 MB

def select_df(df: pd.DataFrame,
              rename_mapping: Dict[str, str] = None,
              column_drop: Optional[List[str]] = None,  # column_drop (param) is an optional list of string. Optional
              # type is from Typing. default is None (no drop any column). If provide column_drop, must be ['xxx'] (
              # list of string)
              year: int = 2000,
              save_path: Optional[Path] = None,
              drop_na: Optional[List[str]] = None, ) -> pd.DataFrame:
    """
    dataframe modification and save as another file

    :param df: input dataframe
    :param rename_mapping: both df have countries but the header is different
    :param column_drop: drop the column(s) that are not informative
    :param year: pick up the data that larger than which year
    :param save_path: path for saving the modified dataframe
    :param drop_na: specify if drop the na-existing column name
    :return: modified df
    """

    df = df.copy()  # The copy() method returns a copy of the DataFrame. By default, the copy is a "deep copy"
    # meaning that any changes  made in the original DataFrame will NOT be reflected in the copy. 新跑出來的df不會影響最原始的df

    if rename_mapping is not None:
        df = df.rename(columns=rename_mapping)
    if column_drop is not None:  # if drop specific column,df need to drop the option
        df = df.drop(columns=column_drop)  # df 的column = column after drop, column_drop: Optional[List[str]]

    year_mask: pd.Series[bool] = df['Year'] >= year  # type is list of bool; compared df["year"] whether larger than
    # default year (year: int = 2000)
    entity_mask: List[bool] = [country in WHO_MEMBER_STATES for country in df['Country Name']]  # check countries whether
    # in df['Entity'] also in WHO_MEMBER_STATES. If Yes =True
    modified_df: pd.DataFrame = df[year_mask & entity_mask].reset_index(
        drop=True)  # create a new df that only meet both
    # year_mask and entity_mask. df.reset_index(drop = True) means new index created, old index don't added in new df.

    if drop_na is not None:  # drop rows with missing values ('NaN') from df
        try:
            modified_df.dropna(subset=drop_na, inplace=True)
            # inplace = True means that the original df will be modified and no copy will be made.; But, if inplace =
            # False, df will still show the initial one. subset = drop_na means drop in specific place you set.
        except KeyError as e:
            raise ValueError(f'{e} not in the dataframe, should be one of the {modified_df.columns.tolist()}')  # If
            # typed wrong, show the list which should be dropped.
    if save_path is not None:
        modified_df.to_excel(save_path)

    return modified_df


In [3]:
if __name__ == '__main__':
    raw_who_cvd_df = pd.read_csv('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/raw '
                                 'data/WHOMortalityDatabase_Deaths_sex_age_a_country_area_year-Cardiovascular '
                                 'diseases_7th February 2023.csv')
column_drop = ['Age group code', 'Unnamed: 12']
na_header = ['Number',
             'Percentage of cause-specific deaths out of total deaths',
             'Death rate per 100 000 population']
save_path = (
    '/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/WHO_Cardiovascular_Disease_Mortality_Database.xlsx')
who_cvd_df = select_df(raw_who_cvd_df, column_drop=column_drop, drop_na=na_header,
                       save_path=save_path)

In [4]:
number_of_countries=raw_who_cvd_df['Country Name'].unique()
print(len(number_of_countries),raw_who_cvd_df.shape, raw_who_cvd_df.columns.tolist())
print(len(who_cvd_df['Country Name'].unique()),who_cvd_df.shape,who_cvd_df.columns.tolist())


114 (297066, 13) ['Region Code', 'Region Name', 'Country Code', 'Country Name', 'Year', 'Sex', 'Age group code', 'Age Group', 'Number', 'Percentage of cause-specific deaths out of total deaths', 'Age-standardized death rate per 100 000 standard population', 'Death rate per 100 000 population', 'Unnamed: 12']
98 (98949, 11) ['Region Code', 'Region Name', 'Country Code', 'Country Name', 'Year', 'Sex', 'Age Group', 'Number', 'Percentage of cause-specific deaths out of total deaths', 'Age-standardized death rate per 100 000 standard population', 'Death rate per 100 000 population']


In [5]:
def preprocess_cvd(df: pd.DataFrame,
                   drop_na: Optional[List[str]] = None,
                   save_path: Optional[Path] = None) -> pd.DataFrame:
    """
    Dataframe of WHO_Cardiovascular_Disease_Mortality_Database.xlsx need to modify:
    - grouping if set kwarg `grouping_age` as true  # kwarg : keyword arguments
    - Calculate Total number of death
    - only left age >15 years old
    - save as another dataframe

    :param df: input dataframe (WHO_Cardiovascular_Disease_Mortality_Database.xlsx)
    :param drop_na: drop Age_Group <15 year-old
    :param save_path: save modified dataframe to another excel
    :return: df
    """
    
    # Sum of number of death in each age group
    numbers = df['Number']
    percentage = df['Percentage of cause-specific deaths out of total deaths']
    df["Total Number of Cause-Specific Deaths"] = numbers * 100 / percentage
    total_number_of_death = df["Total Number of Cause-Specific Deaths"]
    mask_nan = np.isnan(total_number_of_death)  # type: pd.Series[bool] # if value is NaN, NaN = True

    df.loc[mask_nan, "Total Number of Cause-Specific Deaths"] = 0  # search location of df. if index ('Total number of death') is
    # NaN, change NaN to 0. ( if no.loc :SettingWithCopyWarning: A value is trying to be set on a copy of a slice
    # from a DataFrame
    df["Total Number of Cause-Specific Deaths"] = df["Total Number of Cause-Specific Deaths"].astype(int)  # astype can cast/change multiple types (
    # change type to int)
    if drop_na is not None:  # drop rows with missing values ('NaN') from df
        df = df.mask(df['Age Group'].isin(['[0]', '[1-4]', '[5-9]', '[10-14]']), np.nan)
        df.dropna(subset=['Age Group'], inplace=True)
    
    if save_path is not None:
        df.to_excel(save_path)
    return df


In [6]:
who_cvd_df= pd.read_excel('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/WHO_Cardiovascular_Disease_Mortality_Database.xlsx')
# “xlrd” supports old-style Excel files (.xls).“openpyxl” supports newer Excel file formats.
drop_na =['Age Group']
save_path=('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/WHO_Cardiovascular_Disease_Mortality_Database_preprocess.xlsx')
who_cvd_df_preprocess = preprocess_cvd(who_cvd_df,drop_na= drop_na,save_path=save_path)  # assign a who_cvd_df after preprocess_cvd
print(len(who_cvd_df_preprocess['Country Name'].unique()),who_cvd_df_preprocess.shape,who_cvd_df_preprocess.columns.tolist())

98 (79239, 13) ['Unnamed: 0', 'Region Code', 'Region Name', 'Country Code', 'Country Name', 'Year', 'Sex', 'Age Group', 'Number', 'Percentage of cause-specific deaths out of total deaths', 'Age-standardized death rate per 100 000 standard population', 'Death rate per 100 000 population', 'Total Number of Cause-Specific Deaths']


In [7]:
def create_age_grouping(df: pd.DataFrame,
                        save_path: Optional[Path] = None) -> pd.DataFrame:
    """
    Calculate: Total percentage of CVD of total deaths = Sum of number/ Sum of Total number of cause-specific deaths
    * 100 (Male/ Female/ All in each year and country)
    grouping_age: Age groups --> one age group (greater 15 y/o)
    create a new df and save it to excel

    :param df: df after select_df and preprocess_cvd
    :param save: save modified dataframe to another excel
    :return: new df
    """

    if 'Total Number of Cause-Specific Deaths' not in df.columns:
        raise RuntimeError('call preprocess_cvd in advance')

    dy: dict = collections.defaultdict(list)  # defaultdict object in collections. datatype will be dict. Using list
    # as the default_factory to group a sequence of key-value pairs into a dictionary of lists
    group = df.groupby(['Region Code','Region Name','Country Code','Country Name', 'Year', 'Sex'])
    info: List[Tuple] = list(group.groups.keys())  # List[Tuple]: value is a list of tuple[()].looking for the keys in a
    # dict. The 'groups' attribute of the 'groupby' object is always dic type

    for i, it in enumerate(info):  # i = index ( starting from 0) , it = item (Entity, Year, Sex). enumerate can pair
        # index and item
        dy['Region Code'].append(it[0])
        dy['Region Name'].append(it[1]) 
        dy['Country Code'].append(it[2]) 
        dy['Country Name'].append(it[3])  
        dy['Year'].append(it[4])  # Year in [4]
        dy['Sex'].append(it[5])  # Sex in [5]

    numbers = group['Number']
    total_number_of_death = group['Total Number of Cause-Specific Deaths']
    # noinspection PyTypeChecker todo: wt is it?
    dy['Number'] = np.array(numbers.sum())
    dy['Total Number of Cause-Specific Deaths'] = np.array(total_number_of_death .sum())
    dy['Total Percentage of Cause-Specific Deaths Out Of Total Deaths'] = np.array(numbers.sum() / total_number_of_death.sum() * 100)

    _df = pd.DataFrame.from_dict(dy)  # creates a new_df from the dy dictionary.

    # change layout
    sex_values = ['All', 'Female', 'Male']
    new_df = _df.assign(
        All_Number=_df.query("Sex == 'All'")['Number'],
        Female_Number=_df.query("Sex == 'Female'")['Number'],
        Male_Number=_df.query("Sex == 'Male'")['Number'],
        All_Total_Number_of_Cause_Specific_Deaths=_df.query("Sex == 'All'")['Total Number of Cause-Specific Deaths'],
        Female_Total_Number_of_Cause_Specific_Deaths=_df.query("Sex == 'Female'")['Total Number of Cause-Specific Deaths'],
        Male_Total_Number_of_Cause_Specific_Deaths=_df.query("Sex == 'Male'")['Total Number of Cause-Specific Deaths'],
        All_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths=_df.query("Sex == 'All'")['Total Percentage of Cause-Specific Deaths Out Of Total Deaths'],
        Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths=_df.query("Sex == 'Female'")['Total Percentage of Cause-Specific Deaths Out Of Total Deaths'],
        Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths=_df.query("Sex == 'Male'")['Total Percentage of Cause-Specific Deaths Out Of Total Deaths'])
    new_df.reset_index(drop=True, inplace=True)

    new_df = new_df.drop(['Sex', 'Number', 'Total Number of Cause-Specific Deaths', 'Total Percentage of Cause-Specific Deaths Out Of Total Deaths'],
                         axis=1)  # axis = 1: specifies to drop columns

    new_df = new_df.groupby(['Region Code','Region Name','Country Code','Country Name', 'Year']).first().reset_index()  # The first method is then applied to
    # the grouped dataframe, which returns the first row of each group
    if save_path:
        new_df.to_excel(save_path)
    return new_df


In [8]:
who_cvd_df_preprocess = pd.read_excel('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/WHO_Cardiovascular_Disease_Mortality_Database_preprocess.xlsx')
who_cvd_df_preprocess= preprocess_cvd(who_cvd_df_preprocess)
save_path = ('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/new_WHO_Cardiovascular_Disease_Mortality_Database.xlsx')
new_df = create_age_grouping(who_cvd_df_preprocess,save_path=save_path)
print(len(new_df['Country Name'].unique()),new_df.shape,new_df.columns.tolist())

98 (1670, 14) ['Region Code', 'Region Name', 'Country Code', 'Country Name', 'Year', 'All_Number', 'Female_Number', 'Male_Number', 'All_Total_Number_of_Cause_Specific_Deaths', 'Female_Total_Number_of_Cause_Specific_Deaths', 'Male_Total_Number_of_Cause_Specific_Deaths', 'All_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths', 'Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths', 'Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths']


In [17]:
raw_tobacco_df = pd.read_csv(
        "/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/raw "
        "data/Prevalence_of_current_tobacco_use_between_Males_and_Females.csv")
number_of_countries=raw_tobacco_df['Entity'].unique()
print(len(number_of_countries),raw_tobacco_df.shape,raw_tobacco_df.columns.tolist())

310 (57051, 7) ['Entity', 'Code', 'Year', 'Prevalence of current tobacco use, males (% of male adults)', 'Prevalence of current tobacco use, females (% of female adults)', 'Population (historical estimates)', 'Continent']


In [26]:
rename= {'Entity':'Country Name','Code':'Country Code'}
na_header = ['Prevalence of current tobacco use, males (% of male adults)',
             'Prevalence of current tobacco use, females (% of female adults)']
col_drop=['Continent']
save_path = ('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/Prevalence of Tobacco use.xlsx')
tobacco_df = select_df(raw_tobacco_df,rename_mapping = rename,column_drop=col_drop, drop_na=na_header,save_path=save_path)
print(len(tobacco_df['Country Name'].unique()),tobacco_df.shape,tobacco_df.columns.tolist())

159 (1113, 6) ['Country Name', 'Country Code', 'Year', 'Prevalence of current tobacco use, males (% of male adults)', 'Prevalence of current tobacco use, females (% of female adults)', 'Population (historical estimates)']


rename = {'Prevalence of current tobacco use, males (% of male adults)':'Prevalence of Current Tobacco Use in Males (%)'
         ,'Prevalence of current tobacco use, females (% of female adults)':'Prevalence of Current Tobacco Use in Females (%)'}
tobacco_df = select_df(tobacco_df,rename_mapping= rename)



print(len(tobacco_df['Entity'].unique()),tobacco_df.shape,tobacco_df.columns.tolist())


In [27]:
# merge df1 & df2 test
df1 = new_df
df2 = tobacco_df

cvd_tobacco = pd.merge(df1, df2, on=['Country Name', 'Year'], how='outer')
cvd_tobacco.fillna(value='NaN', inplace=True)
cvd_tobacco.to_excel('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/merge_cvd_tobacco.xlsx')
print(len(cvd_tobacco['Country Name'].unique()),cvd_tobacco.shape, cvd_tobacco.columns.tolist())


171 (2345, 18) ['Region Code', 'Region Name', 'Country Code_x', 'Country Name', 'Year', 'All_Number', 'Female_Number', 'Male_Number', 'All_Total_Number_of_Cause_Specific_Deaths', 'Female_Total_Number_of_Cause_Specific_Deaths', 'Male_Total_Number_of_Cause_Specific_Deaths', 'All_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths', 'Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths', 'Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths', 'Country Code_y', 'Prevalence of current tobacco use, males (% of male adults)', 'Prevalence of current tobacco use, females (% of female adults)', 'Population (historical estimates)']


In [29]:
#drop_missing_data
df = pd.read_excel('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/merge_cvd_tobacco.xlsx')
df = df.dropna(how='any')  # drop the rows having Nan
df.to_excel('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/merge_cvd_tobacco_no_missingdata.xlsx')
countries = df['Country Name'].unique()
number_of_countries = len(countries)
print(number_of_countries, countries, df.shape)  # 85 countries


85 ['Egypt' 'Mauritius' 'Seychelles' 'South Africa' 'Armenia' 'Azerbaijan'
 'Bahrain' 'Cyprus' 'Georgia' 'Iraq' 'Israel' 'Jordan' 'Japan'
 'Kazakhstan' 'Kyrgyzstan' 'Kuwait' 'Lebanon' 'Sri Lanka' 'Maldives'
 'Philippines' 'Singapore' 'Thailand' 'Turkmenistan' 'Turkey' 'Uzbekistan'
 'Argentina' 'Belize' 'Brazil' 'Chile' 'Colombia' 'Costa Rica' 'Ecuador'
 'Guatemala' 'Guyana' 'Mexico' 'Panama' 'Peru' 'Paraguay' 'El Salvador'
 'Uruguay' 'Albania' 'Austria' 'Belgium' 'Bulgaria'
 'Bosnia and Herzegovina' 'Belarus' 'Switzerland' 'Czechia' 'Germany'
 'Denmark' 'Spain' 'Estonia' 'Finland' 'France' 'United Kingdom' 'Greece'
 'Croatia' 'Hungary' 'Ireland' 'Iceland' 'Italy' 'Lithuania' 'Luxembourg'
 'Latvia' 'Malta' 'Montenegro' 'Netherlands' 'Norway' 'Poland' 'Portugal'
 'Romania' 'Serbia' 'Slovakia' 'Slovenia' 'Sweden' 'Ukraine' 'Bahamas'
 'Barbados' 'Canada' 'Cuba' 'Dominican Republic' 'Jamaica' 'Australia'
 'Fiji' 'New Zealand'] (438, 19)


In [30]:
def format_date(df: pd.DataFrame,
                rename_mapping: Dict[str, str] = None,
                uniform_date: Optional[List[str]] = None,
                save_path: Optional[Path] = None) -> pd.DataFrame:
    if rename_mapping is not None:
        df = df.rename(columns=rename_mapping)
    if uniform_date is not None:
        for col in uniform_date:
            df[col] = pd.to_datetime(df[col], infer_datetime_format=True)
            df[col] = df[col].dt.strftime('%d/%m/%Y')
    df.fillna(value='Nan', inplace=True)
    if save_path is not None:
        df.to_excel(save_path)
    return df

In [32]:
df = pd.read_excel('/Users/wei/Python/MPHDissertation/test_file/WHO_FCTC_Parties_date_filter .xlsx')
rename = {"Ratification, Acceptance(A), Approval(AA), Formal confirmation(c), Accession(a), Succession(d)": 'Ratification'}
uniform_date = ['Signature', 'Ratification']
save_path = '/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/WHOFCTC_Parties_date_filter.xlsx'
format_date(df, rename_mapping=rename, uniform_date=uniform_date,save_path=save_path)

Unnamed: 0,Participant,Signature,Ratification
0,Albania,29/06/2004,26/04/2006
1,Argentina,25/09/2003,Nan
2,Armenia,Nan,29/11/2004
3,Australia,05/12/2003,27/10/2004
4,Austria,28/08/2003,15/09/2005
...,...,...,...
80,Turkmenistan,Nan,13/05/2011
81,Ukraine,25/06/2004,06/06/2006
82,United Kingdom of Great Britain and Northern I...,16/06/2003,16/12/2004
83,Uruguay,19/06/2003,09/09/2004


In [36]:
def ratified_parties(df: pd.DataFrame,
                     drop_na: None,
                     save_path: Optional[Path] = None) -> pd.DataFrame:
    if drop_na is not None:
        df = df.mask(df['Country Name'].isin(['Argentina', 'Cuba', 'Switzerland']), np.nan)
        df.dropna(subset=['Country Name'], inplace=True)
        df = df.dropna(how='any')
        
    if save_path is not None:
        df.to_excel(save_path)
    return df

In [37]:
df1 = pd.read_excel('/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/merge_cvd_tobacco_no_missingdata.xlsx')
drop_na = True
save_path = '/Users/wei/Python/MPHDissertation/src/test_ratifies.xlsx'
ratified_parties(df1, drop_na=drop_na, save_path=save_path)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Region Code,Region Name,Country Code_x,Country Name,Year,All_Number,Female_Number,Male_Number,All_Total_Number_of_Cause_Specific_Deaths,Female_Total_Number_of_Cause_Specific_Deaths,Male_Total_Number_of_Cause_Specific_Deaths,All_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths,Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths,Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths,Country Code_y,"Prevalence of current tobacco use, males (% of male adults)","Prevalence of current tobacco use, females (% of female adults)",Population (historical estimates)
0,1.0,1.0,AF,Africa,EGY,Egypt,2000.0,282581.0,129412.0,153169.0,696693.0,314021.0,382662.0,40.560333,41.211257,40.027230,EGY,47.500000,1.0,71371368.0
1,6.0,6.0,AF,Africa,EGY,Egypt,2005.0,376768.0,176597.0,200171.0,805718.0,364999.0,440716.0,46.761770,48.382872,45.419499,EGY,47.400002,0.8,79075312.0
2,11.0,11.0,AF,Africa,EGY,Egypt,2010.0,433181.0,203817.0,229364.0,923607.0,417768.0,505835.0,46.901009,48.787126,45.343640,EGY,47.900002,0.7,87252416.0
3,16.0,16.0,AF,Africa,EGY,Egypt,2015.0,526344.0,246653.0,279691.0,1084186.0,492163.0,592011.0,48.547389,50.116120,47.244224,EGY,48.099998,0.5,97723792.0
4,19.0,19.0,AF,Africa,EGY,Egypt,2018.0,566228.0,263042.0,303186.0,1064207.0,481643.0,582562.0,53.206566,54.613479,52.043559,EGY,48.200001,0.5,103740768.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
433,1646.0,1646.0,OA,Oceania,FJI,Fiji,2005.0,4681.0,1815.0,2866.0,11309.0,4905.0,6393.0,41.391812,37.003058,44.830283,FJI,45.500000,13.8,874929.0
434,1653.0,1653.0,OA,Oceania,NZL,New Zealand,2000.0,21832.0,11138.0,10694.0,52869.0,25565.0,27291.0,41.294520,43.567377,39.185079,NZL,30.000000,29.1,3855266.0
435,1658.0,1658.0,OA,Oceania,NZL,New Zealand,2005.0,21002.0,11178.0,9824.0,53826.0,27094.0,26724.0,39.018318,41.256367,36.760964,NZL,25.100000,23.5,4132782.0
436,1663.0,1663.0,OA,Oceania,NZL,New Zealand,2010.0,20337.0,10587.0,9750.0,56800.0,28394.0,28398.0,35.804577,37.286046,34.333404,NZL,21.299999,18.9,4346345.0


In [38]:

import matplotlib.pyplot as plt

WHOFCTC_Parties_date = pd.read_excel("/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/WHOFCTC_Parties_date_filter.xlsx")
CVD_Tobacco_ratified_parites = pd.read_excel("/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/CVD_Tobacco_Parties_ratified.xlsx")
WHOFCTC_Parties_date = WHOFCTC_Parties_date[WHOFCTC_Parties_date['Ratification'] != 'Nan']
WHOFCTC_Parties_date['Ratification'] = pd.to_datetime(WHOFCTC_Parties_date['Ratification'], infer_datetime_format='%d/%m/%Y')
WHOFCTC_Parties_date['Year'] = WHOFCTC_Parties_date['Ratification'].dt.year
WHOFCTC_Parties_date

Unnamed: 0.1,Unnamed: 0,Participant,Signature,Ratification,Year
0,0,Albania,29/06/2004,2006-04-26,2006
2,2,Armenia,Nan,2004-11-29,2004
3,3,Australia,05/12/2003,2004-10-27,2004
4,4,Austria,28/08/2003,2005-09-15,2005
5,5,Azerbaijan,Nan,2005-11-01,2005
...,...,...,...,...,...
80,80,Turkmenistan,Nan,2011-05-13,2011
81,81,Ukraine,25/06/2004,2006-06-06,2006
82,82,United Kingdom of Great Britain and Northern I...,16/06/2003,2004-12-16,2004
83,83,Uruguay,19/06/2003,2004-09-09,2004


In [39]:
CVD_Tobacco_ratified_parites = CVD_Tobacco_ratified_parites[CVD_Tobacco_ratified_parites['Country'] == 'Australia']
years = CVD_Tobacco_ratified_parites['Year'].astype(str)
plt.plot(years, CVD_Tobacco_ratified_parites['Male_total_percentage_of_CVD'], label='Total Percentage of CVD in Males')
plt.plot(years, CVD_Tobacco_ratified_parites['Prevalence of current tobacco use, males (% of male adults)'], label='Prevalence of Current Tobacco Use in Males')
plt.xlabel('Year')
plt.ylabel('Percentage/Prevalence')
plt.legend()
plt.xticks(years, rotation=90)
plt.title('CVD and Tobacco Use in Australia')
plt.show()


KeyError: 'Country'

In [None]:

def plot_relationship(df: pd.DataFrame,
                      select_country: Optional[List[str]] = None,
                      variable_1: Optional[List[str]] = None,
                      variable_2: Optional[List[str]] = None,
                      x_label: str = None,
                      y_label: str = None,
                      save_path: Optional[Path] = None) -> pd.DataFrame:
    df['Year'] = df['Year'].astype(str)
    if select_country is not None:
        for country in select_country:
            df = df[df['Entity'] == country]
        #df = df[df['Entity'].isin(select_country)]
            if variable_1 is not None:
                plt.plot(df['Year'], df[variable_1[0]], label=variable_1[0])
            if variable_2 is not None:
                plt.plot(df['Year'], df[variable_2[0]], label=variable_2[0])
            if x_label is not None:
                plt.xlabel(x_label)
            if y_label is not None:
                plt.ylabel(y_label)
            plt.legend()
            plt.xticks(df['Year'], rotation=90)

            plt.title(f'CVD Mortality and Prevalence of Tobacco Use in {select_country[0]}')
            plt.show()

    if save_path is not None:
        df.to_excel(save_path)
    return df



In [None]:
def plot_relationship(df: pd.DataFrame,
                      select_country: Optional[List[str]] = None,
                      variable_1: Optional[List[str]] = None,
                      variable_2: Optional[List[str]] = None,
                      x_label: str = None,
                      y_label: str = None,
                      save_path: Optional[Path] = None) -> pd.DataFrame:
    df['Year'] = df['Year'].astype(str)
    if select_country is not None:
        for country in select_country:
            country_df = df[df['Entity'] == country]
            if variable_1 is not None:
                plt.plot(country_df['Year'], country_df[variable_1[0]], label=variable_1[0])
            if variable_2 is not None:
                plt.plot(country_df['Year'], country_df[variable_2[0]], label=variable_2[0])
            if x_label is not None:
                plt.xlabel(x_label)
            if y_label is not None:
                plt.ylabel(y_label)
            plt.legend()
            plt.xticks(country_df['Year'], rotation=90)

            plt.title(f'CVD Mortality and Prevalence of Tobacco Use in {country}')
            plt.show()

    if save_path is not None:
        df.to_excel(save_path)
    return df
if __name__ == '__main__':
    df= pd.read_excel(
        "/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/CVD_Tobacco_Parties_ratified.xlsx")
    select_country =df['Entity'].unique()
    variable_1 = ['Female_total_percentage_of_CVD']
    variable_2 = ['Prevalence of current tobacco use, females (% of female adults)']
    x_label = 'Year'
    y_label = 'Total Percentage of CVD Mortality & Prevalence of Tobacco Use'
    for country in select_country:
        df = plot_relationship(df, select_country=[country], variable_1=variable_1, variable_2=variable_2,
                           x_label=x_label, y_label=y_label)
