In [1]:
import pandas as pd
import numpy as np
from typing import Optional, List
import matplotlib.pyplot as plt
import statsmodels.api as s
from statsmodels.tsa.stattools import grangercausalitytests
import pandas as pd
import statsmodels.api as sm

In [2]:
df = pd.read_excel("/Users/wei/UCD-MPH/MPH-Lecture:Modules/MPH Dissertation/Data/19_ratified_country.xlsx")

In [3]:
def plot_relationship(df: pd.DataFrame,
                      select_country: Optional[List[str]] = None,
                      variable_1: Optional[str] = None,
                      variable_2: Optional[str] = None,
                      variable_3: Optional[str] = None,
                      variable_4: Optional[str] = None,
                      x_label: Optional[str] = None,
                      y_label: Optional[str] = None,
                      save_path: Optional[str] = None) -> None:
    # convert 'Year' column to string type
    df['Year'] = df['Year'].astype(str)
    # filter by selected countries
    if select_country is not None:
        for country in select_country:
            country_df = df[df['Country Name'] == country]
            # create a new figure with 2 subplots
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(8, 8))
            # plot variable_1 and variable_2 on the first subplot
            ax1.plot(country_df['Year'], country_df[variable_1], color='b')
            ax1.plot(country_df['Year'], country_df[variable_2], color='r')
            ax1.set_xlabel(x_label or 'Year')
            ax1.set_ylabel('Prevalence of Tobacco Use & CVD Mortality', color='k')
            ax1.tick_params(axis='y', labelcolor='k')
            ax1.legend(['Prevalence of Tobacco Use in Males (%)', 'CVD Mortality in Males (%)'], loc='upper left')
            # plot variable_3 and variable_4 on the second subplot
            ax2.plot(country_df['Year'], country_df[variable_3], color='g')
            ax2.plot(country_df['Year'], country_df[variable_4], color='m')
            ax2.set_xlabel(x_label or 'Year')
            ax2.set_ylabel('Prevalence of Tobacco Use & CVD Mortality', color='k')
            ax2.tick_params(axis='y', labelcolor='k')
            ax2.legend(['Prevalence of Tobacco Use in Females (%)', 'CVD Mortality in Females (%)'], loc='upper left')
            # set titles and axis labels for the figure
            plt.suptitle(f'{country} Statistics')
            plt.xticks(country_df['Year'])
            
            if y_label is not None:
                fig.text(0.06, 0.5, y_label, va='center', rotation='vertical')
            if save_path is not None:
                if not os.path.exists(os.path.dirname(save_path)):
                    os.makedirs(os.path.dirname(save_path))
                fig.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()


In [None]:
select_country = df['Country Name'].unique()
variable_1 = 'Male_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'
variable_2 = 'Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'
variable_3 = 'Female_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'
variable_4 = 'Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'
x_label = 'Year'


plot_relationship(df,select_country = select_country,
                  variable_1 = variable_1,
                  variable_2 = variable_2,
                  variable_3 = variable_3,
                  variable_4 = variable_4,
                  x_label =x_label)


In [4]:
import pandas as pd
from scipy.stats import pearsonr

def evaluate_correlation(df):
    results = {}
    df['Year'] = df['Year'].astype(str)
    df['Ratified Year'] = df['Ratified Year'].astype(str)
    
    for country, country_data in df.groupby(['Country Name']):
        # Find the year before and after FCTC ratification
        before_year = df['Year'] < df['Ratified Year']
        after_year = df['Year'] > df['Ratified Year']
        
        # Calculate the correlation between smoking rates and CVD mortality before FCTC ratification
        before_data = df[before_year]
        before_corr_F, _ = pearsonr(before_data['Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                  before_data['Female_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        before_corr_M, _ = pearsonr(before_data['Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                  before_data['Male_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        
        # Calculate the correlation between smoking rates and CVD mortality after FCTC ratification
        after_data = df[after_year]
        after_corr_F, _ = pearsonr(after_data['Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                 after_data['Female_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        after_corr_M, _ = pearsonr(after_data['Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                 after_data['Male_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        
        # Store the results for the current country
        results[country] = {'before_corr_F': before_corr_F, 'after_corr_F': after_corr_F,
                            'before_corr_M': before_corr_M, 'after_corr_M': after_corr_M}
    return pd.DataFrame.from_dict(results, orient='index')


In [5]:
evaluate_correlation(df )

Unnamed: 0,before_corr_F,after_corr_F,before_corr_M,after_corr_M
Austria,0.317118,0.635634,0.761732,0.691563
Costa Rica,0.317118,0.635634,0.761732,0.691563
Czechia,0.317118,0.635634,0.761732,0.691563
Ecuador,0.317118,0.635634,0.761732,0.691563
Estonia,0.317118,0.635634,0.761732,0.691563
Georgia,0.317118,0.635634,0.761732,0.691563
Germany,0.317118,0.635634,0.761732,0.691563
Guatemala,0.317118,0.635634,0.761732,0.691563
Iceland,0.317118,0.635634,0.761732,0.691563
Kazakhstan,0.317118,0.635634,0.761732,0.691563


In [None]:
def interrupted_time_series_analysis(df):
    # 設定分析的變數
    male_smoking_pre = 'Male_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'
    male_cvd_pre = 'Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'
    female_smoking_pre = 'Female_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'
    female_cvd_pre = 'Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'

    # 建立一個空的DataFrame儲存結果
    results = pd.DataFrame(columns=['Country Name', 'Variable', 'Beta', 'P-value'])

    # 對每個國家進行分析
    for country in df['Country Name'].unique():
        # 篩選出目前分析的國家
        country_df = df[df['Country Name'] == country]

        # 將資料依照 Ratification 年份排序
        country_df = country_df.sort_values(by='Ratification')

        # 設定控制變數為 Ratification 年份以外的所有年份
        X_pre = sm.add_constant(country_df[country_df['Ratification'] != 1][[male_smoking_pre, male_cvd_pre, female_smoking_pre, female_cvd_pre]])
        y_pre = country_df[country_df['Ratification'] != 1]['Yearly_CVD_Death_Rate']

        # 設定實驗變數為 Ratification 年份
        X_post = sm.add_constant(country_df[country_df['Ratification'] == 1][[male_smoking_pre, male_cvd_pre, female_smoking_pre, female_cvd_pre]])
        y_post = country_df[country_df['Ratification'] == 1]['Yearly_CVD_Death_Rate']

        # 進行 interrupted time series analysis，計算出實驗前後的係數
        model_pre = sm.OLS(y_pre, X_pre).fit()
        model_post = sm.OLS(y_post, X_post).fit()
        beta_pre = model_pre.params
        beta_post = model_post.params

        # 計算實驗前後的差異
        diff = beta_post - beta_pre

        # 將結果加入到結果DataFrame中
        for variable, beta in diff.items():
            p_value = model_post.pvalues[variable]
            results = results.append({'Country Name': country, 'Variable': variable, 'Beta': beta, 'P-value': p_value}, ignore_index=True)

    return results


In [None]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt


In [None]:
interrupted_time_series_analysis(df)

In [None]:
import pandas as pd
from scipy.stats import pearsonr

def evaluate_correlation(df):
    results = {}
    df['Year'] = df['Year'].astype(str)
    df['Ratified Year'] = df['Ratified Year'].astype(str)
    for country in df['Country Name'].unique():
        # Subset data for the current country
        country_df = df[df['Country Name'] == country]
        
        # Find the year before and after FCTC ratification
        
        before_year = country_df['Year'] < country_df['Ratified Year']
        after_year = country_df['Year'] > country_df['Ratified Year']
        
        # Calculate the correlation between smoking rates and CVD mortality before FCTC ratification
        before_data = country_df[country_df['Year'] == before_year]
        before_corr_F, _ = pearsonr(before_data['Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                  before_data['Female_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        
        before_corr_M, _ = pearsonr(before_data['Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                  before_data['Male_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        
        # Calculate the correlation between smoking rates and CVD mortality after FCTC ratification
        after_data = country_data[country_df['Year'] == after_year]
        after_corr_F, _ = pearsonr(after_data['Female_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                 after_data['Female_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        
        after_corr_M, _ = pearsonr(after_data['Male_Total_Percentage_of_Cause_Specific_Deaths_Out_Of_Total_Deaths'],
                                 after_data['Male_Estimate_of_Current_Tobacco_Use_Prevalence_age_standardized_rate'])
        
        # Store the results for the current country
        results[country] = {'before_corr_F': before_corr_F, 'after_corr_F': after_corr_F,
                            'before_corr_M': before_corr_M, 'after_corr_M': after_corr_M}
        
    return pd.DataFrame.from_dict(results, orient='index')
