In [2]:
import pandas as pd

import plotly
import plotly.graph_objs as go
import plotly.express as px

In [3]:
# Folder holding the CSV inputs, relative to the notebook's working directory.
data_dir = 'Datasets/'
# Tobacco-use values per location, year and gender.
ww_use_df = pd.read_csv(data_dir + "tobacco_use_ww.csv")
# Per-country cigarette price, tax share and policy scores for the surveyed years.
stop_smoking_df = pd.read_csv(data_dir + "stop_smoking.csv")
# Smoking-related death figures per entity and year, split by age group.
death_df = pd.read_csv(data_dir + "death_rates_smoking_age.csv")

## Worldwide tobacco use

In [4]:
# Preview the worldwide tobacco-use table.
ww_use_df

Unnamed: 0,ParentLocationCode,ParentLocation,SpatialDimValueCode,Location,Year,Gender,Value
0,SEAR,South-East Asia,PRK,Democratic People's Republic of Korea,2018,Female,0.0
1,EUR,Europe,AZE,Azerbaijan,2018,Female,0.2
2,AFR,Africa,ERI,Eritrea,2018,Female,0.3
3,EMR,Eastern Mediterranean,EGY,Egypt,2018,Female,0.4
4,AFR,Africa,GHA,Ghana,2018,Female,0.4
...,...,...,...,...,...,...,...
4018,WPR,Western Pacific,TUV,Tuvalu,2000,Male,84.6
4019,AMR,Americas,JAM,Jamaica,2000,Female,9.1
4020,AFR,Africa,STP,Sao Tome and Principe,2000,Male,9.2
4021,EMR,Eastern Mediterranean,OMN,Oman,2000,Both sexes,9.5


In [20]:
def plot_country_use(country_name):
    """Show a line chart of tobacco use over time for one country.

    Reads the notebook-level ``ww_use_df`` frame, keeps the rows whose
    ``Location`` equals ``country_name`` and whose ``Gender`` is
    ``'Both sexes'``, and plots ``Value`` against ``Year`` with markers.

    Args:
        country_name: Country name exactly as it appears in the ``Location``
            column; it is also used as the figure title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    in_country = ww_use_df["Location"] == country_name
    both_sexes_df = ww_use_df[in_country].query("Gender == 'Both sexes'")
    line_fig = px.line(
        both_sexes_df,
        x="Year",
        y="Value",
        markers=True,
        title=country_name,
    )
    line_fig.show()


In [21]:
# Example: tobacco-use trend for Canada (both sexes).
plot_country_use("Canada")

## Regulations

Notes

- We should compare cigarette tax with baseline taxes

In [33]:
# Preview the regulation table (prices, tax share, ad-ban and help-to-quit scores).
stop_smoking_df

Unnamed: 0,Entity,Code,Year,AvgCigarettePriceDollars,AvgTaxesAsPctCigarettePrice,EnforceBansTobaccoAd,HelpToQuit
0,Algeria,DZA,2012,1.84,47.0,4,3
1,Algeria,DZA,2014,2.09,40.9,4,4
2,Argentina,ARG,2012,2.79,69.9,4,4
3,Argentina,ARG,2014,3.47,69.7,4,5
4,Armenia,ARM,2012,1.69,38.4,2,4
...,...,...,...,...,...,...,...
769,Yemen,YEM,2010,,,4,3
770,Zambia,ZMB,2007,,,2,3
771,Zambia,ZMB,2010,,,2,3
772,Zimbabwe,ZWE,2007,,,2,3


In [35]:
# List the distinct survey years present in the regulation data.
stop_smoking_df["Year"].unique()

array([2012, 2014, 2007, 2010])

In [6]:
# Summary statistics of the numeric columns; the count row exposes how many
# price / tax values are missing relative to the policy scores.
stop_smoking_df.describe()

Unnamed: 0,Year,AvgCigarettePriceDollars,AvgTaxesAsPctCigarettePrice,EnforceBansTobaccoAd,HelpToQuit
count,774.0,208.0,209.0,774.0,774.0
mean,2010.755814,4.336394,57.339234,3.313953,3.49354
std,2.587373,2.534659,20.403278,1.088995,0.807042
min,2007.0,0.0,0.0,2.0,1.0
25%,2010.0,2.195,42.9,2.0,3.0
50%,2012.0,4.155,62.4,4.0,4.0
75%,2013.5,5.7675,75.2,4.0,4.0
max,2014.0,13.0,86.4,5.0,5.0


In [13]:
# Marker-size column for the scatter plot: HelpToQuit divided by 5, then squared
# (presumably to exaggerate size differences between scores).
# NOTE: this adds a column to stop_smoking_df in place.
stop_smoking_df["HelpSize"] = (stop_smoking_df["HelpToQuit"] / 5.) ** 2

In [14]:
# Tax share vs. ad-ban enforcement, one marker per row, sized by the HelpSize column.
px.scatter(
    stop_smoking_df,
    x="AvgTaxesAsPctCigarettePrice",
    y="EnforceBansTobaccoAd",
    size="HelpSize",
    hover_name="Entity",
)

## Deaths

Notes
- China's death rate looks surprisingly low. Is it underreported?

In [17]:
# Peek at the first rows of the death table.
death_df.head()

Unnamed: 0,Entity,Code,Year,All_ages,Under_5,5_14,50_69,15_49,70_plus
0,Afghanistan,AFG,1990,63.895905,,,267.230009,16.589519,679.006755
1,Afghanistan,AFG,1991,61.846347,,,266.975516,15.456913,677.617648
2,Afghanistan,AFG,1992,53.436511,,,266.430053,12.767999,679.50581
3,Afghanistan,AFG,1993,47.044347,,,267.969428,11.000425,683.973588
4,Afghanistan,AFG,1994,45.799808,,,272.403687,10.73802,691.007773


In [29]:
# First and last year covered by the death data.
death_df["Year"].min(), death_df["Year"].max()

(1990, 2017)

In [32]:
# Number of distinct Entity values in the death data.
len(death_df["Entity"].unique())

231

In [33]:
# Entities to compare in the death plot.
countries = ["Canada", "China", "United States", "France", "Greece"]

In [34]:
# All-ages death figure over time, one coloured line per selected entity.
px.line(
    death_df[death_df["Entity"].isin(countries)],
    x="Year",
    y="All_ages",
    color="Entity",
)

## Effect of regulation on tobacco usage (no baseline)

Notes:
- Use 2000 as baseline?

In [87]:
def merge_reg_use(reg_df, use_df, year_delay=1):
    """Pair each regulation record with the smoking rate ``year_delay`` years later.

    Neither input frame is modified.

    Args:
        reg_df: Regulation frame with at least ``Code`` and ``Year`` columns.
        use_df: Tobacco-use frame with ``SpatialDimValueCode``, ``Year``,
            ``Gender`` and ``Value`` columns. Only ``'Both sexes'`` rows are
            used; ``SpatialDimValueCode`` and ``Value`` are renamed to
            ``Code`` and ``Pct_Smoking``.
        year_delay: Number of years added to the regulation ``Year`` to get
            the ``EffectiveYear`` that is matched against the usage ``Year``.

    Returns:
        Inner join of the two frames on ``Code`` and
        ``EffectiveYear`` == usage ``Year``. Because both inputs carry a
        ``Year`` column, the result holds ``Year_x`` (regulation) and
        ``Year_y`` (usage).
    """
    usage = (
        use_df
        .query("Gender == 'Both sexes'")
        .rename(columns={"SpatialDimValueCode": "Code", "Value": "Pct_Smoking"})
    )
    regulation = reg_df.assign(EffectiveYear=reg_df["Year"] + year_delay)
    return pd.merge(
        regulation,
        usage,
        left_on=["Code", "EffectiveYear"],
        right_on=["Code", "Year"],
    )


In [88]:
# Pair each regulation record with the smoking rate three years later
# (year_delay=3), then preview the merged frame.
reg_use_df = merge_reg_use(stop_smoking_df, ww_use_df, year_delay=3)
reg_use_df.head()

Unnamed: 0,Entity,Code,Year_x,AvgCigarettePriceDollars,AvgTaxesAsPctCigarettePrice,EnforceBansTobaccoAd,HelpToQuit,EffectiveYear,ParentLocationCode,ParentLocation,Location,Year_y,Gender,Pct_Smoking
0,Algeria,DZA,2012,1.84,47.0,4,3,2015,AFR,Africa,Algeria,2015,Both sexes,19.3
1,Algeria,DZA,2014,2.09,40.9,4,4,2017,AFR,Africa,Algeria,2017,Both sexes,18.9
2,Argentina,ARG,2012,2.79,69.9,4,4,2015,AMR,Americas,Argentina,2015,Both sexes,24.6
3,Argentina,ARG,2014,3.47,69.7,4,5,2017,AMR,Americas,Argentina,2017,Both sexes,22.7
4,Armenia,ARM,2012,1.69,38.4,2,4,2015,EUR,Europe,Armenia,2015,Both sexes,27.9


In [89]:
# Tax share vs. ad-ban enforcement, with marker size given by the delayed smoking rate.
px.scatter(
    reg_use_df,
    x="AvgTaxesAsPctCigarettePrice",
    y="EnforceBansTobaccoAd",
    size="Pct_Smoking",
    hover_name="Entity",
)

Question: Do countries with higher taxes on cigarettes have a lower percentage of smokers?

Graph: Percent of smokers in a country after an onset period vs taxes imposed on cigarettes.

In [99]:
# Delayed smoking rate against tax share, with an OLS trendline overlaid.
px.scatter(
    reg_use_df,
    x="AvgTaxesAsPctCigarettePrice",
    y="Pct_Smoking",
    title="Percent Smoker vs. Taxes",
    trendline="ols",
)

Question: Do countries with stricter regulation on cigarette ads have a lower percentage of smokers?

Graph: Percent of smokers in a country after an onset period vs ban on cigarette ads.

In [100]:
# Mean of every numeric column per ad-ban enforcement level, plus the standard
# deviation of the smoking rate for the error bars.
# numeric_only=True / explicit column selection keep the text columns (Entity,
# Code, Location, Gender, ...) out of the aggregation; on pandas >= 2.0 a bare
# .mean() / .std() raises TypeError on them.
ban_df = reg_use_df.groupby("EnforceBansTobaccoAd").mean(numeric_only=True)
ban_df["Smoking_Std"] = reg_use_df.groupby("EnforceBansTobaccoAd")["Pct_Smoking"].std()

px.bar(ban_df, x=ban_df.index, y="Pct_Smoking", title="Percent Smoker vs. Ads ban", error_y="Smoking_Std")

## Looking at consistency of policy across years

In [4]:
# Distribution of the per-country standard deviation of the ad-ban score across
# survey years. The column is selected before .std() so the text column (Code)
# is never aggregated; on pandas >= 2.0 that would raise TypeError.
px.histogram(stop_smoking_df.groupby("Entity")["EnforceBansTobaccoAd"].std())

In [5]:
# Distribution of the per-country standard deviation of the tax share across
# survey years. The column is selected before .std() so the text column (Code)
# is never aggregated; on pandas >= 2.0 that would raise TypeError.
px.histogram(stop_smoking_df.groupby("Entity")["AvgTaxesAsPctCigarettePrice"].std())

## Effect of regulation on tobacco usage (self baseline)

Take tobacco usage in 2018 and subtract the value in 2000 to get each country's change against its own baseline.
Average each regulation measure across the survey years, since there is little variance over time for most countries.

In [25]:
def merge_reg_use_with_self_baseline(reg_df, use_df, recent_year=2018, baseline_year=2000):
    """Join per-country average regulation with the change in smoking rate.

    The change is measured against each country's own baseline:
    ``Pct_Smoking`` in ``recent_year`` minus ``Pct_Smoking`` in
    ``baseline_year``. Neither input frame is modified.

    Args:
        reg_df: Regulation frame with ``Code``, ``AvgTaxesAsPctCigarettePrice``,
            ``EnforceBansTobaccoAd`` and ``AvgCigarettePriceDollars`` columns.
        use_df: Tobacco-use frame with ``SpatialDimValueCode``, ``Year``,
            ``Gender`` and ``Value`` columns. Only ``'Both sexes'`` rows are used.
        recent_year: Year whose smoking rate is compared against the baseline.
        baseline_year: Year used as each country's baseline.

    Returns:
        One row per ``Code`` present in the regulation data and in both years
        of the usage data. Columns: ``Code``, the three averaged regulation
        columns, the remaining usage columns for ``recent_year``,
        ``Pct_Smoking_x`` (recent), ``Pct_Smoking_y`` (baseline) and
        ``Pct_Smoking_Change`` (recent minus baseline).
    """
    regulation_cols = ["AvgTaxesAsPctCigarettePrice", "EnforceBansTobaccoAd", "AvgCigarettePriceDollars"]

    use_df = use_df.rename(columns={"SpatialDimValueCode": "Code", "Value": "Pct_Smoking"}).query("Gender == 'Both sexes'")
    # Build the year masks from use_df itself. The previous version indexed with an
    # undefined name (test_df), which only worked with leftover kernel state.
    recent_use_df = use_df[use_df["Year"] == recent_year]
    baseline_use_df = use_df[use_df["Year"] == baseline_year][["Code", "Pct_Smoking"]]
    # Both sides carry Pct_Smoking, so the merge yields Pct_Smoking_x / Pct_Smoking_y.
    recent_use_df = pd.merge(recent_use_df, baseline_use_df, on="Code")
    recent_use_df["Pct_Smoking_Change"] = recent_use_df["Pct_Smoking_x"] - recent_use_df["Pct_Smoking_y"]

    # Select the numeric regulation columns before averaging so text columns
    # (Entity, ...) are never passed to mean(), which raises on pandas >= 2.0.
    avg_reg_df = reg_df.groupby("Code", as_index=False)[regulation_cols].mean()
    return pd.merge(avg_reg_df, recent_use_df, on="Code")

In [30]:
# One row per country: averaged regulation measures joined to the change in
# smoking rate. NOTE: this rebinds reg_use_df, which earlier held the
# merge_reg_use() result with a different set of columns.
reg_use_df = merge_reg_use_with_self_baseline(stop_smoking_df, ww_use_df)
reg_use_df

Unnamed: 0,Code,AvgTaxesAsPctCigarettePrice,EnforceBansTobaccoAd,AvgCigarettePriceDollars,ParentLocationCode,ParentLocation,Location,Year,Gender,Pct_Smoking_x,Pct_Smoking_y,Pct_Smoking_Change
0,ALB,,5.00,,EUR,Europe,Albania,2018,Both sexes,29.2,34.2,-5.0
1,AND,,2.00,,EUR,Europe,Andorra,2018,Both sexes,33.8,36.3,-2.5
2,ARE,22.80,4.25,1.685,EMR,Eastern Mediterranean,United Arab Emirates,2018,Both sexes,18.2,21.8,-3.6
3,ARG,69.80,3.00,3.130,AMR,Americas,Argentina,2018,Both sexes,21.8,46.2,-24.4
4,ARM,41.25,2.00,1.715,EUR,Europe,Armenia,2018,Both sexes,26.7,34.3,-7.6
...,...,...,...,...,...,...,...,...,...,...,...,...
144,WSM,59.50,3.50,5.960,WPR,Western Pacific,Samoa,2018,Both sexes,28.9,41.3,-12.4
145,YEM,53.30,4.25,1.825,EMR,Eastern Mediterranean,Yemen,2018,Both sexes,20.9,30.9,-10.0
146,ZAF,47.85,4.00,5.320,AFR,Africa,South Africa,2018,Both sexes,31.4,38.1,-6.7
147,ZMB,23.55,2.00,1.745,AFR,Africa,Zambia,2018,Both sexes,14.7,19.9,-5.2


In [31]:
# Exploratory view: mean change in smoking rate per ParentLocation region.
# numeric_only=True keeps the text columns (Code, Location, Gender, ...) out of
# the aggregation; on pandas >= 2.0 a bare .mean() raises TypeError on them.
recent_use_df_by_area = reg_use_df.groupby("ParentLocation").mean(numeric_only=True)
px.bar(recent_use_df_by_area, x=recent_use_df_by_area.index, y="Pct_Smoking_Change")

Change in percentage of smokers from 2000-2018 as a function of average tax percentage

Observations:
- Slight correlation between taxes and smoking rate
- Taxes not very effective?

In [32]:
# Change in smoking rate against the averaged tax share, with an OLS trendline.
# The title names the plotted quantity (the change), not the raw smoking rate.
px.scatter(
    reg_use_df,
    x="AvgTaxesAsPctCigarettePrice",
    y="Pct_Smoking_Change",
    title="Change in Percent Smoker (2000-2018) vs. Taxes",
    trendline="ols",
)

In [41]:
# Average change in smoking rate per (averaged) ad-ban enforcement score.
fig = px.histogram(
    reg_use_df,
    x="EnforceBansTobaccoAd",
    y="Pct_Smoking_Change",
    histfunc="avg",
)
# Leave a gap between bars so neighbouring bins are visually separated.
fig.update_layout(bargap=0.2)
fig.show()