In [1]:
import pandas as pd

# Load the three raw input tables from the local datasets/ folder.
# The files are read in the order listed; the source page for each one is
# noted next to its path.
df_cancer_rate, df_pollution, df_smoking = map(pd.read_csv, (
    # https://www.kaggle.com/datasets/programmerrdai/cancer?select=cancer-death-rates-by-type.csv
    'datasets/cancer-death-rates-by-type.csv',
    # https://ourworldindata.org/explorers/air-pollution?tab=table&uniformYAxis=0&Pollutant=All+pollutants&Sector=From+all+sectors+%28Total%29&Per+capita=false&country=USA~CHN~IND~GBR~OWID_WRL
    'datasets/air-pollution.csv',
    # https://ourworldindata.org/grapher/daily-smoking-prevalence-bounds?tab=chart
    'datasets/smoking.csv',
))

In [2]:
# Reduce the cancer table to country code, year and the tracheal/bronchus/lung
# (TBL) death-rate series, under a short column name.
#
# The rename runs BEFORE the column selection on purpose: rename() silently
# skips labels that are not present, so re-running this cell on the already
# trimmed frame is a no-op instead of a KeyError on the long source column.
df_cancer_rate = (
    df_cancer_rate
    # Rename columns
    .rename(columns={'Deaths - Tracheal, bronchus, and lung cancer - Sex: Both - Age: Age-standardized (Rate)': 'TBL rate'})
    # Select columns
    .loc[:, ['Code', 'Year', 'TBL rate']]
    # Delete non-country entries (rows whose 'Code' is missing)
    .dropna(subset=['Code'])
)
# Display dataframe
df_cancer_rate.head(5)

Unnamed: 0,Code,Year,TBL rate
0,AFG,1990,14.069657
1,AFG,1991,13.80559
2,AFG,1992,13.591141
3,AFG,1993,13.422799
4,AFG,1994,13.250556


In [3]:
# Reduce the smoking table to country code, year and the daily-smoking
# prevalence series, under a short column name.
#
# The rename runs BEFORE the column selection on purpose: rename() silently
# skips labels that are not present, so re-running this cell on the already
# trimmed frame is a no-op instead of a KeyError on the long source column.
df_smoking = (
    df_smoking
    # Rename columns
    .rename(columns={'Daily smoking prevalence - both (IHME, GHDx (2012))': 'Percentage daily smoker'})
    # Select columns
    .loc[:, ['Code', 'Year', 'Percentage daily smoker']]
    # Delete non-country entries (rows whose 'Code' is missing)
    .dropna(subset=['Code'])
)
# Display dataframe
df_smoking.head(5)

Unnamed: 0,Code,Year,Percentage daily smoker
0,AFG,1980,10.4
1,AFG,1981,10.5
2,AFG,1982,10.5
3,AFG,1983,10.5
4,AFG,1984,10.6


In [4]:
# Trim the pollution table: remove the 'Entity' column and keep every other
# column as-is.
df_pollution = (
    df_pollution
    # errors='ignore' keeps this cell re-runnable: once 'Entity' is gone a
    # second execution would otherwise raise KeyError.
    .drop(columns='Entity', errors='ignore')
    # Delete non-country entries (rows whose 'Code' is missing).
    # NOTE(review): aggregates that carry an OWID_* code (e.g. OWID_WRL, which
    # appears in the source URL) have a non-null 'Code' and would pass this
    # filter - confirm whether they should be excluded as well.
    .dropna(subset=['Code'])
)
# Display dataframe
df_pollution.head(5)

Unnamed: 0,Code,Year,Nitrogen oxide (NOx),Sulphur dioxide (SO₂) emissions,Carbon monoxide (CO) emissions,Organic carbon (OC) emissions,Non-methane volatile organic compounds (NMVOC) emissions,Black carbon (BC) emissions,Ammonia (NH₃) emissions
0,AFG,1750,555.4786,174.87167,142073.31,5456.885,13596.633,1633.0308,7681.0464
1,AFG,1760,578.50757,181.99332,147859.23,5679.1167,14150.87,1699.5359,8000.8574
2,AFG,1770,602.4798,189.3885,153867.4,5909.884,14726.47,1768.5956,8333.961
3,AFG,1780,627.4322,197.06535,160104.42,6149.441,15324.077,1840.2854,8680.906
4,AFG,1790,653.4031,205.03189,166576.77,6398.037,15944.332,1914.6805,9042.266


In [5]:
# Year window applied to all three tables; both bounds are inclusive
# (Series.between includes both endpoints by default).
# NOTE(review): presumably the span where all three sources have data -
# confirm against the raw files before changing it.
begin_year, end_year = 1990, 2012

# Each filter writes to a new *_year name, leaving the cleaned input frames
# untouched.
df_cancer_year = df_cancer_rate.loc[
    lambda frame: frame['Year'].between(begin_year, end_year)
]
df_smoking_year = df_smoking.loc[
    lambda frame: frame['Year'].between(begin_year, end_year)
]
df_pollution_year = df_pollution.loc[
    lambda frame: frame['Year'].between(begin_year, end_year)
]

In [6]:
# Build one table keyed on (Code, Year).
# Step 1 (inner call): attach the cancer rate to each smoking row.
# Step 2 (outer call): attach that result to each pollution row.
# Both steps are left joins, so a pollution row with no smoking match, or a
# smoking row with no cancer match, ends up with a missing 'TBL rate'.
# Dropping those rows leaves only (Code, Year) pairs found in all three
# tables with a non-null cancer rate.
df_combined = (
    df_pollution_year
    .merge(
        df_smoking_year.merge(df_cancer_year, how='left', on=['Code', 'Year']),
        how='left',
        on=['Code', 'Year'],
    )
    .dropna(subset=['TBL rate'])
)

# Preview the first rows of the merged table
df_combined.head(5)

Unnamed: 0,Code,Year,Nitrogen oxide (NOx),Sulphur dioxide (SO₂) emissions,Carbon monoxide (CO) emissions,Organic carbon (OC) emissions,Non-methane volatile organic compounds (NMVOC) emissions,Black carbon (BC) emissions,Ammonia (NH₃) emissions,Percentage daily smoker,TBL rate
0,AFG,1990,425144.75,12876.961,1013430.94,28106.21,404866.4,8362.603,73274.35,10.8,14.069657
1,AFG,1991,413349.72,12671.984,983752.06,28803.91,381666.22,8494.117,77547.38,10.9,13.80559
2,AFG,1992,272757.1,7732.831,654986.94,29685.04,242334.94,8487.974,83017.66,11.0,13.591141
3,AFG,1993,276675.4,7967.0625,662752.9,30824.832,240105.75,8756.007,89469.49,11.1,13.422799
4,AFG,1994,252820.98,7698.993,657333.0,32070.424,234383.25,9055.427,95695.22,11.2,13.250556


In [7]:
# Write the merged table to disk as CSV.
# NOTE(review): to_csv also writes the row index as an unnamed first column
# by default; pass index=False if readers of this file do not need it.
combined_csv_path = 'datasets/combined.csv'
df_combined.to_csv(combined_csv_path)