# Data schoonmaken

In dit bestand worden alle dataframes per grafiek opgeschoond en geëxporteerd als csv-bestanden, zodat de Datastory goed leesbaar blijft.

In [69]:
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go 
from plotly.subplots import make_subplots

In [70]:
# Read the four raw source tables; the cleaning cells below build on these frames.
raw_csv_paths = (
    "csv/CancerDeaths.csv",  # cancer data set
    "csv/Emissions.csv",  # emissions data set
    "csv/Population.csv",  # population data set
    "csv/smoking.csv",  # smoking data set
)
cancer_df, emissions_df, population_df, smoking_df = map(pd.read_csv, raw_csv_paths)

In [71]:
# DataFrames for chart 1: yearly lung cancer figures and total emissions.

# Labels of the rows to keep in each data set (presumably the worldwide
# aggregates -- confirm against the raw files). Replace to chart another entity.
cancer_country = 'World'
emissions_country = 'Global'

# Row masks: matching label, from 2001 onwards.
cancer_rows = cancer_df['Country'].eq(cancer_country) & cancer_df['Year'].ge(2001)
emissions_rows = emissions_df['Country'].eq(emissions_country) & emissions_df['Year'].ge(2001)

cancer_data = cancer_df.loc[cancer_rows]
emissions_data = emissions_df.loc[emissions_rows]

# Keep only the year plus the single measure that is plotted.
# The lung cancer column is addressed by its raw CSV header, trailing space
# included; it will not match once header whitespace has been stripped from
# cancer_df.
year_lung_cancer = cancer_data.loc[:, ['Year', "Tracheal, bronchus, and lung cancer "]]
year_total_emissions = emissions_data.loc[:, ['Year', 'Total']]

# Save the cleaned data
output_file_cancer = 'csv/df/cancer_cleaned_data.csv'
output_file_emissions = 'csv/df/emissions_cleaned_data.csv'
for cleaned_frame, target_path in (
    (year_lung_cancer, output_file_cancer),
    (year_total_emissions, output_file_emissions),
):
    cleaned_frame.to_csv(target_path, index=False)



In [72]:
# DataFrames for the choropleth maps: average per-capita lung cancer and
# emissions per country.

# Work on copies so the raw frames stay exactly as loaded. The chart 1 cell
# selects the lung cancer column by its raw header (trailing space included) and
# the comma removal below requires PopTotal to still be text; mutating the raw
# frames in place would make both cells fail when re-run.
df_lung = cancer_df.copy()
df_emissions = emissions_df.copy()
df_population = population_df.copy()

# Strip any leading/trailing whitespace from column names
df_lung.columns = df_lung.columns.str.strip()
df_emissions.columns = df_emissions.columns.str.strip()
df_population.columns = df_population.columns.str.strip()

# PopTotal is handled as text containing commas; remove them and convert to float
df_population['PopTotal'] = df_population['PopTotal'].str.replace(',', '').astype(float)

# Standardize country names using a mapping dictionary.
# Each source name may appear only once: a repeated key in a dict literal
# silently keeps the last value, so 'United States of America' resolves to 'USA'.
# NOTE(review): the lung cancer and emissions frames are merged below with their
# own country spellings -- confirm their US rows are labelled 'USA', otherwise
# they drop out of the inner merges.
country_name_mapping = {
    'Russian Federation': 'Russia',
    'United States of America': 'USA',
    # Add other mappings if necessary
}

# Apply the mapping to the population dataset
df_population['Location'] = df_population['Location'].replace(country_name_mapping)

# Merge the lung cancer data with population data using country names and years
# (inner join: rows without a population match are dropped)
df_lung_merged = pd.merge(df_lung, df_population, left_on=['Country', 'Year'], right_on=['Location', 'Time'])
df_lung_merged['Lung Cancer Per Capita'] = df_lung_merged['Tracheal, bronchus, and lung cancer'] / df_lung_merged['PopTotal']

# Merge the emissions data with population data using country names and years
df_emissions_merged = pd.merge(df_emissions, df_population, left_on=['Country', 'Year'], right_on=['Location', 'Time'])
df_emissions_merged['Emissions Per Capita'] = df_emissions_merged['Total'] / df_emissions_merged['PopTotal']

# Average the per-capita values over all available years, per country and code
df_lung_per_capita = df_lung_merged.groupby(['Country', 'Code'])['Lung Cancer Per Capita'].mean().reset_index()
df_emissions_per_capita = df_emissions_merged.groupby(['Country', 'ISO 3166-1 alpha-3'])['Emissions Per Capita'].mean().reset_index()

# Give both results the same column layout (the emissions code column becomes 'Code')
df_lung_per_capita.columns = ['Country', 'Code', 'Lung Cancer Per Capita']
df_emissions_per_capita.columns = ['Country', 'Code', 'Emissions Per Capita']

# Save the cleaned data
output_file_lung = 'csv/df/lung_per_capita.csv'
output_file_emission = 'csv/df/emissions_per_capita.csv'
df_lung_per_capita.to_csv(output_file_lung, index=False)
df_emissions_per_capita.to_csv(output_file_emission, index=False)

In [73]:
# DataFrames for the India/USA plot

# Harmonise country spellings in the emissions and lung cancer frames.
country_name_mapping = {
    'Russian Federation': 'Russia',
    'United States of America': 'USA',
    'United States': 'USA',
    # Add other mappings if necessary
}
for frame in (df_emissions, df_lung):
    frame['Country'] = frame['Country'].replace(country_name_mapping)

# Restrict both frames to the years 1990-2012 (bounds included)
df_lung_filtered = df_lung[df_lung['Year'].between(1990, 2012)]
df_emissions_filtered = df_emissions[df_emissions['Year'].between(1990, 2012)]

# Join keys: country/year on the data side, location/time on the population side
data_keys = ['Country', 'Year']
population_keys = ['Location', 'Time']

# Attach population figures and derive lung cancer per capita
df_lung_merged = df_lung_filtered.merge(df_population, left_on=data_keys, right_on=population_keys)
df_lung_merged['Lung Cancer Per Capita'] = df_lung_merged['Tracheal, bronchus, and lung cancer'].div(df_lung_merged['PopTotal'])

# Attach population figures and derive emissions per capita
df_emissions_merged = df_emissions_filtered.merge(df_population, left_on=data_keys, right_on=population_keys)
df_emissions_merged['Emissions Per Capita'] = df_emissions_merged['Total'].div(df_emissions_merged['PopTotal'])

# Average per country, year and country code
lung_means = df_lung_merged.groupby(['Country', 'Year', 'Code'])['Lung Cancer Per Capita'].mean()
emissions_means = df_emissions_merged.groupby(['Country', 'Year', 'ISO 3166-1 alpha-3'])['Emissions Per Capita'].mean()

# The lung cancer result already has the desired column names; the emissions
# code column is renamed to 'Code' so both frames share their key columns.
df_lung_per_capita = lung_means.reset_index()
df_emissions_per_capita = emissions_means.reset_index().rename(columns={'ISO 3166-1 alpha-3': 'Code'})

# Combine both measures on country, year and code
df_combined = df_lung_per_capita.merge(df_emissions_per_capita, on=['Country', 'Year', 'Code'])

# No country filter is applied in this cell: the file holds every country that
# survives the merges, despite its name.
output_file_combined = 'csv/df/combined_usa_india.csv'
df_combined.to_csv(output_file_combined, index=False)

In [74]:
# DataFrames for the correlation plot (lung cancer versus emissions)

# Restrict all three frames to the years 2000-2012 (bounds included)
df_lung_filtered = df_lung[df_lung['Year'].between(2000, 2012)]
df_emissions_filtered = df_emissions[df_emissions['Year'].between(2000, 2012)]
df_population_filtered = df_population[df_population['Time'].between(2000, 2012)]

# Keep only the join keys and the one measure needed from each frame
df_lung_relevant = df_lung_filtered[['Country', 'Year', 'Tracheal, bronchus, and lung cancer']]
df_emissions_relevant = df_emissions_filtered[['Country', 'Year', 'Total']]
df_population_relevant = df_population_filtered[['Location', 'Time', 'PopTotal']]

# Join lung cancer with emissions, then attach the population figures
df_merged = df_lung_relevant.merge(df_emissions_relevant, on=['Country', 'Year'])
df_merged = df_merged.merge(df_population_relevant, left_on=['Country', 'Year'], right_on=['Location', 'Time'])

# Derive both per-capita measures
df_merged = df_merged.assign(
    LungCancerPerCapita=df_merged['Tracheal, bronchus, and lung cancer'].div(df_merged['PopTotal']),
    EmissionsPerCapita=df_merged['Total'].div(df_merged['PopTotal']),
)

# Keep the year and the two per-capita columns
df_numeric = df_merged[['Year', 'LungCancerPerCapita', 'EmissionsPerCapita']]

# One row per year: the mean of each per-capita measure over all merged rows
df_grouped = df_numeric.groupby('Year', as_index=False).mean()

# Save the cleaned data
output_file_grouped = 'csv/df/grouped_df.csv'
df_grouped.to_csv(output_file_grouped, index=False)

In [75]:
# Choropleth for smokers: average smokers per capita per country, 2000-2012

# The smoking frame loaded earlier is reused; the path below is not read in this cell.
smoking_file_path = 'csv/smoking.csv'
smoking_data = smoking_df

# The population table is read fresh from disk, so PopTotal is still raw text here.
population_file_path = 'csv/Population.csv'
population_data = pd.read_csv(population_file_path)

# Standardize country names so both tables use the same spelling
country_name_mapping = {
    "United States of America": "United States",
    'Russian Federation': 'Russia',
    # Add more mappings as needed
}
smoking_data['Country'] = smoking_data['Country'].replace(country_name_mapping)
population_data['Location'] = population_data['Location'].replace(country_name_mapping)

# Remove commas from PopTotal and convert it to float
population_data['PopTotal'] = population_data['PopTotal'].str.replace(',', '').astype(float)

# Average number of smokers per country over 2000-2012 (bounds included)
filtered_smoking_data = smoking_data[smoking_data['Year'].between(2000, 2012)]
avg_smokers = filtered_smoking_data.groupby('Country')['Data.Smokers.Total'].mean().reset_index()

# Average population per country over the same period; the key column is
# renamed to 'Country' so it lines up with the smoking averages
filtered_population_data = population_data[population_data['Time'].between(2000, 2012)]
avg_population = (
    filtered_population_data.groupby('Location')['PopTotal']
    .mean()
    .reset_index()
    .rename(columns={'Location': 'Country'})
)

# Inner join on country, then divide average smokers by average population
merged_data = avg_smokers.merge(avg_population, on='Country')
merged_data['Smokers_Per_Capita'] = merged_data['Data.Smokers.Total'].div(merged_data['PopTotal'])

# Save the cleaned data
output_file_merged = 'csv/df/merged_smokers_df.csv'
merged_data.to_csv(output_file_merged, index=False)

In [76]:
# Correlation plot: smoking versus lung cancer

# df_lung_relevant and df_population_relevant label the United States as 'USA',
# whereas smoking_data was normalised to 'United States' for the smokers
# choropleth. Merging on those mismatched names would drop any US rows from the
# inner joins, so the smoking rows are re-labelled here to the spelling used by
# the lung cancer and population frames.
lung_country_names = {
    'Russian Federation': 'Russia',
    'United States of America': 'USA',
    'United States': 'USA',
}

# Filter data to include only years 2000-2012. A copy is taken so that the
# re-labelling below does not alter smoking_data itself.
df_smoking_filtered = smoking_data[(smoking_data['Year'] >= 2000) & (smoking_data['Year'] <= 2012)].copy()
df_smoking_filtered['Country'] = df_smoking_filtered['Country'].replace(lung_country_names)

# Extract relevant columns
df_smoking_relevant = df_smoking_filtered[['Country', 'Year', 'Data.Smokers.Total']]

# Merge datasets on Country and Year (inner joins: unmatched rows are dropped)
df_merged = pd.merge(df_lung_relevant, df_smoking_relevant, on=['Country', 'Year'])
df_merged = pd.merge(df_merged, df_population_relevant, left_on=['Country', 'Year'], right_on=['Location', 'Time'])

# Calculate per capita values
df_merged['LungCancerPerCapita'] = df_merged['Tracheal, bronchus, and lung cancer'] / df_merged['PopTotal']
df_merged['SmokersPerCapita'] = df_merged['Data.Smokers.Total'] / df_merged['PopTotal']

# Keep the year and the two per-capita columns
df_numeric = df_merged[['Year', 'LungCancerPerCapita', 'SmokersPerCapita']]

# Group by Year and average the per capita values
df_grouped = df_numeric.groupby('Year').mean().reset_index()

# Save the cleaned data
output_file_grouped = 'csv/df/grouped_smokers_df.csv'
df_grouped.to_csv(output_file_grouped, index=False)