In [1]:
import pandas as pd

# Load the datasets
age_by_category = pd.read_csv('cleaned_age_by_category.csv')
death_by_smoking = pd.read_csv('cleaned_death_by_smoking.csv')
types_of_diseases = pd.read_csv('cleaned_types_of_diseases.csv')
gender_data = pd.read_csv('cleaned_gender.csv')

# Print column names to identify overlapping columns
print("Age by Category Columns:", age_by_category.columns)
print("Death by Smoking Columns:", death_by_smoking.columns)
print("Types of Diseases Columns:", types_of_diseases.columns)
print("Gender Data Columns:", gender_data.columns)

# Assuming 'Country' is the common column for merging
# Check for any overlapping column names that are not 'Country' and rename them
# Example: renaming columns in death_by_smoking if needed
death_by_smoking.rename(columns={'2021': 'Smoking_2021'}, inplace=True)

# Merging the datasets
from functools import reduce

# List of dataframes to merge
data_frames = [age_by_category, death_by_smoking, types_of_diseases, gender_data]

# Merge all dataframes on 'Country' column using an inner join
df_merged = reduce(lambda  left,right: pd.merge(left, right, on=['Country'],
                                            how='inner'), data_frames)

# Inspect the merged DataFrame
print(df_merged.head())


Age by Category Columns: Index(['Country/area', '1990', '2021.0', 'Absolute Change', 'Relative Change',
       '1990.1', '2021.0.1', 'Absolute Change.1', 'Relative Change.1',
       '1990.2', '2021.0.2', 'Absolute Change.2', 'Relative Change.2'],
      dtype='object')
Death by Smoking Columns: Index(['Albania', '137.5', '94.7', '-42.8', '-0.31'], dtype='object')
Types of Diseases Columns: Index(['Afghanistan', '25371', '1052', '7478', '296', '7168', '3125', '3144',
       '4372', '717', '2764', '25968', '8312', '6286', '13837', '30644',
       '12218', '2449', '588', '810', '687', '3931', '1077', '609', '4502',
       '1873', '2517', '11108', '14025'],
      dtype='object')
Gender Data Columns: Index(['Afghanistan', '0.394', '0.072', '39068978'], dtype='object')


KeyError: 'Country'

In [3]:
import pandas as pd
from functools import reduce

# Load the datasets assuming headers need to be defined for some
age_by_category = pd.read_csv('cleaned_age_by_category.csv')
death_by_smoking = pd.read_csv('cleaned_death_by_smoking.csv', header=None)

gender_data = pd.read_csv('cleaned_gender.csv', header=None)

# Assuming correct headers (verify and adjust according to your data)
death_by_smoking.columns = ['Country', 'Smoking_1990', 'Smoking_2021', 'Change', 'Percent_Change']

gender_data.columns = ['Country', 'Metric1', 'Metric2', 'Population']

# Rename 'Country/area' to 'Country' in age_by_category
age_by_category.rename(columns={'Country/area': 'Country'}, inplace=True)

# List of dataframes to merge
data_frames = [age_by_category, death_by_smoking, gender_data]

# Merge all dataframes on 'Country' column using an inner join
df_merged = reduce(lambda left, right: pd.merge(left, right, on=['Country'], how='inner'), data_frames)

# Inspect the merged DataFrame
print(df_merged.head())


     Country  1990  2021.0  Absolute Change  Relative Change  1990.1  \
0  Singapore   9.9     3.6             -6.3            -0.64   692.3   
1    Ireland  14.5     4.7             -9.8            -0.68  1702.7   
2      Malta  12.9     7.9             -5.0            -0.39   755.5   
3     Brunei  16.9    11.6             -5.3            -0.31  1373.8   
4    Burundi  21.0     6.4            -14.6            -0.70   542.3   

   2021.0.1  Absolute Change.1  Relative Change.1  1990.2  2021.0.2  \
0     187.4             -505.0              -0.73   224.4      48.6   
1     479.4            -1223.3              -0.72   416.1      91.1   
2     248.8             -506.6              -0.67   285.8     110.1   
3     484.8             -889.0              -0.65   347.7     123.4   
4     191.4             -350.9              -0.65   273.3      93.7   

   Absolute Change.2  Relative Change.2  Smoking_1990  Smoking_2021  Change  \
0             -175.8              -0.78          84.4        

In [4]:
# Save the merged DataFrame to a CSV file
df_merged.to_csv('merged_data.csv', index=False)


In [12]:
import pandas as pd
import numpy as np

# Define years and genders for the synthetic data
years = np.arange(1990, 2051, 10)
genders = ['Male', 'Female']

# Generate a DataFrame with all combinations of years and genders
synthetic_data = pd.DataFrame({
    'Year': np.tile(years, len(genders)),
    'Gender': np.repeat(genders, len(years))
})

# Adjusting data generation for smoking trends
# Declining trend for current smokers
initial_current_male = 40  # Starting percentage for males
initial_current_female = 20  # Starting percentage for females

# Slight increase or stable for former smokers
initial_former_male = 5
initial_former_female = 3

# Generate declining trend for current smokers and slight increase for former smokers
decline_rate_current = -2  # decline rate per decade
increase_rate_former = 0.5  # increase rate per decade for former smokers

synthetic_data['Current_Smokers_Percentage'] = np.where(
    synthetic_data['Gender'] == 'Male',
    initial_current_male + decline_rate_current * (synthetic_data['Year'] - 1990) / 10,
    initial_current_female + decline_rate_current * (synthetic_data['Year'] - 1990) / 10
)

synthetic_data['Former_Smokers_Percentage'] = np.where(
    synthetic_data['Gender'] == 'Male',
    initial_former_male + increase_rate_former * (synthetic_data['Year'] - 1990) / 10,
    initial_former_female + increase_rate_former * (synthetic_data['Year'] - 1990) / 10
)

# Calculate numbers assuming a population of 100 million
population = 100  # in millions
synthetic_data['Current_Smokers_Millions'] = (synthetic_data['Current_Smokers_Percentage'] / 100) * population
synthetic_data['Former_Smokers_Millions'] = (synthetic_data['Former_Smokers_Percentage'] / 100) * population

# Load your existing dataset
data_path = 'modified_merged_data1.csv'
original_data = pd.read_csv(data_path)

# Concatenate the synthetic data with your original data
combined_data = pd.concat([original_data, synthetic_data], ignore_index=True, sort=False)

# Save the updated dataset to a new CSV file
updated_csv_path = 'final_merged_data1.csv'
combined_data.to_csv(updated_csv_path, index=False)

print(f"Updated dataset saved to {updated_csv_path}")


Updated dataset saved to final_merged_data1.csv
