In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Assuming a regression model
from sklearn.metrics import mean_squared_error
import numpy as np

# Read the merged dataset from disk into a DataFrame
csv_path = 'merged_data.csv'
data = pd.read_csv(csv_path)

# Echo the column index so the disease and smoking column names can be checked by eye
print(data.columns)

# Identify all columns that are disease metrics (names starting with 'Disease').
# The isinstance check skips non-string labels, which have no .startswith().
disease_columns = [
    col for col in data.columns
    if isinstance(col, str) and col.startswith('Disease')
]

# Fail early with a clear message instead of silently building the index from
# smoking alone (no disease columns) or surfacing a bare KeyError later.
if not disease_columns:
    raise ValueError("No columns starting with 'Disease' were found in data")
if 'Smoking_2021' not in data.columns:
    raise KeyError("Required column 'Smoking_2021' is missing from data")

# Calculate the Health Index
weighting_factor_smoking = 1.5  # Arbitrary demonstration weight; smoking is assumed more impactful
number_of_factors = len(disease_columns) + 1  # Number of disease metrics plus one for smoking
# NOTE(review): the divisor counts factors, not weights, so this is not a true
# weighted mean (that would divide by len(disease_columns) + weighting_factor_smoking).
# Left as-is to keep existing results; confirm which is intended.

# Row-wise total of the disease columns. pandas skips NaN in this sum by default,
# so a missing disease value contributes 0, whereas a missing Smoking_2021
# value makes the whole Health_Index NaN for that row.
disease_total = data[disease_columns].sum(axis=1)
weighted_smoking = data['Smoking_2021'] * weighting_factor_smoking

# Health Index = (disease total + weighted smoking) / number of factors
data['Health_Index'] = (disease_total + weighted_smoking) / number_of_factors

# Assemble the model inputs: every disease column plus the 2021 smoking column.
# NOTE(review): Health_Index is computed from exactly these columns, so the
# target is a deterministic function of the features -- confirm this is intended.
feature_columns = [*disease_columns, 'Smoking_2021']
X = data[feature_columns]
y = data['Health_Index']  # regression target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Build a seeded random-forest regressor and fit it on the training split.
# fit() returns the estimator itself, so construction and training can be chained.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out rows and score the predictions
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)

# MSE is in squared target units and hard to judge on its own; the root is in
# the same units as Health_Index, so report it alongside.
rmse = float(np.sqrt(mse))

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')


Index(['Country', '1990', '2021.0', 'Absolute Change', 'Relative Change',
       '1990.1', '2021.0.1', 'Absolute Change.1', 'Relative Change.1',
       '1990.2', '2021.0.2', 'Absolute Change.2', 'Relative Change.2',
       'Smoking_1990', 'Smoking_2021', 'Change', 'Percent_Change', 'Disease1',
       'Disease2', 'Disease3', 'Disease4', 'Disease5', 'Disease6', 'Disease7',
       'Disease8', 'Disease9', 'Disease10', 'Disease11', 'Disease12',
       'Disease13', 'Disease14', 'Disease15', 'Disease16', 'Disease17',
       'Disease18', 'Disease19', 'Disease20', 'Disease21', 'Disease22',
       'Disease23', 'Disease24', 'Disease25', 'Disease26', 'Disease27',
       'Disease28', 'Metric1', 'Metric2', 'Population'],
      dtype='object')
Mean Squared Error: 6832347.060235931


In [3]:
# List every column name in the DataFrame.
# Relies on `data` having been loaded by another cell before this one runs.
column_labels = data.columns
print(column_labels)


Index(['Country', '1990', '2021.0', 'Absolute Change', 'Relative Change',
       '1990.1', '2021.0.1', 'Absolute Change.1', 'Relative Change.1',
       '1990.2', '2021.0.2', 'Absolute Change.2', 'Relative Change.2',
       'Smoking_1990', 'Smoking_2021', 'Change', 'Percent_Change', 'Disease1',
       'Disease2', 'Disease3', 'Disease4', 'Disease5', 'Disease6', 'Disease7',
       'Disease8', 'Disease9', 'Disease10', 'Disease11', 'Disease12',
       'Disease13', 'Disease14', 'Disease15', 'Disease16', 'Disease17',
       'Disease18', 'Disease19', 'Disease20', 'Disease21', 'Disease22',
       'Disease23', 'Disease24', 'Disease25', 'Disease26', 'Disease27',
       'Disease28', 'Metric1', 'Metric2', 'Population'],
      dtype='object')
