# Run the best model on the entire dataset

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [6]:
# Relative path of the CSV file that holds the dataset used in this notebook.
path = "lifeexpectancy_test_updated.csv"

# Parse the CSV into the DataFrame `df` that the following cells operate on.
df = pd.read_csv(filepath_or_buffer=path)

In [9]:
# Train a RandomForestRegressor on the loaded data and report hold-out metrics.
# Expects `df` to be the DataFrame produced by the data-loading cell.
# NOTE(review): the hyperparameters below are presumably the result of an
# earlier model-selection step that is not part of this cell -- confirm.

RANDOM_STATE = 42  # one seed for both the split and the forest (reproducibility)
TEST_SIZE = 0.2    # fraction of rows held out for evaluation

# `DataFrame.dropna()` returns a NEW frame and leaves `df` untouched, so the
# result must be bound to a name; a bare `df.dropna()` silently discards it.
df_clean = df.dropna()

# Separate features and target variable before normalization
X = df_clean.drop(['Age at death', 'mortality rate', 'mean household income'], axis=1)
y = df_clean['Age at death']

# Split BEFORE fitting the scaler so that the held-out rows cannot influence
# the min/max statistics learned during preprocessing (avoids data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Learn the per-feature min/max from the training rows only
scaler = MinMaxScaler()
scaler.fit(X_train_raw)

# Apply the train-fitted scaling to every row. Passing `index=X.index` keeps
# the row labels aligned with `y`; without it the frame would get a fresh
# 0..n-1 index that no longer matches `y` once `dropna()` has removed rows.
X_normalized = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)
X_train = X_normalized.loc[X_train_raw.index]
X_test = X_normalized.loc[X_test_raw.index]

# Initialize the best model - RandomForestRegressor
best_model = RandomForestRegressor(max_depth=None, n_estimators=200, random_state=RANDOM_STATE)

# Fit the best model on the training data
best_model.fit(X_train, y_train)

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Test MSE: {mse}')
print(f'Test R^2: {r2}')

Test MSE: 0.41863114861608136
Test R^2: 0.9950783031926469
