In [1]:
import os.path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# for Q-Q plots
import scipy.stats as stats
# from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

from sklearn import metrics

from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV



from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.simplefilter('ignore')

In [2]:
ds = pd.read_csv('D:/3Kurs/1Sem/SS/rgr/Data/Life_Expectancy_Data_fixed_scaled.csv')
ds.head()

Unnamed: 0.1,Unnamed: 0,Year,Life_expectancy,Adult_Mortality,Alcohol,Percentage_expenditure,Hepatitis_B,Measles,BMI,Under_five_deaths,...,Thinness_five_nine_years,Income_composition_of_resources,Schooling,Region_Africa,Region_Asia,Region_Europe,Region_North America,Region_Oceania,Region_South America,Status_Developed
0,0,2015,65.0,263.0,0.01,0.052565,65.0,16,19.1,3.5,...,17.3,0.479,10.1,0,1,0,0,0,0,0
1,1,2014,59.9,271.0,0.01,0.05422,62.0,492,18.6,3.5,...,17.5,0.476,10.0,0,1,0,0,0,0,0
2,2,2013,59.9,268.0,0.01,0.053996,64.0,430,18.1,3.5,...,17.7,0.47,9.9,0,1,0,0,0,0,0
3,3,2012,59.5,272.0,0.01,0.057657,67.0,16,17.6,3.5,...,18.0,0.463,9.8,0,1,0,0,0,0,0
4,4,2011,59.2,275.0,0.01,0.005234,68.0,16,17.2,3.5,...,18.2,0.454,9.5,0,1,0,0,0,0,0


In [3]:
X = ds.drop(['Life_expectancy'], axis=1)
y= ds['Life_expectancy']

In [5]:
X.columns

Index(['Unnamed: 0', 'Year', 'Adult_Mortality', 'Alcohol',
       'Percentage_expenditure', 'Hepatitis_B', 'Measles', 'BMI',
       'Under_five_deaths', 'Polio', 'Total_expenditure', 'Diphtheria',
       'Incidents_HIV', 'GDP', 'Population', 'Thinness_ten_nineteen_years',
       'Thinness_five_nine_years', 'Income_composition_of_resources',
       'Schooling', 'Region_Africa', 'Region_Asia', 'Region_Europe',
       'Region_North America', 'Region_Oceania', 'Region_South America',
       'Status_Developed'],
      dtype='object')

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
XGBRegressor = xgb.XGBRegressor(random_state = 42)
XGBRegressor.fit(X_train, y_train)
y_pred = XGBRegressor.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)  # або np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Виводимо результати
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R-squared (R2):", r2)

Mean Absolute Error (MAE): 1.2642592125608212
Mean Squared Error (MSE): 5.142726391673626
Root Mean Squared Error (RMSE): 2.267758009945864
R-squared (R2): 0.9446906199096887


In [6]:
XGBRegressor.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [7]:
xgb_model = xgb.XGBRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],          
    'max_depth': [3, 5, 7, 9],                 
    'learning_rate': [0.01, 0.05, 0.1],      
    'subsample': [0.6, 0.8, 1.0],              
    'colsample_bytree': [0.6, 0.8, 1.0],       
}

random_search = RandomizedSearchCV(estimator=xgb_model,
                                     param_distributions=param_grid,
                                     scoring='neg_mean_squared_error',
                                     n_iter=100,  
                                     cv=5,
                                     verbose=1,
                                     random_state=42,
                                     n_jobs=-1)

random_search.fit(X_train, y_train)

best_params_random = random_search.best_params_
best_score_random = -random_search.best_score_

print("Best parameters (random reserch):", best_params_random)
print("Best mean squared error (MSE) (random reserch):", best_score_random)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters (random reserch): {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best mean squared error (MSE) (random reserch): 3.5184297859474696
