In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer


In [3]:
# Load the athletes dataset.
# The path is written as a raw string: "\_" and "\A" are not valid escape
# sequences, so a plain literal triggers an invalid-escape warning on recent
# Python versions. The resulting path is unchanged.
athletes = pd.read_csv(r"D:\_____PROJECT____\Athletes.csv")

# Drop rows with a missing value in any column
athletes = athletes.dropna()

# Extract relevant features and target variable
X = athletes[['First Half', 'Second Half', 'Gender']]
y = athletes['Age']

# Split BEFORE fitting any preprocessing so that scaling statistics and the
# encoder's categories are learned from the training rows only (no leakage of
# test-set information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['First Half', 'Second Half']),
        # handle_unknown='ignore': a category seen only in the test rows is
        # encoded as all zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Gender'])
    ],
    # Always return a dense array; later cells pass the result to a
    # mean-centering StandardScaler, which does not accept sparse input.
    sparse_threshold=0
)

# Fit on the training rows only, then apply the fitted transform to the test rows
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept for backward compatibility with any code that still reads X_processed;
# it is the full feature matrix transformed with the train-fitted preprocessor.
X_processed = preprocessor.transform(X)

In [4]:
# Standardise every column of the already-preprocessed matrices.
# Note: the numeric columns were standardised once by the ColumnTransformer
# earlier in the notebook; this second pass also standardises the one-hot columns.
# The scaler learns its statistics from the training matrix only and the same
# fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(split) for split in (X_train, X_test))

In [5]:
# Candidate regressors and the hyperparameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid}.
# Entry order is preserved because ties in the later model selection are
# resolved by iteration order.
models = {
    # Linear models: only the regularisation strength is tuned
    'Ridge': dict(model=Ridge(), params=dict(alpha=[0.1, 1.0, 10.0])),
    'Lasso': dict(model=Lasso(), params=dict(alpha=[0.1, 1.0, 10.0])),
    # Bagged trees: ensemble size and tree depth
    'RandomForest': dict(
        model=RandomForestRegressor(random_state=42),
        params=dict(n_estimators=[100, 200], max_depth=[None, 10, 20]),
    ),
    # Boosted trees: ensemble size, shrinkage and tree depth
    'GradientBoosting': dict(
        model=GradientBoostingRegressor(random_state=42),
        params=dict(n_estimators=[100, 200], learning_rate=[0.01, 0.1], max_depth=[3, 5]),
    ),
    'XGBoost': dict(
        model=XGBRegressor(random_state=42),
        params=dict(n_estimators=[100, 200], learning_rate=[0.01, 0.1], max_depth=[3, 5]),
    ),
}

In [6]:
# Run a 5-fold grid search per candidate and record the winner of each search.
# Scoring is negative MAE (higher is better for scikit-learn), so the sign is
# flipped when storing 'best_score' to get a plain MAE where lower is better.
best_models = {}
for model_name, spec in models.items():
    grid_search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        scoring='neg_mean_absolute_error',
        cv=5,
    ).fit(X_train_scaled, y_train)

    best_models[model_name] = dict(
        best_estimator=grid_search.best_estimator_,
        best_score=-grid_search.best_score_,
        best_params=grid_search.best_params_,
    )


In [7]:
# Select the best model based on the lowest cross-validated MAE
best_model_name = min(best_models, key=lambda k: best_models[k]['best_score'])
best_model = best_models[best_model_name]['best_estimator']

# No explicit re-fit is needed: GridSearchCV was run with its default
# refit=True, so best_estimator_ has already been re-trained on the whole
# training set using the best hyperparameters.

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE directly; the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)
r2_percentage = r2 * 100




In [8]:

# Report which model won the search and how it performs on the held-out test set
print(f"Best Model: {best_model_name}")
print("\nBest Model Evaluation on Test Set:")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2:.2f}")
print(f"R-squared as a percentage: {r2_percentage:.2f}%")

# Display a few predictions next to the true values.
# y_test is a pandas Series whose index is whatever survived dropna/shuffling,
# so the first five rows are taken by position with .iloc; this lines them up
# with y_pred[:5] and avoids label-vs-position ambiguity of plain [] slicing.
print(f"Predicted ages: {y_pred[:5]}")
print(f"Actual ages: {y_test.iloc[:5].to_numpy()}")

Best Model: XGBoost

Best Model Evaluation on Test Set:
Mean Absolute Error (MAE): 8.69876953672172
Mean Squared Error (MSE): 121.20885561145607
Root Mean Squared Error (RMSE): 11.009489343809552
R-squared (R²): 0.25
R-squared as a percentage: 25.27%
Predicted ages: [37.692677 38.923027 50.23438  35.978615 46.246834]
Actual ages: [39 28 54 43 45]


  print(f"Actual ages: {y_test[:5].values}")
