In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import numpy as np

# Load the dataset
data = pd.read_csv('TrainingData.csv')

# Convert 'Event Date' to a datetime object and extract relevant features (like month)
data['Event Date'] = pd.to_datetime(data['Event Date'], errors='coerce')
data['Event Month'] = data['Event Date'].dt.month

# Select features and target
features = ['Gender', 'Country', 'Distance', 'Event Month', 'Wind', 'Altitude', 'Age']
target = 'RaceTime'

# Preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Distance', 'Wind', 'Altitude', 'Age', 'Event Month']),
        ('cat', OneHotEncoder(), ['Gender', 'Country'])
    ])

# Split the data into training and testing sets
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to train
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
    ('Support Vector Regression', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor())
]

# Dictionary to store the scores
model_scores = {}

# Train and evaluate each model
for name, model in models:
    # Create a pipeline that transforms the data and then fits the model
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    
    # Train the model using cross-validation
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    
    # Mean R-squared score
    mean_cv_score = np.mean(cv_scores)
    model_scores[name] = mean_cv_score
    print(f'{name} cross-validated mean R-squared: {mean_cv_score:.4f}')
    
    # Train the pipeline on the full training data
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Test R-squared score
    test_r2_score = r2_score(y_test, y_pred)
    print(f'{name} test R-squared: {test_r2_score:.4f}')

# Find the best model based on cross-validated R-squared
best_model_name = max(model_scores, key=model_scores.get)
print(f'Best model based on cross-validation: {best_model_name}')


Linear Regression cross-validated mean R-squared: 0.9681
Linear Regression test R-squared: 0.9757
Ridge Regression cross-validated mean R-squared: 0.9681
Ridge Regression test R-squared: 0.9757
Random Forest cross-validated mean R-squared: 0.9961
Random Forest test R-squared: 0.9989
Gradient Boosting cross-validated mean R-squared: 0.9972
Gradient Boosting test R-squared: 0.9994


In [1]:
import pandas as pd

# Load the new dataset
new_data = pd.read_csv('TestData.csv')

# Convert 'Event Date' to datetime and extract the month, and calculate 'Age'
new_data['Event Date'] = pd.to_datetime(new_data['Event Date'], errors='coerce')
new_data['Event Month'] = new_data['Event Date'].dt.month
new_data['Age'] = 2023 - pd.to_datetime(new_data['Date of Birth']).dt.year  # Update the year accordingly

# Select the same features as used in the training
features_new = ['Gender', 'Country', 'Distance', 'Event Month', 'Wind', 'Altitude', 'Age']

# Extracting the features from the new data
X_new = new_data[features_new]

models_dict = {name: model for name, model in models}

# Predict with the best model from the previous step
best_model = models_dict[best_model_name]
print(best_model_name)

# Creating the best pipeline
best_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', best_model)])
best_pipeline.fit(X_train, y_train)  # Make sure this is trained on the entire training dataset

# Now, predict with the best model
predictions = best_pipeline.predict(X_new)

# Assign predictions to the 'X' column of the new_data DataFrame
new_data['X'] = predictions

# Save the modified DataFrame to a new CSV file
new_data.to_csv('UpdatedWithPredictions.csv', index=False)

print("Predictions have been saved to 'UpdatedWithPredictions.csv'")


NameError: name 'models' is not defined