In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import median_absolute_error


In [2]:
# Read the competition CSVs: training rows, test rows, and the sample submission
train_data, test_data, submission_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s3e25/train.csv',
        '/kaggle/input/playground-series-s3e25/test.csv',
        '/kaggle/input/playground-series-s3e25/sample_submission.csv',
    )
)


In [3]:
# Data Preprocessing
# Remove the 'id' column from both the training and the test frame.
columns_to_drop = ['id']
# errors='ignore' keeps this cell idempotent: it rebinds train_data/test_data,
# so running it a second time would otherwise raise KeyError ('id' is already gone).
train_data = train_data.drop(columns=columns_to_drop, errors='ignore')
test_data = test_data.drop(columns=columns_to_drop, errors='ignore')

In [4]:
# Split the training frame into the target vector y and the feature matrix X
target_column = 'Hardness'
y = train_data[target_column]
X = train_data.drop(columns=[target_column])


In [5]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Model Selection and Training with Hyperparameter Tuning
# Candidate values for the randomized search (2 values x 4 parameters = 16 combinations).
param_dist = {
    'n_estimators': [800, 1000],
    'learning_rate': [0.01, 0.05],
    'max_depth': [5, 7],
    'min_child_weight': [1, 3]
}

random_search = RandomizedSearchCV(
    XGBRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=5,  # Only 5 of the 16 combinations are sampled; adjust as needed
    cv=3,
    scoring='neg_median_absolute_error',  # negated so that "higher is better"
    n_jobs=-1,
    # Seed the parameter sampler as well: without it, a different set of 5
    # combinations is drawn on every run, so the selected model and the
    # reported validation score are not reproducible.
    random_state=42
)

# The search cross-validates on the training split only; X_val / y_val are not used here.
random_search.fit(X_train, y_train)

# refit is left at its default (True), so best_estimator_ is the winning
# configuration re-trained on the whole of X_train.
best_model = random_search.best_estimator_

In [7]:
# Score the tuned model on the held-out validation split
val_predictions = best_model.predict(X_val)
medae = median_absolute_error(
    y_true=y_val,
    y_pred=val_predictions,
)
print(f'Median Absolute Error on Validation Set: {medae}')

Median Absolute Error on Validation Set: 0.6611325502395631


In [8]:
# Run the tuned model over the test-set features
test_predictions = best_model.predict(X=test_data)

In [9]:
# Assemble the submission: ids come from the sample file, predictions from the model
submission_columns = {
    'id': submission_sample['id'],
    'Hardness': test_predictions,
}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv('improved_submission.csv', index=False)

In [10]:
# Show the first five rows of the submission as a sanity check
print(submission_df.head(5))

      id  Hardness
0  10407  2.725334
1  10408  2.662307
2  10409  5.669130
3  10410  3.670359
4  10411  5.304950
