In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import tensorflow as tf
import tensorflow_probability as tfp


In [2]:
# Load the train and test datasets
train_data = pd.read_csv('/kaggle/input/playground-series-s3e25/train.csv')
test_data = pd.read_csv('/kaggle/input/playground-series-s3e25/test.csv')

In [3]:
# Feature selection
features = ['allelectrons_Total', 'density_Total', 'allelectrons_Average',
            'val_e_Average', 'atomicweight_Average', 'ionenergy_Average',
            'el_neg_chi_Average', 'R_vdw_element_Average', 'R_cov_element_Average',
            'zaratio_Average', 'density_Average']

In [4]:
# Separate the features and the target variable
X = train_data[features]
y = train_data['Hardness']


In [5]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [7]:
# RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=300, max_depth=15, min_samples_leaf=2, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_val_predictions = rf_model.predict(X_val_scaled)
rf_mae = mean_absolute_error(y_val, rf_val_predictions)
print(f"RandomForest MAE on Validation Set: {rf_mae}")

RandomForest MAE on Validation Set: 0.9123868098952751


In [8]:
# LGBMRegressor
lgb_model = LGBMRegressor()
lgb_model.fit(X_train_scaled, y_train)
lgb_val_predictions = lgb_model.predict(X_val_scaled)
lgb_mae = mean_absolute_error(y_val, lgb_val_predictions)
print(f"LGBM MAE on Validation Set: {lgb_mae}")

LGBM MAE on Validation Set: 0.8986148497362075


In [9]:
# CatBoostRegressor
catboost_model = CatBoostRegressor()
catboost_model.fit(X_train_scaled, y_train)
catboost_val_predictions = catboost_model.predict(X_val_scaled)
catboost_mae = mean_absolute_error(y_val, catboost_val_predictions)
print(f"CatBoost MAE on Validation Set: {catboost_mae}")


Learning rate set to 0.057227
0:	learn: 1.6577228	total: 58.9ms	remaining: 58.8s
1:	learn: 1.6298259	total: 62.3ms	remaining: 31.1s
2:	learn: 1.6028105	total: 65.5ms	remaining: 21.8s
3:	learn: 1.5778444	total: 68.8ms	remaining: 17.1s
4:	learn: 1.5552126	total: 72.2ms	remaining: 14.4s
5:	learn: 1.5346130	total: 75.6ms	remaining: 12.5s
6:	learn: 1.5151060	total: 78.9ms	remaining: 11.2s
7:	learn: 1.4969527	total: 82.3ms	remaining: 10.2s
8:	learn: 1.4809247	total: 86.6ms	remaining: 9.54s
9:	learn: 1.4661157	total: 90.9ms	remaining: 9s
10:	learn: 1.4519091	total: 94.4ms	remaining: 8.48s
11:	learn: 1.4386087	total: 97.8ms	remaining: 8.05s
12:	learn: 1.4285161	total: 101ms	remaining: 7.67s
13:	learn: 1.4174654	total: 104ms	remaining: 7.34s
14:	learn: 1.4068424	total: 107ms	remaining: 7.05s
15:	learn: 1.3970505	total: 110ms	remaining: 6.79s
16:	learn: 1.3889331	total: 117ms	remaining: 6.78s
17:	learn: 1.3809595	total: 121ms	remaining: 6.62s
18:	learn: 1.3733801	total: 125ms	remaining: 6.47s
19

In [10]:
# Ensemble using VotingRegressor
ensemble_model = VotingRegressor([('RandomForest', rf_model), ('LGBM', lgb_model), ('CatBoost', catboost_model)])
ensemble_model.fit(X_train_scaled, y_train)
ensemble_val_predictions = ensemble_model.predict(X_val_scaled)
ensemble_mae = mean_absolute_error(y_val, ensemble_val_predictions)
print(f"Ensemble Model MAE on Validation Set: {ensemble_mae}")

Learning rate set to 0.057227
0:	learn: 1.6577228	total: 5.86ms	remaining: 5.86s
1:	learn: 1.6298259	total: 10.4ms	remaining: 5.18s
2:	learn: 1.6028105	total: 13.6ms	remaining: 4.51s
3:	learn: 1.5778444	total: 17ms	remaining: 4.23s
4:	learn: 1.5552126	total: 20.3ms	remaining: 4.05s
5:	learn: 1.5346130	total: 24.5ms	remaining: 4.06s
6:	learn: 1.5151060	total: 32.5ms	remaining: 4.61s
7:	learn: 1.4969527	total: 38.9ms	remaining: 4.83s
8:	learn: 1.4809247	total: 42.7ms	remaining: 4.7s
9:	learn: 1.4661157	total: 46ms	remaining: 4.55s
10:	learn: 1.4519091	total: 49.1ms	remaining: 4.42s
11:	learn: 1.4386087	total: 52.4ms	remaining: 4.31s
12:	learn: 1.4285161	total: 55.7ms	remaining: 4.23s
13:	learn: 1.4174654	total: 59ms	remaining: 4.16s
14:	learn: 1.4068424	total: 62.3ms	remaining: 4.09s
15:	learn: 1.3970505	total: 65.6ms	remaining: 4.04s
16:	learn: 1.3889331	total: 68.8ms	remaining: 3.98s
17:	learn: 1.3809595	total: 72ms	remaining: 3.93s
18:	learn: 1.3733801	total: 75.3ms	remaining: 3.89s
1

In [11]:
# Make predictions on the test set using the ensemble model
test_data_scaled = scaler.transform(test_data[features])
test_predictions = ensemble_model.predict(test_data_scaled)

In [12]:
# Create the submission dataframe
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': test_predictions})

# Save the submission dataframe to a CSV file
submission_df.to_csv('submission_improved.csv', index=False)