In [1]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load Data
train_data = pd.read_csv("/kaggle/input/widsdatathon2024-challenge2/train.csv")
test_data = pd.read_csv("/kaggle/input/widsdatathon2024-challenge2/test.csv")

In [3]:
# Drop columns with more than 50% missing values
train_data = train_data.dropna(thresh=len(train_data)*0.5, axis=1)
test_data = test_data.dropna(thresh=len(test_data)*0.5, axis=1)



In [4]:
# Drop the target variable from the train data
X = train_data.drop(columns=['metastatic_diagnosis_period'])  
y = train_data['metastatic_diagnosis_period']

# Split data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:
# Preprocessing Pipeline
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])



In [6]:
# Models
models = [
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor()),
    ('gb', GradientBoostingRegressor()),
    ('svr', SVR()),
    ('et', ExtraTreesRegressor())
]

In [7]:
# Voting Regressor
voting_regressor = VotingRegressor(models)

In [8]:

# Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', voting_regressor)])

# Fit the model
pipeline.fit(X_train, y_train)

In [9]:
# Evaluate the model on validation data
val_predictions = pipeline.predict(X_val)
val_rmse = mean_squared_error(y_val, val_predictions, squared=False)
print("Validation RMSE:", val_rmse)



Validation RMSE: 85.20771183661101


In [10]:
# Predict on the test set
test_predictions = pipeline.predict(test_data)



In [11]:
# Prepare submission file
submission = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'metastatic_diagnosis_period': test_predictions
})

# Save submission file
submission.to_csv('voting_regressor_submission.csv', index=False)