In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [2]:
# Read the competition's train and test CSV files into DataFrames
# (train first, then test, same as reading them one after the other).
train_data, test_data = map(
    pd.read_csv,
    (
        "/kaggle/input/playground-series-s4e5/train.csv",
        "/kaggle/input/playground-series-s4e5/test.csv",
    ),
)

In [3]:
# Regression target: the 'FloodProbability' column of the training table
y_train = train_data['FloodProbability']

# Predictors: every training column except the row identifier and the target
X_train = train_data.drop(['id', 'FloodProbability'], axis=1)

# Test predictors: only the row identifier is dropped from the test table
X_test = test_data.drop(['id'], axis=1)

In [4]:
# Standardise the features. The scaler's statistics are learned from the
# training features only and then reused, unchanged, on the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# Fit a 5-component Gaussian Mixture on the scaled training features.
# random_state is fixed so the fitted components are reproducible.
gmm_model = GaussianMixture(n_components=5, random_state=42).fit(X_train_scaled)
gmm_model  # bare expression so the cell still displays the fitted estimator

In [6]:
# Per-row probabilities over the GMM components for the training data.
# These are soft memberships, not hard labels: later cells take
# argmax(axis=1) of this array to get one component id per row.
train_cluster_probs = gmm_model.predict_proba(X_train_scaled)


In [7]:
# Fit one Linear Regression per GMM component, using the training rows whose
# most probable component is that one. regression_models[i] belongs to
# component i; later cells pair models with component ids via enumerate(), so
# the list must always contain exactly n_components fitted models.
train_cluster_labels = train_cluster_probs.argmax(axis=1)  # hard labels, computed once instead of per component
regression_models = []
for i in range(gmm_model.n_components):
    cluster_indices = np.flatnonzero(train_cluster_labels == i)
    regression_model = LinearRegression()
    if cluster_indices.size > 0:
        cluster_X_train = X_train_scaled[cluster_indices]
        cluster_y_train = y_train.iloc[cluster_indices]
        regression_model.fit(cluster_X_train, cluster_y_train)
    else:
        # No training row was hard-assigned to this component. Fitting on zero
        # rows raises, and skipping the append would shift every later model
        # to the wrong component id, so fall back to a fit on all training rows.
        regression_model.fit(X_train_scaled, y_train)
    regression_models.append(regression_model)

In [8]:
# Predict on the test set: each row is routed to the regressor of the GMM
# component it is assigned to, and the per-component predictions are written
# back into one array in the original test-row order.
test_cluster_labels = gmm_model.predict(X_test_scaled)  # one pass over the test set, not one per component
y_test_pred_gmm = np.zeros(len(X_test_scaled))
for i, regression_model in enumerate(regression_models):
    cluster_indices = np.flatnonzero(test_cluster_labels == i)
    if cluster_indices.size == 0:
        # No test row falls in this component; predict() rejects a zero-row
        # input, and there is nothing to fill in anyway.
        continue
    cluster_X_test = X_test_scaled[cluster_indices]
    y_test_pred_gmm[cluster_indices] = regression_model.predict(cluster_X_test)

In [9]:
# In-sample check: score every training row with the regressor of the
# component it was hard-assigned to, then report the mean squared error.
# This is measured on the same rows the regressors were fitted on, so it is
# an optimistic figure rather than a held-out validation score.
train_cluster_labels = train_cluster_probs.argmax(axis=1)  # computed once instead of per component
y_train_pred_gmm = np.zeros(len(X_train_scaled))
for i, regression_model in enumerate(regression_models):
    cluster_indices = np.flatnonzero(train_cluster_labels == i)
    if cluster_indices.size == 0:
        # predict() rejects a zero-row input; no rows to score for this component
        continue
    cluster_X_train = X_train_scaled[cluster_indices]
    y_train_pred_gmm[cluster_indices] = regression_model.predict(cluster_X_train)

gmm_train_mse = mean_squared_error(y_train, y_train_pred_gmm)
print(f'GMM Train MSE: {gmm_train_mse}')

GMM Train MSE: 0.0004036251138017552


In [10]:
# Build the two-column submission table (test ids alongside their predicted
# values) and write it to disk without the DataFrame's row index.
submission = pd.DataFrame(
    data={'id': test_data['id'], 'FloodProbability': y_test_pred_gmm},
)
submission.to_csv(path_or_buf='submission.csv', index=False)

print("Submission file saved to 'submission.csv'.")

Submission file saved to 'submission.csv'.
