In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

In [None]:
# CONFIG
# File locations and the two column names every later cell relies on.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
TARGET_COL = "target"
ID_COL = "id"

# Submission switches. Nothing in this cell reads them; they are kept as
# module-level settings for the cells that build the submission.
MAKE_LABEL_SUBMISSION = True   # this will output predicted numeric values
MAKE_PROB_SUBMISSION = False   # regression doesn't need probability output

# LOAD DATA
# Read both CSVs in the same order as the paths are listed (train, then test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Quick structural overview: row/column counts, column names, and the number
# of missing values per training column.
for caption, frame in (("Train shape:", train_df), ("Test shape:", test_df)):
    print(caption, frame.shape)

print("\nColumns:", train_df.columns.tolist())

missing_per_column = train_df.isna().sum()
print("\nMissing values:\n", missing_per_column)

In [None]:
# EDA
# Distribution of the regression target (histogram with a KDE overlay).
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_df[TARGET_COL], kde=True, ax=ax)
ax.set_title("Target Distribution")
plt.show()

# Row-by-column map of missing cells in the training frame.
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(train_df.isna(), cbar=False, ax=ax)
ax.set_title("Missing Data")
plt.show()

# Detect types
# Every column except the target and the id is treated as a feature; features
# are then partitioned by dtype into numeric and non-numeric.
non_feature_cols = (TARGET_COL, ID_COL)
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

feature_frame = train_df[feature_cols]
num_cols = list(feature_frame.select_dtypes(include="number").columns)
cat_cols = list(feature_frame.select_dtypes(exclude="number").columns)

print("\nNumeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Boxplots for outliers
# One figure per numeric feature.
for col in num_cols:
    fig, ax = plt.subplots(figsize=(8, 3))
    sns.boxplot(x=train_df[col], ax=ax)
    ax.set_title(f"Box Plot: {col}")
    plt.show()

# Correlation Heatmap
# Only drawn when there are at least two numeric features to correlate.
if len(num_cols) > 1:
    corr_matrix = train_df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(corr_matrix, cmap="coolwarm", ax=ax)
    ax.set_title("Correlation Heatmap")
    plt.show()

In [None]:
# Cleaning
# Drop exact duplicate rows and rows that have no label to learn from.
train_df = train_df.drop_duplicates()
train_df = train_df.dropna(subset=[TARGET_COL])

# Train/Test Split
X = train_df.drop([TARGET_COL, ID_COL], axis=1)
y = train_df[TARGET_COL]

test_ids = test_df[ID_COL]
X_test = test_df.drop(ID_COL, axis=1)

# Align features: keep exactly the training columns, in training order.
# Columns that exist only in the test file are dropped; columns missing from
# the test file come back as all-NaN.
X, X_test = X.align(X_test, join="left", axis=1)

# Refresh column lists
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(exclude=["number"]).columns.tolist()

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _cap_outliers(frame, bounds):
    """Return a copy of ``frame`` with each bounded column clipped to its range.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame containing every column named in ``bounds``.
    bounds : dict
        Maps column name -> ``(lower, upper)`` clipping limits.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is left untouched. Clipped columns are float,
        and NaN cells stay NaN (``Series.clip`` does not fill them).
    """
    capped = frame.copy()
    for name, (lower, upper) in bounds.items():
        capped[name] = capped[name].astype(float).clip(lower=lower, upper=upper)
    return capped


# Outlier Capping (IQR)
# The limits are learned from the training partition only, so validation rows
# do not influence the preprocessing, and the *same* limits are then applied
# to every frame the model is fit or scored on (including the test set).
iqr_bounds = {}
for col in num_cols:
    q1, q3 = X_train[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    # A zero IQR (e.g. a mostly-constant or sparse indicator column) would give
    # lower == upper and flatten the whole column to one value; an undefined
    # IQR means the column has no observed values. Leave both cases uncapped.
    if pd.isna(iqr) or iqr <= 0:
        continue
    iqr_bounds[col] = (q1 - 1.5 * iqr, q3 + 1.5 * iqr)

X = _cap_outliers(X, iqr_bounds)
X_train = _cap_outliers(X_train, iqr_bounds)
X_val = _cap_outliers(X_val, iqr_bounds)
X_test = _cap_outliers(X_test, iqr_bounds)

# Preprocessing
# Numeric features: fill gaps with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Non-numeric features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps transform from raising on categories
# that were not present when the encoder was fit.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column list to its matching transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

# RandomForest + Tuning
rf = RandomForestRegressor(random_state=42)

# Preprocessing lives inside the pipeline so it is re-fit on every CV fold.
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("reg", rf),
    ]
)

# The "reg__" prefix addresses parameters of the pipeline step named "reg".
# 2 * 3 * 2 * 2 * 2 = 48 candidate settings, each evaluated with 3-fold CV.
param_grid = dict(
    reg__n_estimators=[200, 500],
    reg__max_depth=[None, 10, 20],
    reg__min_samples_split=[2, 5],
    reg__min_samples_leaf=[1, 2],
    reg__max_features=["sqrt", "log2"],
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

print("\nBest parameters:", grid_search.best_params_)

# Validation Metrics
# Score the tuned pipeline on the held-out validation partition.
y_pred = best_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from MSE so the two always agree
r2 = r2_score(y_val, y_pred)

# Same four lines as printing each metric separately, preceded by a blank line.
metric_report = {"MAE": mae, "MSE": mse, "RMSE": rmse, "R² Score": r2}
print("\n" + "\n".join(f"{label}: {value}" for label, value in metric_report.items()))

# Scatter plot
# Actual vs predicted target values for the validation rows.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=y_val, y=y_pred, ax=ax)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted")
plt.show()

# Train Final Model
# Refit the tuned pipeline on every labelled row (train + validation) before
# scoring the test set. This refits `best_model` in place, so the validation
# metrics must be computed BEFORE this point; re-running them afterwards
# would score a model that has already seen the validation rows.
best_model.fit(X, y)

# Predict on Test
test_pred = best_model.predict(X_test)

# Submission
# One row per test id with its predicted numeric target.
submission = pd.DataFrame({
    ID_COL: test_ids,
    TARGET_COL: test_pred
})

# Honour the config switch instead of always writing the file.
# MAKE_PROB_SUBMISSION is not consulted here: a regressor produces no
# class probabilities to export.
if MAKE_LABEL_SUBMISSION:
    submission.to_csv("submission.csv", index=False)
    print("\nSaved submission.csv")
else:
    print("\nMAKE_LABEL_SUBMISSION is False; submission.csv was not written")