<a href="https://colab.research.google.com/github/RishabhXYZA/Kaggle-TSS-Hack-2/blob/main/Thapar_Summer_School_Kaggel_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Kaggle TSS Hack-2**
**(Evaluation on MAE)**

**Using Linear Regression**

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Read the competition training set from disk.
train_df = pd.read_csv("train.csv")

# Separate the regression target from the predictors. The two identifier
# columns and the target itself are excluded from the model inputs.
y = train_df.loc[:, 'yield']
X = train_df.drop(['id', 'Row#', 'yield'], axis=1)

# Ordinary least squares with default settings; fit() returns the estimator.
model = LinearRegression().fit(X, y)

# Score the model on the very rows it was fitted to, so the figure printed
# below is an in-sample (training) error.
y_pred = model.predict(X)
mae = mean_absolute_error(y, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")


Mean Absolute Error (MAE): 272.00


In [None]:
# Load the competition test rows and keep only the predictor columns.
# errors='ignore' lets the drop succeed even when a listed column is absent.
test_df = pd.read_csv("test.csv")
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Fit a fresh linear model. X and y are not defined in this cell: they must
# already exist in the kernel (the training cell above creates them).
model = LinearRegression().fit(X, y)

# Score the test rows.
test_predictions = model.predict(X_test)

# Pair every test id with its predicted yield and write the result without
# the DataFrame index.
submission_columns = {'id': test_df['id'], 'yield': test_predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)

print("Predictions saved to submission.csv")

Predictions saved to submission.csv


**Using Random Forest Regressor**

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Read both competition files up front (train first, then test).
train_df, test_df = pd.read_csv("train.csv"), pd.read_csv("test.csv")

# Target column and predictor matrix used for fitting; identifiers and the
# target are excluded from the inputs.
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)

# Predictor columns of the test rows; absent identifier columns are tolerated.
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# 100-tree random forest with a fixed random seed.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# In-sample error: the forest is scored on the rows it was fitted to.
train_predictions = model.predict(X_train)
mae = mean_absolute_error(y_train, train_predictions)
print(f"Mean Absolute Error on training set: {mae:.2f}")

# Score the test rows and write the id/yield pairs without the index column.
test_predictions = model.predict(X_test)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_predictions})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on training set: 95.15
Test predictions saved to submission.csv


**Using Linear Regression with polynomial features**

In [None]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Columns that identify a row rather than describe it.
non_feature_columns = ['id', 'Row#']

# Predictors / target for fitting, and the matching predictors for the test
# rows (the drop tolerates identifier columns missing from the test file).
X_train = train_df.drop(columns=non_feature_columns + ['yield'])
y_train = train_df['yield']
X_test = test_df.drop(columns=non_feature_columns, errors='ignore')

# Expand the predictors with all polynomial terms up to degree 3 and no
# constant column. The expansion is fitted on the training predictors and
# then applied unchanged to the test predictors.
poly = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Least-squares fit on the expanded feature matrix.
model = LinearRegression().fit(X_train_poly, y_train)

# In-sample error on the expanded training matrix.
train_preds = model.predict(X_train_poly)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (Polynomial Regression): {mae:.2f}")

# Predictions for the test rows, written as id/yield pairs without the index.
test_preds = model.predict(X_test_poly)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on training set (Polynomial Regression): 254.60
Test predictions saved to submission.csv


**Using Linear Regression with PCA and Outlier removal**

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from scipy.stats import zscore

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# ---------------------------
# Step 1: Outlier Removal using Z-score
# ---------------------------
train_clean = train_df.copy()
features = train_clean.drop(columns=['id', 'Row#', 'yield'])

# Absolute per-column z-scores of the training predictors.
z_scores = np.abs(zscore(features))

# Drop a row when at least one feature has |z| >= 3. This is written as
# "not any(|z| >= 3)" rather than "all(|z| < 3)": both keep the same rows for
# finite scores, but a zero-variance column produces NaN z-scores, NaN fails
# every comparison, and the "all(... < 3)" form would then discard the whole
# training set.
is_outlier = (z_scores >= 3).any(axis=1)
train_clean = train_clean[~is_outlier]

# ---------------------------
# Step 2: Prepare Data
# ---------------------------
X_train = train_clean.drop(columns=['id', 'Row#', 'yield'])
y_train = train_clean['yield']

# Test rows are never filtered; only the identifier columns are removed.
X_test = test_df.drop(columns=['id', 'Row#'], errors='ignore')

# Standardize the data before PCA. The scaler is fitted on the cleaned
# training predictors only and then applied to the test predictors.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# Step 3: Apply PCA
# ---------------------------
# Keep as many components as needed to retain 95% of the variance; the
# projection is fitted on the training matrix and reused for the test matrix.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# ---------------------------
# Step 4: Train Linear Regression and Calculate MAE
# ---------------------------
model = LinearRegression()
model.fit(X_train_pca, y_train)

# In-sample error on the cleaned, projected training rows.
train_preds = model.predict(X_train_pca)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on cleaned training set (with PCA): {mae:.2f}")


# Predict the test rows and write id/yield pairs without the index column.
test_preds = model.predict(X_test_pca)

submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})

submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on cleaned training set (with PCA): 308.61
Test predictions saved to submission.csv


**Using AdaBoost Regressor**

In [None]:
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Target, training predictors and test predictors (identifier columns are
# removed; the test-side drop tolerates missing columns).
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Weak learner boosted by AdaBoost: a regression tree limited to depth 4.
base_tree = DecisionTreeRegressor(max_depth=4)

# 100 boosting rounds at learning rate 0.5, seeded for repeatability.
model = AdaBoostRegressor(
    estimator=base_tree,
    random_state=42,
    learning_rate=0.5,
    n_estimators=100,
)
model.fit(X_train, y_train)

# In-sample error of the boosted ensemble.
train_preds = model.predict(X_train)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (AdaBoost): {mae:.2f}")

# Score the test rows and write id/yield pairs without the index column.
test_preds = model.predict(X_test)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")

Mean Absolute Error on training set (AdaBoost): 413.64
Test predictions saved to submission.csv


**Using Ridge Regression**

In [None]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Target, training predictors and test predictors.
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Put every feature on a common scale before applying the L2 penalty. The
# scaler is fitted on the training predictors only and reused for the test set.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge regression with penalty strength alpha=1.0.
ridge_params = {"alpha": 1.0, "random_state": 42}
model = Ridge(**ridge_params)
model.fit(X_train_scaled, y_train)

# In-sample error on the scaled training matrix.
train_preds = model.predict(X_train_scaled)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (Ridge): {mae:.2f}")

# Score the scaled test rows and write id/yield pairs without the index.
test_preds = model.predict(X_test_scaled)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on training set (Ridge): 272.01
Test predictions saved to submission.csv


**Using LGBM Regressor Model**

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Target, training predictors and test predictors.
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Gradient-boosting settings: 100 trees, learning rate 0.1, no depth cap
# (max_depth=-1), fixed seed.
lgbm_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=-1,
    random_state=42,
)
model = LGBMRegressor(**lgbm_params)
model.fit(X_train, y_train)

# In-sample error of the boosted model.
train_preds = model.predict(X_train)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (LightGBM): {mae:.2f}")

# Score the test rows and write id/yield pairs without the index column.
test_preds = model.predict(X_test)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000970 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 850
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 16
[LightGBM] [Info] Start training from score 6007.246244
Mean Absolute Error on training set (LightGBM): 227.03
Test predictions saved to submission.csv


**Using Decision Tree Model**

In [None]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Prepare training and test data
X_train = train_df.drop(columns=['id', 'Row#', 'yield'])
y_train = train_df['yield']
X_test = test_df.drop(columns=['id', 'Row#'], errors='ignore')

# Decision tree with no depth limit and a fixed seed.
model = DecisionTreeRegressor(max_depth=None, random_state=42)

# An unrestricted tree can reproduce its training targets exactly, so the
# training MAE printed further down is ~0 and says nothing about how the tree
# generalises. Estimate the out-of-sample error with shuffled 5-fold
# cross-validation first. cross_val_score fits clones of `model`, leaving
# `model` itself unfitted, and returns negated MAE values, hence the sign flip.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_mae = -cross_val_score(
    model,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    cv=cv,
)

# Fit the final tree on every training row.
model.fit(X_train, y_train)

# Predict on training set and calculate MAE (in-sample, optimistic)
train_preds = model.predict(X_train)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (Decision Tree): {mae:.2f}")
print(f"Cross-validated MAE (Decision Tree, 5-fold): {cv_mae.mean():.2f} +/- {cv_mae.std():.2f}")

# Predict on test set
test_preds = model.predict(X_test)

# Save predictions to CSV (id/yield pairs, no index column)
submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on training set (Decision Tree): 0.00
Test predictions saved to submission.csv


**Using KNN model**

In [None]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Target, training predictors and test predictors.
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Neighbour distances depend on feature scale, so standardise first. The
# scaler is fitted on the training predictors and reused for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# k-nearest-neighbours regressor with k = 5.
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train_scaled, y_train)

# In-sample error on the scaled training matrix.
train_preds = model.predict(X_train_scaled)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (KNN): {mae:.2f}")

# Score the scaled test rows and write id/yield pairs without the index.
test_preds = model.predict(X_test_scaled)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on training set (KNN): 270.11
Test predictions saved to submission.csv


**Using Random Forest Regressor with PCA and Outlier removal**

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from scipy.stats import zscore

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# -----------------------------
# Step 1: Remove Outliers using Z-score
# -----------------------------
train_clean = train_df.copy()
X_raw = train_clean.drop(columns=['id', 'Row#', 'yield'])

# Absolute per-column z-scores of the training predictors.
z_scores = np.abs(zscore(X_raw))

# Drop rows where at least one feature has |z| >= 3. Phrasing the filter as
# "not any(|z| >= 3)" instead of "all(|z| < 3)" keeps the same rows for finite
# scores, and also keeps them when a zero-variance column yields NaN z-scores.
# NaN fails every comparison, so the "all(... < 3)" form would drop every row.
is_outlier = (z_scores >= 3).any(axis=1)
train_clean = train_clean[~is_outlier]

# -----------------------------
# Step 2: Prepare Features
# -----------------------------
X_train = train_clean.drop(columns=['id', 'Row#', 'yield'])
y_train = train_clean['yield']
# Test rows are not filtered; only the identifier columns are removed.
X_test = test_df.drop(columns=['id', 'Row#'], errors='ignore')

# -----------------------------
# Step 3: Scale and Apply PCA
# -----------------------------
# Both transforms are fitted on the cleaned training predictors and then
# applied unchanged to the test predictors.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep enough principal components to retain 95% of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)


# 100-tree random forest with a fixed seed, trained on the PCA projection.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_pca, y_train)

# In-sample error on the cleaned, projected training rows.
train_preds = model.predict(X_train_pca)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on cleaned training set (RF + PCA): {mae:.2f}")

# Predict the test rows and write id/yield pairs without the index column.
test_preds = model.predict(X_test_pca)

submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on cleaned training set (RF + PCA): 117.81
Test predictions saved to submission.csv


**Using Orthogonal Matching Pursuit**

In [None]:
import pandas as pd
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Target, training predictors and test predictors.
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Standardise the features; the scaler is fitted on the training predictors
# and reused unchanged for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Orthogonal Matching Pursuit with library-default settings.
model = OrthogonalMatchingPursuit()
model.fit(X_train_scaled, y_train)

# In-sample error on the scaled training matrix.
train_preds = model.predict(X_train_scaled)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (OMP): {mae:.2f}")

# Score the scaled test rows and write id/yield pairs without the index.
test_preds = model.predict(X_test_scaled)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


Mean Absolute Error on training set (OMP): 282.48
Test predictions saved to submission.csv


**Using LGBM with tuned hyperparameters**

In [None]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Load datasets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Prepare features and target
X_train = train_df.drop(columns=['id', 'Row#', 'yield'])
y_train = train_df['yield']
X_test = test_df.drop(columns=['id', 'Row#'], errors='ignore')

# Features are passed to LightGBM unscaled.

# --------------------------
# Define and train LGBM model with tuned hyperparameters
# --------------------------
# subsample (row bagging) only takes effect when subsample_freq is non-zero.
# With LightGBM's default subsample_freq=0 bagging is disabled and
# subsample=0.8 is silently ignored. subsample_freq=1 re-samples the rows at
# every boosting iteration, so the 0.8 fraction is actually applied.
model = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(X_train, y_train)

# --------------------------
# Evaluate on training data (in-sample error)
# --------------------------
train_preds = model.predict(X_train)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error on training set (LGBM with hyperparams): {mae:.2f}")

# --------------------------
# Predict on test set
# --------------------------
test_preds = model.predict(X_test)

# Save predictions as id/yield pairs without the index column
submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})
submission.to_csv("submission.csv", index=False)
print("Test predictions saved to submission.csv")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001282 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 850
[LightGBM] [Info] Number of data points in the train set: 15000, number of used features: 16
[LightGBM] [Info] Start training from score 6007.246244
Mean Absolute Error on training set (LGBM with hyperparams): 231.72
Test predictions saved to submission.csv


**Using LGBM with PCA and Outlier Removal**

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error
from scipy.stats import zscore

# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Step 1: Remove Outliers using Z-score
X_raw = train_df.drop(columns=['id', 'Row#', 'yield'])
z_scores = np.abs(zscore(X_raw))

# Drop rows where at least one feature has |z| >= 3. For finite scores this
# keeps exactly the rows with every |z| < 3. Unlike an "all(|z| < 3)" test it
# does not throw away the whole training set when a zero-variance column
# produces NaN z-scores (NaN fails every comparison).
is_outlier = (z_scores >= 3).any(axis=1)
train_clean = train_df[~is_outlier]

# Step 2: Prepare Features
X_train = train_clean.drop(columns=['id', 'Row#', 'yield'])
y_train = train_clean['yield']
# Test rows are not filtered; only the identifier columns are removed.
X_test = test_df.drop(columns=['id', 'Row#'], errors='ignore')

# Step 3: Scale and Apply PCA
# Both transforms are fitted on the cleaned training predictors and then
# applied unchanged to the test predictors.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep enough principal components to retain 95% of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Step 4: Train LGBM and Calculate MAE
model = LGBMRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42
)
model.fit(X_train_pca, y_train)

# In-sample error on the cleaned, projected training rows.
train_preds = model.predict(X_train_pca)
mae = mean_absolute_error(y_train, train_preds)
print(f"MAE on training set with PCA and outlier removal: {mae:.2f}")

# Step 5: Predict and Save (id/yield pairs, no index column)
test_preds = model.predict(X_test_pca)
submission = pd.DataFrame({
    'id': test_df['id'],
    'yield': test_preds
})
submission.to_csv("submission_lgbm_pca_outliers.csv", index=False)
print("Predictions saved to submission_lgbm_pca_outliers.csv")




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001781 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 14860, number of used features: 6
[LightGBM] [Info] Start training from score 6029.541639




MAE on training set with PCA and outlier removal: 265.45




Predictions saved to submission_lgbm_pca_outliers.csv


**Using Bayesian Ridge Model**

In [None]:
import pandas as pd
from sklearn.linear_model import BayesianRidge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Read the training and test files.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Target, training predictors and test predictors.
y_train = train_df['yield']
X_train = train_df.drop(['id', 'Row#', 'yield'], axis=1)
X_test = test_df.drop(['id', 'Row#'], axis=1, errors='ignore')

# Standardise the features; the scaler is fitted on the training predictors
# and reused unchanged for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Bayesian ridge regression with library-default priors.
model = BayesianRidge()
model.fit(X_train_scaled, y_train)

# In-sample error on the scaled training matrix.
train_preds = model.predict(X_train_scaled)
mae = mean_absolute_error(y_train, train_preds)
print(f"Mean Absolute Error (Bayesian Ridge): {mae:.2f}")

# Score the scaled test rows and write id/yield pairs without the index.
test_preds = model.predict(X_test_scaled)
submission = pd.DataFrame({'id': test_df['id'], 'yield': test_preds})
submission.to_csv("submission_bayesian_ridge.csv", index=False)
print("Predictions saved to submission_bayesian_ridge.csv")


Mean Absolute Error (Bayesian Ridge): 272.01
Predictions saved to submission_bayesian_ridge.csv


**Using All Models with PCA and Outlier Removal**

In [None]:
# Re-run the model evaluation pipeline for the re-uploaded files

# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor, BayesianRidge
from sklearn.linear_model import OrthogonalMatchingPursuit, LassoLars
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from scipy.stats import zscore

# Load data
train_df = pd.read_csv("train.csv")

# Drop identifiers
X = train_df.drop(columns=['id', 'Row#', 'yield'])
y = train_df['yield']

# Outlier removal using Z-score: drop rows where some feature has |z| >= 3.
# Written as "not any(|z| >= 3)" so that NaN z-scores (produced for a
# zero-variance column) do not cause every row to be discarded. For finite
# scores this keeps exactly the rows whose |z| values are all below 3.
z_scores = np.abs(zscore(X))
keep_rows = ~(z_scores >= 3).any(axis=1)

# Apply the same boolean mask to predictors and target so the two stay aligned
# regardless of how the frame is indexed.
X_clean = X[keep_rows]
y_clean = y[keep_rows]

# Model dictionary. Estimators that accept a seed are given random_state=42
# where the notebook already seeds them; the decision tree is seeded as well
# so its row in the results table is reproducible between runs.
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "BayesianRidge": BayesianRidge(),
    "HuberRegressor": HuberRegressor(),
    "OrthogonalMatchingPursuit": OrthogonalMatchingPursuit(),
    "LassoLars": LassoLars(),
    "DecisionTree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "LGBM": LGBMRegressor(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
}

# Evaluate models: each one is wrapped in the same scale -> PCA (95% variance)
# pipeline, fitted on the cleaned training rows and scored on those same rows,
# so every MAE in the table is an in-sample figure.
results = []
for name, model in models.items():
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=0.95, random_state=42)),
        ("model", model)
    ])
    pipeline.fit(X_clean, y_clean)
    preds = pipeline.predict(X_clean)
    mae = mean_absolute_error(y_clean, preds)
    results.append({"Model": name, "MAE (Train)": round(mae, 2)})

# Create results table, best (lowest) training MAE first
results_df = pd.DataFrame(results).sort_values(by="MAE (Train)").reset_index(drop=True)
results_df




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001475 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1530
[LightGBM] [Info] Number of data points in the train set: 14860, number of used features: 6
[LightGBM] [Info] Start training from score 6029.541639




Unnamed: 0,Model,MAE (Train)
0,RandomForest,117.81
1,LGBM,265.45
2,DecisionTree,266.72
3,KNN,273.87
4,HuberRegressor,305.87
5,BayesianRidge,308.61
6,LinearRegression,308.61
7,Ridge,308.61
8,Lasso,308.61
9,LassoLars,308.67
