In [1]:
import pandas as pd
import numpy as np


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor

# Load your dataset
df = pd.read_csv("./train.csv")

# Drop columns with too many missing values
missing_threshold = 0.3
df = df.drop(columns=df.columns[df.isnull().mean() > missing_threshold])

# Keep only important features
important_features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'LotArea',
    '1stFlrSF', 'TotRmsAbvGrd', 'MasVnrArea', 'BsmtFinSF1',
    'Neighborhood', 'ExterQual', 'KitchenQual', 'GarageType', 'GarageFinish',
    'SalePrice'
]
df = df[[col for col in important_features if col in df.columns]]

# Label encode categorical columns
label_cols = df.select_dtypes(include='object').columns
for col in label_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Split features and target
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost
model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae:.2f}")


Mean Absolute Error: 17840.49


In [4]:
# Retry with faster training settings
model_fast = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
model_fast.fit(X_train, y_train)

# Predict and evaluate
y_pred_fast = model_fast.predict(X_test)
mae_fast = mean_absolute_error(y_test, y_pred_fast)
mae_fast


17840.491411601026

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Load data
df = pd.read_csv("./train.csv")

# Drop columns with too many missing values
df = df.drop(columns=df.columns[df.isnull().mean() > 0.3])

# Select relevant columns
features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'LotArea',
    '1stFlrSF', 'TotRmsAbvGrd', 'MasVnrArea', 'BsmtFinSF1',
    'Neighborhood', 'ExterQual', 'KitchenQual', 'GarageType', 'GarageFinish',
    'SalePrice'
]
df = df[[col for col in features if col in df.columns]]

# Encode categorical features
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Split into features and target
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

# Normalize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build model
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.1),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(128, activation='relu'),
    layers.Dense(1)
])

# Compile model
model.compile(optimizer='adam', loss='mae')

# Early stopping
early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=200,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0
)

# Predict and evaluate
y_pred = model.predict(X_test).flatten()
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE with TensorFlow: {mae:.2f}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
MAE with TensorFlow: 59549.66


In [1]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

# Load data
df = pd.read_csv("./train.csv")

# Drop columns with >30% missing values
df = df.drop(columns=df.columns[df.isnull().mean() > 0.3])

# Select important features
features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'LotArea',
    '1stFlrSF', 'TotRmsAbvGrd', 'MasVnrArea', 'BsmtFinSF1',
    'Neighborhood', 'ExterQual', 'KitchenQual', 'GarageType', 'GarageFinish',
    'SalePrice'
]
df = df[[col for col in features if col in df.columns]]

# Encode categorical variables
for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Split into features and target
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train CatBoost model
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=3,
    verbose=100,
    random_state=0
)
model.fit(X_train, y_train)


# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE using CatBoost: {mae:.2f}")


0:	learn: 74955.4669979	total: 140ms	remaining: 2m 20s
100:	learn: 26619.7213692	total: 213ms	remaining: 1.89s
200:	learn: 21600.6195556	total: 273ms	remaining: 1.08s
300:	learn: 19224.6348216	total: 331ms	remaining: 769ms
400:	learn: 17800.9779039	total: 392ms	remaining: 586ms
500:	learn: 16748.5601501	total: 452ms	remaining: 451ms
600:	learn: 15841.7439678	total: 522ms	remaining: 347ms
700:	learn: 15038.9391300	total: 593ms	remaining: 253ms
800:	learn: 14394.7101044	total: 655ms	remaining: 163ms
900:	learn: 13820.6543679	total: 712ms	remaining: 78.3ms
999:	learn: 13314.3778143	total: 767ms	remaining: 0us
MAE using CatBoost: 16533.41


In [None]:
from sklearn.metrics import r2_score

r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2:.4f}")               
print(f"Accuracy: {r2 * 100:.2f}%")        


R² Score: 0.9154
Accuracy: 91.54%


In [6]:
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Load data
df = pd.read_csv("./train.csv")

# Drop columns with too many missing values
df = df.drop(columns=df.columns[df.isnull().mean() > 0.3])

# Select important features
features = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF',
    'FullBath', 'YearBuilt', 'YearRemodAdd', 'Fireplaces', 'LotArea',
    '1stFlrSF', 'TotRmsAbvGrd', 'MasVnrArea', 'BsmtFinSF1',
    'Neighborhood', 'ExterQual', 'KitchenQual', 'GarageType', 'GarageFinish',
    'SalePrice'
]
df = df[[col for col in features if col in df.columns]]

# Encode categorical features using LabelEncoder
cat_cols = df.select_dtypes(include='object').columns
for col in cat_cols:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Split data into features and target
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert certain features to category dtype for LightGBM
categorical_features = ['Neighborhood', 'ExterQual', 'KitchenQual', 'GarageType', 'GarageFinish']
for col in categorical_features:
    if col in X_train.columns:
        X_train[col] = X_train[col].astype('category')
        X_test[col] = X_test[col].astype('category')

# Define LightGBM model with tuned parameters
model = LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.01,
    max_depth=12,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Fit model with early stopping and evaluation logging
model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='mae',
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100)
    ]
)

# Predict and evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output results
print(f"MAE: {mae:.2f}")
print(f"R² Score (Accuracy): {r2 * 100:.2f}%")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1927
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 19
[LightGBM] [Info] Start training from score 181441.541952
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 29570.1	valid_0's l2: 2.32327e+09
[200]	valid_0's l1: 20004.1	valid_0's l2: 1.25436e+09
[300]	valid_0's l1: 17864.6	valid_0's l2: 1.00849e+09
[400]	valid_0's l1: 17460.9	valid_0's l2: 9.34286e+08
[500]	valid_0's l1: 17385.2	valid_0's l2: 9.17192e+08
[600]	valid_0's l1: 17325.7	valid_0's l2: 9.07772e+08
[700]	valid_0's l1: 17376.2	valid_0's l2: 9.09543e+08
Early stopping, best iteration is:
[608]	valid_0's l1: 17317.1	valid_0's l2: 9.0639e+08
MAE: 17317.09
R² Score (Accuracy): 88.18%
