In [10]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer


In [11]:
# cleaning data
dataset = "test"
df = pd.read_csv(f'data/{dataset}.csv', index_col='Id')
df['YearsSince2006'] = df['YrSold'] + ((df['MoSold'] - 1) / 12) - 2006
df=df.drop(['GarageType',	'GarageYrBlt',	'GarageFinish', 'GarageQual',	'GarageCond', 'MasVnrType','BsmtFinType1','BsmtFinSF1',
            'BsmtFinSF2','BsmtFinType2','YrSold','MoSold'],axis=1)
df['LotFrontage'] = df['LotFrontage'].fillna(0)
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['MSSubClass'] = df['MSSubClass'].astype(str)
df[df.select_dtypes(include=['float']).columns]=df[df.select_dtypes(include=['float']).columns].fillna(0)
df[df.select_dtypes(include='int').columns] = df.select_dtypes(include='int').astype(float)
print(df.isnull().sum().sum())  # Check if there are any null values left 
print(len(df.select_dtypes(include='object').columns.tolist()))
print(df.dtypes.unique().tolist())
print(df['GarageCars'].unique())
print(df['GarageCars'].dtype)
df.to_csv(f'data/{dataset}_clean.csv', index=False)
del df

6260
37
[dtype('O'), dtype('float64')]
[1. 2. 3. 0. 4. 5.]
float64


In [12]:
#Linear model


In [13]:
df = pd.read_csv('data/train_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)
X_predict = pd.read_csv('data/test_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)

In [14]:
object_columns_train = df.select_dtypes(include='object').columns.tolist()
print(len(object_columns_train))
object_columns_test = X_predict.select_dtypes(include='object').columns.tolist()
print(len(object_columns_test))
obj_to_float_list=X_predict[list(set(object_columns_test) - set(object_columns_train))].dtypes
print(X_predict['GarageCars'].unique())
print(X_predict.isnull().sum().sum())  # Check if there are any null values left

37
37
[1. 2. 3. 0. 4. 5.]
0


In [15]:
#elastic net
# reading the cleaned data
df = pd.read_csv('data/train_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)
X_predict = pd.read_csv('data/test_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)

# explanatory and response variables
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']
#groups = df["Neighborhood"]  # the grouping variable

categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["float64"]).columns.tolist()
# scaling and one-hot encoding
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop="first")
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])
#("regressor", LinearRegression())
def mse_log(y_true, y_pred):
    return mean_squared_error(np.log(y_true), np.log(y_pred))
mse_log_scorer = make_scorer(mse_log, greater_is_better=False)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", ElasticNet(random_state=42))
])


# Define hyperparameter grid to search over
param_grid = {
    "regressor__alpha": [0.01, 0.1, 1.0, 10],    # Regularization strength
    "regressor__l1_ratio": [0.1, 0.5, 0.9],      # Mix between L1 and L2 penalty
    "regressor__max_iter": [1000, 5000]           # Max iterations for convergence
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,               # 5-fold cross-validation
    scoring=mse_log_scorer,       # Use R² score for evaluation
    n_jobs=-1,          # Use all CPU cores
    verbose=2           # Print progress
)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
#XGBoost
from xgboost import XGBRegressor

df = pd.read_csv('data/train_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)
X_predict = pd.read_csv('data/test_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)

# explanatory and response variables
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']
#groups = df["Neighborhood"]  # the grouping variable

categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["float64"]).columns.tolist()
# scaling and one-hot encoding
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop="first")
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])
#("regressor", LinearRegression())
def mse_log(y_true, y_pred):
    return mean_squared_error(np.log(y_true), np.log(y_pred))
mse_log_scorer = make_scorer(mse_log, greater_is_better=False)

xgb_model = XGBRegressor(
    n_estimators=500,        # Number of boosting rounds (trees)
    max_depth=6,             # Max depth of each tree (controls model complexity)
    learning_rate=0.05,      # Step size shrinkage used in update to prevent overfitting
    subsample=0.8,           # Fraction of samples to use for each tree (helps generalization)
    colsample_bytree=0.8,    # Fraction of features used per tree
    reg_alpha=0.1,           # L1 regularization term on weights
    reg_lambda=1.0,          # L2 regularization term on weights
    random_state=42,         # Ensures reproducible results
    n_jobs=-1,               # Use all CPU cores for training
    verbosity=2              # Controls level of messages during training
)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", xgb_model)
])


# Define hyperparameter grid to search over
param_grid = {
    "regressor__n_estimators": [100, 300],
    "regressor__max_depth": [3, 6],
    "regressor__learning_rate": [0.001, 0.01, 0.1],
    "regressor__reg_alpha": [0.01, 0.1, 1.0],  # L1 regularization
    "regressor__reg_lambda": [0.01, 0.1, 1.0]  # L2 regularization
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,               # 5-fold cross-validation
    scoring=mse_log_scorer,       # Use R² score for evaluation
    n_jobs=-1,          # Use all CPU cores
    verbose=2           # Print progress
)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.preprocessing._encoders")
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[21:54:03] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (876, 226, 51852).
[21:54:03] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (876, 223, 51016).
[CV] END regressor__colsample_bytree=0.8, regressor__learning_rate=0.001, regressor__max_depth=3, regressor__n_estimators=100, regressor__reg_alpha=0.01, regressor__reg_lambda=0.01, regressor__subsample=0.8; total time=   0.9s
[21:54:03] INFO: /workspace/src/data/iterative_dmatrix.cc:53: Finished constructing the `IterativeDMatrix`: (876, 225, 51874).
[CV] END regressor__colsample_bytree=0.8, regressor__learning_rate=0.001, regressor__max_depth=3, regressor__n_estimators=100, regressor__reg_alpha=0.01, regressor__reg_lambda=0.01, regressor__subsample=1.0; total time=   0.8s
[CV] END regressor__colsample_bytree=0.8, regressor__learning_rate=0.001, regressor__max_depth=3, regressor__n_estimators=100, regressor__reg_alpha=0.01, regressor__

KeyboardInterrupt: 

In [None]:
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
test_score = grid_search.score(X_test, y_test)
print("Test set score:", test_score)
y_test_pred = grid_search.predict(X_test)
print("Test R²:", r2_score(y_test, y_test_pred))



Best parameters found: {'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.5, 'regressor__max_iter': 1000}
Best cross-validation score: -0.02390061206937285
Test set score: -0.0217329024084346


Test R²: 0.8858763381169877




In [None]:
y_predict=grid_search.predict(X_predict)
print(y_predict)
ids=np.array(range(1461, 1461 + len(y_predict)))
df_result = pd.DataFrame({
    "Id": ids,            # your Id vector
    "SalePrice": y_predict  # your predicted values
})
df_result.to_csv('submission.csv', index=False)

[104726.13851925 148326.30355727 173209.02832914 ... 175656.65574782
 102512.79604457 234504.7986698 ]




In [None]:
df_result.to_csv('submission.csv', index=False)