In [259]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, make_scorer


In [426]:
# cleaning data
dataset = "test"
df = pd.read_csv(f'data/{dataset}.csv', index_col='Id')
df['YearsSince2006'] = df['YrSold'] + ((df['MoSold'] - 1) / 12) - 2006
df=df.drop(['GarageType',	'GarageYrBlt',	'GarageFinish', 'GarageQual',	'GarageCond', 'MasVnrType','BsmtFinType1','BsmtFinSF1',
            'BsmtFinSF2','BsmtFinType2','YrSold','MoSold'],axis=1)
df['LotFrontage'] = df['LotFrontage'].fillna(0)
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['MSSubClass'] = df['MSSubClass'].astype(str)
df[df.select_dtypes(include=['float']).columns]=df[df.select_dtypes(include=['float']).columns].fillna(0)
df[df.select_dtypes(include='int').columns] = df.select_dtypes(include='int').astype(float)
print(df.isnull().sum().sum())  # Check if there are any null values left 
print(len(df.select_dtypes(include='object').columns.tolist()))
print(df.dtypes.unique().tolist())
print(df['GarageCars'].unique())
print(df['GarageCars'].dtype)
df.to_csv(f'data/{dataset}_clean.csv', index=False)
del df

6260
37
[dtype('O'), dtype('float64')]
[1. 2. 3. 0. 4. 5.]
float64


In [222]:
#Linear model


In [427]:
df = pd.read_csv('data/train_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)
X_predict = pd.read_csv('data/test_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)

In [430]:
object_columns_train = df.select_dtypes(include='object').columns.tolist()
print(len(object_columns_train))
object_columns_test = X_predict.select_dtypes(include='object').columns.tolist()
print(len(object_columns_test))
obj_to_float_list=X_predict[list(set(object_columns_test) - set(object_columns_train))].dtypes
print(X_predict['GarageCars'].unique())
print(X_predict.isnull().sum().sum())  # Check if there are any null values left

37
37
[1. 2. 3. 0. 4. 5.]
0


In [432]:
# reading the cleaned data
df = pd.read_csv('data/train_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)
X_predict = pd.read_csv('data/test_clean.csv',dtype={'MSSubClass': 'object'},keep_default_na=False)

# explanatory and response variables
X = df.drop(['SalePrice'], axis=1)
y = df['SalePrice']
#groups = df["Neighborhood"]  # the grouping variable

categorical_cols = X.select_dtypes(include=["object", "string"]).columns.tolist()
numeric_cols = X.select_dtypes(include=["float64"]).columns.tolist()
# scaling and one-hot encoding
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop="first")
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])
#("regressor", LinearRegression())
def mse_log(y_true, y_pred):
    return mean_squared_error(np.log(y_true), np.log(y_pred))
mse_log_scorer = make_scorer(mse_log, greater_is_better=False)
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", ElasticNet(random_state=42))
])


# Define hyperparameter grid to search over
param_grid = {
    "regressor__alpha": [0.01, 0.1, 1.0, 10],    # Regularization strength
    "regressor__l1_ratio": [0.1, 0.5, 0.9],      # Mix between L1 and L2 penalty
    "regressor__max_iter": [1000, 5000]           # Max iterations for convergence
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,               # 5-fold cross-validation
    scoring=mse_log_scorer,       # Use R² score for evaluation
    n_jobs=-1,          # Use all CPU cores
    verbose=2           # Print progress
)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [433]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.4s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.1, regressor__max_iter=5000;



[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.4s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=1000; total time=   0.5s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.6s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=1000; total time=   0.5s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=1000; total



[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s




[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=1000; total time=   0.2s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=1000; total time=   0.3s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   0.2s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=1000; total time=   0.3s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=1000; total time=   0.3s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   0.3s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   0.2s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=1000; total tim



[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=0.1, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   0.3s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.5, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.9, regressor__max_iter=1000; total tim



[CV] END regressor__alpha=1.0, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.1, regressor__max_iter=5000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s
[CV] END regressor__alpha=10, regressor__l1_ratio=0.5, regressor__max_iter=1000; total time=   0.1s



[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   1.5s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   1.5s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   1.7s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   1.8s
[CV] END regressor__alpha=0.01, regressor__l1_ratio=0.9, regressor__max_iter=5000; total time=   1.8s




0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'regressor__alpha': [0.01, 0.1, ...], 'regressor__l1_ratio': [0.1, 0.5, ...], 'regressor__max_iter': [1000, 5000]}"
,scoring,make_scorer(m...hod='predict')
,n_jobs,-1
,refit,True
,cv,5
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,1.0
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,42


In [434]:
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)
test_score = grid_search.score(X_test, y_test)
print("Test set score:", test_score)
y_test_pred = grid_search.predict(X_test)
print("Test R²:", r2_score(y_test, y_test_pred))

Best parameters found: {'regressor__alpha': 1.0, 'regressor__l1_ratio': 0.5, 'regressor__max_iter': 1000}
Best cross-validation score: -0.02153808765366849
Test set score: -0.03397980210878481
Test R²: 0.7474892282367749




In [442]:
y_predict=grid_search.predict(X_predict)
print(y_predict)
ids=np.array(range(1461, 1461 + len(y_predict)))
df_result = pd.DataFrame({
    "Id": ids,            # your Id vector
    "SalePrice": y_predict  # your predicted values
})
df_result.to_csv('submission.csv', index=False)

[118549.6316675  168908.13368011 181620.60883696 ... 187801.05709886
 109950.84754135 230097.77714717]




In [441]:
df_result.to_csv('submission.csv', index=False)