In [32]:
import pandas as pd

In [33]:
# Read the sub-dataset CSV into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
csv_path = "../data/subdataset_Biro_Daniel.csv"
df = pd.read_csv(csv_path)

In [34]:
# Remove three columns so they are not used as features (presumably because
# they are mostly missing -- TODO confirm against the data).
# errors="ignore" keeps this cell idempotent: `df` is reassigned here, so
# re-running the cell without reloading the CSV would otherwise raise KeyError
# because the columns have already been removed.
df = df.drop(columns=["PoolQC", "Fence", "MiscFeature"], errors="ignore")

In [35]:
# Separate the prediction target from the features: `y` is the target column,
# `X` is every remaining column of `df`.
target = "SalePrice"
y = df[target]
X = df.drop(target, axis=1)

In [36]:
# Split the feature columns by dtype so each group gets its own preprocessing.
# The split is keyed on "number" rather than on "object": that way only truly
# numeric columns reach the median-imputer/scaler branch, and every other
# dtype (object, pandas string dtypes, category, bool) is treated as
# categorical instead of being misrouted as numeric.
numerical_columns = X.select_dtypes(include="number").columns
categorical_columns = X.select_dtypes(exclude="number").columns

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Numeric branch: fill missing values with the column median, then apply
# StandardScaler.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" lets the encoder transform
# categories that were not seen during fit instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its own branch.
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns),
])

# Full model: preprocessing followed by an ordinary least-squares regressor.
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])


In [38]:

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline on the training rows only, then predict the held-out
# rows. fit() returns the fitted pipeline, so the calls can be chained.
y_pred = pipe.fit(x_train, y_train).predict(x_test)


In [39]:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the held-out predictions with three regression metrics; each metric
# function is called as metric(y_true, y_pred).
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# One line per metric, in the same order and wording as before.
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    sep="\n",
)


Mean Absolute Error: 49584.63596890808
Mean Squared Error: 4656940638.920395
R^2 Score: 0.2795362973658534
