In [None]:

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Load the raw housing table from the notebook's working directory.
df = pd.read_csv("housing.csv")

# Ratio features derived row by row from the raw count columns.
# A zero denominator would turn the ratio into +/-inf, which the imputer and
# scaler used further down refuse to fit on. Masking zeros to NaN lets the
# median imputer fill those rows instead. Rows with non-zero denominators
# are computed exactly as before.
households_nonzero = df["households"].where(df["households"] != 0)
total_rooms_nonzero = df["total_rooms"].where(df["total_rooms"] != 0)

df["rooms_per_household"] = df["total_rooms"] / households_nonzero
df["bedrooms_per_room"] = df["total_bedrooms"] / total_rooms_nonzero
df["population_per_household"] = df["population"] / households_nonzero


# Columns routed to each preprocessing branch.
cat_features = ["ocean_proximity"]
num_features = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
    "rooms_per_household",
    "bedrooms_per_room",
    "population_per_household",
]

# Numeric branch: fill missing values with the column median, then standardize.
numeric = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored, not an error.
categorical = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Apply each branch to its own column subset and concatenate the results.
preprossesor = ColumnTransformer(
    transformers=[
        ("num", numeric, num_features),
        ("cat", categorical, cat_features),
    ]
)

# Regression target and feature matrix (numeric columns followed by categorical).
y = df["median_house_value"]
X = df[num_features + cat_features]

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline model: preprocessing and linear regression fitted as one estimator,
# so imputation and scaling statistics come from the training rows only.
clf = Pipeline(
    steps=[
        ("preprossesor", preprossesor),
        ("model", LinearRegression()),
    ]
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report the hold-out metrics, one per line.
for label, metric in (
    ("mean_squared_error:", mean_squared_error),
    ("mean_absolute_error:", mean_absolute_error),
    ("r2_score:", r2_score),
):
    print(label, metric(y_test, y_pred))










mean_squared_error: 4778547424.025956
mean_absolute_error: 49645.49244453642
r2_score: 0.635339233523819


In [None]:
# Candidate regressors to compare; each is wrapped with the shared preprocessing.
candidates = {
    "linreg": LinearRegression(),
    "tree": DecisionTreeRegressor(random_state=42),
    "rf": RandomForestRegressor(n_estimators=200, random_state=42)
}

# Compare the candidates with 5-fold cross-validated R^2 on the TRAINING split only.
# Scoring on the full X, y would let the hold-out test rows take part in model
# selection. It would also build the folds from contiguous slices of the file in
# its original order, since an integer cv does not shuffle for regressors.
# X_train was already shuffled by train_test_split, so its folds are mixed.
for name, base_model in candidates.items():
    model = Pipeline([("prep", preprossesor), ("model", base_model)])
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="r2")
    print(name, scores.mean(), "±", scores.std())

linreg 0.572558432796427 ± 0.0724588226502527


