In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection._search import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor

In [2]:
def scale_X(df):
    """Standardize every column of ``df`` with a freshly fitted ``StandardScaler``.

    The scaler is fitted on ``df`` itself, so the centering/scaling statistics
    always come from the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix whose columns can be converted to floats.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the scaled values, with the same column labels
        and the same row index as ``df``.
    """
    # fit_transform performs the fit and the single transform that is needed;
    # the previous version transformed twice and discarded the first result.
    scaled_values = StandardScaler().fit_transform(df)
    # Keep the caller's row index so the result stays aligned with any target
    # Series that was sliced from the same source frame.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)


def get_X_y(data_dir="houses_train.csv", cols_order=None, to_numpy=False):
    """Load a houses CSV and return the scaled feature matrix and price target.

    Steps: read the CSV, print its ``info()`` summary, split off ``price`` as
    the target, drop ``url``, overwrite ``"Unnamed: 0"`` with the constant 1,
    one-hot encode the remaining columns, optionally align the columns to
    ``cols_order``, and standardize with ``scale_X``.

    Parameters
    ----------
    data_dir : str
        Path of the CSV file to read (despite the name, a file path).
    cols_order : sequence of str, optional
        Column labels the result must have, in this order. Columns missing
        from the encoded frame are added filled with zeros; columns not listed
        are dropped. When ``None`` the encoded columns are returned as-is.
    to_numpy : bool
        When true, return NumPy arrays instead of pandas objects.

    Returns
    -------
    (X, y)
        ``X`` is the scaled feature matrix, ``y`` the ``price`` column.

    Notes
    -----
    NOTE(review): ``scale_X`` fits a new scaler on whatever frame it is given,
    so a test file loaded through this function is standardized with its own
    statistics rather than the training statistics. Reusing the training
    scaler would require returning it from here; left unchanged to keep the
    return shape stable.
    """
    df = pd.read_csv(data_dir)
    df.info()  # prints the schema summary to stdout

    y = df.price
    df = df.drop(["url", "price"], axis=1)
    # Every value of this column is replaced by 1. A direct assignment does
    # that in one step and, unlike a chained ``replace(..., inplace=True)`` on
    # the selected column, is guaranteed to modify ``df`` itself.
    df["Unnamed: 0"] = 1
    X = pd.get_dummies(df)

    if cols_order is not None:
        # One reindex adds the absent columns (as zeros), drops the extra ones
        # and applies the requested order, without the per-column inserts that
        # fragment the frame.
        X = X.reindex(columns=list(cols_order), fill_value=0.0)

    X = scale_X(X)

    if to_numpy:
        X = X.to_numpy()
        y = y.to_numpy()

    return X, y


# Load the training data as pandas objects first so the column labels can be
# remembered, then switch to NumPy arrays and shuffle the rows reproducibly.
X_frame, y_series = get_X_y()
cols_order = X_frame.columns

X, y = shuffle(X_frame.to_numpy(), y_series.to_numpy(), random_state=78)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5001 entries, 0 to 5000
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      5001 non-null   int64  
 1   price           5001 non-null   float64
 2   condition       5001 non-null   object 
 3   district        5001 non-null   object 
 4   max_floor       5001 non-null   int64  
 5   street          5001 non-null   object 
 6   num_rooms       5001 non-null   int64  
 7   region          5001 non-null   object 
 8   area            5001 non-null   float64
 9   url             5001 non-null   object 
 10  num_bathrooms   5001 non-null   int64  
 11  building_type   5001 non-null   object 
 12  floor           5001 non-null   int64  
 13  ceiling_height  5001 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 547.1+ KB


In [3]:
# Lasso-based feature pruning.
#
# Each pass: grid-search alpha with 5-fold CV on the current X, fit a Lasso
# with the winning alpha on an 80/20 split, report hold-out metrics, then drop
# every column (except column 0) whose coefficient is exactly zero. The loop
# stops once a pass finds nothing to drop. X and cols_order are updated
# together so the labels keep matching the remaining columns.
#
# Alpha candidates disabled in this run:
#   below the grid: 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 0.2, 0.3, 0.4, 0.5,
#                   1, 2, 3, 4, 5, 10, 20, 30, 40, 50
#   above the grid: 150, 200, 300, 400, 500
lasso = Lasso()
params = {"alpha": [60, 70, 80, 90, 100, 110, 120, 130, 140]}
Regressor = GridSearchCV(lasso, params, scoring="neg_mean_squared_error", cv=5)


while True:
    Regressor.fit(X, y)
    chosen_lasso_alpha = Regressor.best_params_["alpha"]
    print("best parameter: ", chosen_lasso_alpha)

    # Refit a fresh model with the selected penalty on a fixed hold-out split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=78
    )
    lasso = Lasso(alpha=chosen_lasso_alpha)
    lasso.fit(X_train, y_train)
    y_pred = lasso.predict(X_test)

    holdout_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("RMSE before dropping features with 0 bettas: ", holdout_rmse)
    holdout_r2 = lasso.score(X_test, y_test)
    print("R2 score before dropping features with 0 bettas: ", holdout_r2)

    # Columns with a zero coefficient are candidates for removal; position 0
    # is always kept.
    coefs = lasso.coef_
    indices_to_drop = [
        position
        for position, weight in enumerate(coefs)
        if position != 0 and weight == 0
    ]

    if len(indices_to_drop) == 0:
        break

    X_copy = np.delete(X, indices_to_drop, 1)
    cols_order = np.delete(cols_order, indices_to_drop)
    X = X_copy

best parameter:  120
RMSE before dropping features with 0 bettas:  26171.27556645697
R2 score before dropping features with 0 bettas:  0.7739462648214974
best parameter:  100
RMSE before dropping features with 0 bettas:  26141.877793940042
R2 score before dropping features with 0 bettas:  0.7744538246374683
best parameter:  100
RMSE before dropping features with 0 bettas:  26141.877793940042
R2 score before dropping features with 0 bettas:  0.7744538246374683


In [4]:
# Ridge candidate with a fixed penalty; the grid search itself is disabled.
#
# Alpha candidates disabled in this grid:
#   below: 1e-5, 1e-4, 1e-3, 1e-2, 0.1 to 2 in steps of 0.1, 3, 4, 5, 10, 20, 30
#   above: 130, 140, 150, 200, 300, 400, 500
ridge = Ridge(alpha=110)
params = {"alpha": [40, 50, 60, 70, 80, 90, 100, 110, 120]}

# Regressor = GridSearchCV(ridge, params, scoring="neg_mean_squared_error", cv=5)
# Regressor.fit(X, y)
# print("best parameter: ", Regressor.best_params_)
# chosen_ridge_alpha = Regressor.best_params_["alpha"]
# with scaler and X_copy alpha = 60

In [5]:
# ElasticNet candidate with fixed hyper-parameters; the grid search is disabled
# and the grid is reduced to the single configured point.
#
# Candidates disabled in this grid:
#   alpha:    0.0001, 1e-5, 1e-4, 1e-3, 1e-1, 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 100
#   l1_ratio: 1e-9, 1e-8, 1e-7, 1e-6, 1e-4, 1e-3, 1e-2, 0.1 to 0.9 in steps of 0.1
elastic_net = ElasticNet(alpha=0.01, l1_ratio=1e-5, max_iter=2000)
params = {"alpha": [1e-2], "l1_ratio": [1e-5]}

# Regressor = GridSearchCV(elastic_net, params, scoring="neg_mean_squared_error", cv=5)
# Regressor.fit(X, y)
# print(Regressor.best_params_)

In [6]:
# k-nearest-neighbours candidate with a fixed k; the grid search over
# k = 1..7 is disabled.
KNNreg = KNeighborsRegressor(n_neighbors=4)
params = {"n_neighbors": list(range(1, 8))}

# Regressor = GridSearchCV(KNNreg, params, scoring="neg_mean_squared_error", cv=5)
# Regressor.fit(X, y)
# print(Regressor.best_params_)

In [7]:
# Choose the estimator that the cross-validation and final test cells fit.
# Exactly one assignment is active; the other candidates are kept for reference.
# model = LinearRegression()
# model = elastic_net
# NOTE: the Ridge option needs chosen_ridge_alpha, which is only assigned inside
# the commented-out Ridge grid search, so that search must be re-enabled first.
# model = Ridge(alpha=chosen_ridge_alpha)
# Active choice: Lasso with the alpha picked by the last grid-search pass.
model = Lasso(alpha=chosen_lasso_alpha)
# model = KNNreg

In [8]:
# 10-fold cross-validation of the selected model on the training data.
# Per fold: fit on the training part, record R2 and RMSE on the held-out part;
# afterwards print the mean of each metric across folds.
cv = KFold(n_splits=10, shuffle=True, random_state=78)

r2_scores, RMSE_scores = [], []
for train_indices, test_indices in cv.split(X):
    X_train, y_train = X[train_indices], y[train_indices]
    X_test, y_test = X[test_indices], y[test_indices]

    model.fit(X_train, y_train)
    r2_scores.append(model.score(X_test, y_test))

    fold_mse = mean_squared_error(y_test, model.predict(X_test))
    RMSE_scores.append(np.sqrt(fold_mse))

print(np.mean(r2_scores))
print(np.mean(RMSE_scores))

0.7868213619579099
23958.804669296795


In [10]:
# Final evaluation on the test file:
#   1. preprocess the test data the same way as the training data
#      (one-hot encoding, scaling, ...)
#   2. make sure the columns and their order match the training matrix
#   3. refit on all training rows, predict, and report the metrics
X_TEST, y_TEST = get_X_y(
    data_dir="houses_test.csv",
    cols_order=cols_order,
    to_numpy=True,
)

model.fit(X, y)
y_PRED = model.predict(X_TEST)

test_rmse = np.sqrt(mean_squared_error(y_TEST, y_PRED))
print(f"RMSE TEST: {test_rmse}")
test_r2 = r2_score(y_TEST, y_PRED)
print(f"R2_score TEST: {test_r2}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234 entries, 0 to 1233
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      1234 non-null   int64  
 1   price           1234 non-null   float64
 2   condition       1234 non-null   object 
 3   district        1234 non-null   object 
 4   max_floor       1234 non-null   int64  
 5   street          1234 non-null   object 
 6   num_rooms       1234 non-null   int64  
 7   region          1234 non-null   object 
 8   area            1234 non-null   float64
 9   url             1234 non-null   object 
 10  num_bathrooms   1234 non-null   int64  
 11  building_type   1234 non-null   object 
 12  floor           1234 non-null   int64  
 13  ceiling_height  1234 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 135.1+ KB


  X.insert(0, col, np.zeros(X.shape[0]), True)
  X.insert(0, col, np.zeros(X.shape[0]), True)
  X.insert(0, col, np.zeros(X.shape[0]), True)


RMSE TEST: 25141.147369879818
R2_score TEST: 0.7663058249341261


### Dear TA, please set the `data_dir` argument of the `get_X_y(...)` call in the last cell (currently `"houses_test.csv"`) to the path of your houses data file, then run all the cells of this notebook from top to bottom.