In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

In [2]:
impute_and_scale = False

if impute_and_scale:
    np.random.seed(0)
    df = pd.read_csv('processed_dataset.csv')
    df = df.drop(['Unnamed: 0'], axis=1)

    # split into train and test
    test_ratio = 0.2
    train_indices = np.sort(np.random.choice(df.shape[0], int((1-test_ratio)*df.shape[0]), replace=False))
    test_indices = np.setdiff1d(np.arange(df.shape[0]), train_indices)

    #train_indices = train_indices[:int(0.05*len(train_indices))]
    #test_indices = test_indices[:int(0.05*len(test_indices))]

    y = df['price']
    X = df.drop('price', axis=1)

    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]

    imputer = KNNImputer(n_neighbors=5)
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    
    imputed_array = np.zeros((df.shape[0], df.shape[1]))
    imputed_array[train_indices, :] = np.concatenate((X_train, y_train.values.reshape(-1,1)), axis=1)
    imputed_array[test_indices, :] = np.concatenate((X_test, y_test.values.reshape(-1,1)), axis=1)
    column_names = list(df.columns[:8]) + list(df.columns[9:]) + [df.columns[8]]
    
    df = pd.DataFrame(imputed_array, columns=column_names)

    y = df['price']
    X = df.drop('price', axis=1)

    X_train = X.iloc[train_indices]
    y_train = y.iloc[train_indices]
    X_test = X.iloc[test_indices]
    y_test = y.iloc[test_indices]

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    scaled_array = np.zeros((df.shape[0], df.shape[1]))
    scaled_array[train_indices, :] = np.concatenate((X_train, y_train.values.reshape(-1,1)), axis=1)
    scaled_array[test_indices, :] = np.concatenate((X_test, y_test.values.reshape(-1,1)), axis=1)

    df = pd.DataFrame(scaled_array, columns=column_names)

    df.to_csv('imputed_scaled_dataset_0.csv')
else:
    df = pd.read_csv('imputed_scaled_dataset_0.csv')
    df = df.drop(['Unnamed: 0'], axis=1)

In [3]:
y = df['price']
X = df.drop('price', axis=1)

# split into train and test
np.random.seed(0)
test_ratio = 0.2
train_indices = np.sort(np.random.choice(df.shape[0], int((1-test_ratio)*df.shape[0]), replace=False))
test_indices = np.setdiff1d(np.arange(df.shape[0]), train_indices)

X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]
X_test = X.iloc[test_indices]
y_test = y.iloc[test_indices]

In [4]:
for i, column_name in enumerate(df.columns):
    values, counts = np.unique(df[column_name].astype(str), return_counts=True)
    if len(values) < 10 and counts[0]/len(df[column_name]) > 0.5:
        print(column_name, counts/len(df[column_name]))    

homeStatus [0.60768358 0.15091755 0.24139888]
resoFactsStats/hasAttachedGarage [0.96547095 0.03452905]
resoFactsStats/hasAttachedProperty [0.94858701 0.05141299]
resoFactsStats/hasGarage [0.79527526 0.20472474]
resoFactsStats/hasOpenParking [0.96053032 0.03946968]
resoFactsStats/hasView [0.93149547 0.06850453]
schools/2/isAssigned [1.]
description_exists [0.51065626 0.48934374]


In [29]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# recursive feature elimination
if 1:
    regressor = DecisionTreeRegressor(min_samples_leaf=int(2.5e-4*train_indices.size))
else:
    regressor = RandomForestRegressor(min_samples_leaf=int(2.5e-4*train_indices.size))
rfe = RFE(estimator=regressor, n_features_to_select=0.5, step=10)
rfe.fit(X_train, y_train)
print(X.columns[rfe.support_])


Index(['address/zipcode', 'bathrooms', 'homeStatus', 'latitude', 'livingArea',
       'longitude', 'resoFactsStats/bathrooms', 'resoFactsStats/bathroomsFull',
       'resoFactsStats/bedrooms', 'resoFactsStats/cityRegion',
       'resoFactsStats/homeType', 'resoFactsStats/livingArea',
       'resoFactsStats/lotSize', 'resoFactsStats/taxAnnualAmount',
       'resoFactsStats/taxAssessedValue', 'schools/0/rating', 'schools/0/size',
       'schools/1/distance', 'schools/1/size', 'schools/1/studentsPerTeacher',
       'schools/2/rating', 'schools/2/size', 'yearBuilt', 'zpid',
       'number_of_photos', 'description_lengths'],
      dtype='object')


In [31]:
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator=regressor, min_features_to_select=0.5, step=10)
rfecv.fit(X_train, y_train)
print(X.columns[rfecv.support_])

Index(['bathrooms', 'homeStatus', 'latitude', 'livingArea', 'longitude',
       'resoFactsStats/bathrooms', 'resoFactsStats/bathroomsFull',
       'resoFactsStats/bedrooms', 'resoFactsStats/cityRegion',
       'resoFactsStats/homeType', 'resoFactsStats/livingArea',
       'resoFactsStats/lotSize', 'resoFactsStats/taxAnnualAmount',
       'resoFactsStats/taxAssessedValue', 'schools/0/rating', 'schools/0/size',
       'schools/1/studentsPerTeacher', 'schools/2/rating', 'schools/2/size',
       'zpid', 'number_of_photos', 'description_lengths'],
      dtype='object')


In [106]:
from sklearn.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(regressor, n_features_to_select=0.5, direction="forward", cv=5)
sfs.fit(X_train, y_train)
print(X.columns[sfs.support_])

Index(['bedrooms', 'homeStatus', 'latitude', 'propertyTaxRate',
       'resoFactsStats/bathrooms', 'resoFactsStats/cityRegion',
       'resoFactsStats/garageSpaces', 'resoFactsStats/hasAttachedGarage',
       'resoFactsStats/hasAttachedProperty', 'resoFactsStats/hasCooling',
       'resoFactsStats/hasGarage', 'resoFactsStats/hasHeating',
       'resoFactsStats/hasOpenParking', 'resoFactsStats/hasView',
       'resoFactsStats/homeType', 'resoFactsStats/parking',
       'resoFactsStats/taxAssessedValue', 'schools/0/level',
       'schools/0/rating', 'schools/0/size', 'schools/1/studentsPerTeacher',
       'schools/2/isAssigned', 'schools/2/level',
       'schools/2/studentsPerTeacher', 'zpid', 'url_exists'],
      dtype='object')


In [32]:
regressor.fit(X_train, y_train)
y_predictions = regressor.predict(X_test)
print("All features RMSE: ", np.sqrt(np.mean((y_predictions - y_test)**2)))

All features RMSE:  2501240.518529755


In [33]:
regressor.fit(X_train.iloc[:, rfe.support_], y_train)
y_predictions_rfe = regressor.predict(X_test.iloc[:, rfe.support_])
print("Selected features rfe RMSE: ", (np.sqrt(np.mean((y_predictions_rfe - y_test)**2))))

Selected features rfe RMSE:  2513599.9599027713


In [34]:
regressor.fit(X_train.iloc[:, rfecv.support_], y_train)
y_predictions_rfecv = regressor.predict(X_test.iloc[:, rfecv.support_])
print("Selected features rfecv RMSE: ", (np.sqrt(np.mean((y_predictions_rfecv - y_test)**2))))

Selected features rfecv RMSE:  2502485.2092605038


In [109]:
regressor.fit(X_train.iloc[:, sfs.support_], y_train)
y_predictions_sfs = regressor.predict(X_test.iloc[:, sfs.support_])
print("Selected features sfs RMSE: ", (np.sqrt(np.mean((y_predictions_sfs - y_test)**2))))

Selected features sfs RMSE:  2507722.951050765


In [110]:
print("Features in both selections: ", X.columns[(rfe.support_) & (sfs.support_)])

Features in both selections:  Index(['homeStatus', 'latitude', 'resoFactsStats/bathrooms',
       'resoFactsStats/cityRegion', 'resoFactsStats/homeType',
       'resoFactsStats/taxAssessedValue', 'schools/0/rating', 'schools/0/size',
       'schools/1/studentsPerTeacher', 'zpid'],
      dtype='object')


In [111]:
print("Features chosen by sfs but not by rfe: ", X.columns[~(rfe.support_) & (sfs.support_)])

Features chosen by sfs but not by rfe:  Index(['bedrooms', 'propertyTaxRate', 'resoFactsStats/garageSpaces',
       'resoFactsStats/hasAttachedGarage',
       'resoFactsStats/hasAttachedProperty', 'resoFactsStats/hasCooling',
       'resoFactsStats/hasGarage', 'resoFactsStats/hasHeating',
       'resoFactsStats/hasOpenParking', 'resoFactsStats/hasView',
       'resoFactsStats/parking', 'schools/0/level', 'schools/2/isAssigned',
       'schools/2/level', 'schools/2/studentsPerTeacher', 'url_exists'],
      dtype='object')


In [112]:
print("Features chosen by rfe but not by sfs: ", X.columns[(rfe.support_) & ~(sfs.support_)])

Features chosen by rfe but not by sfs:  Index(['address/zipcode', 'bathrooms', 'livingArea', 'longitude',
       'resoFactsStats/bathroomsFull', 'resoFactsStats/bedrooms',
       'resoFactsStats/livingArea', 'resoFactsStats/lotSize',
       'resoFactsStats/taxAnnualAmount', 'schools/0/distance',
       'schools/1/distance', 'schools/1/size', 'schools/2/rating',
       'schools/2/size', 'number_of_photos', 'description_lengths'],
      dtype='object')


In [24]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_predictions = lr.predict(X_test)
print("Linear Regression RMSE: ", np.sqrt(np.mean((lr_predictions - y_test)**2)))

Linear Regression RMSE:  2976830.0663935035


In [28]:
lr = LinearRegression()
lr.fit(X_train.iloc[:, rfe.support_], y_train)
lr_predictions_rfe = lr.predict(X_test.iloc[:, rfe.support_])
print("Linear Regression rfe RMSE: ", np.sqrt(np.mean((lr_predictions_rfe - y_test)**2)))

Linear Regression rfe RMSE:  3126545.0220086235


In [139]:
lr = LinearRegression()
lr.fit(X_train.iloc[:, sfs.support_], y_train)
lr_predictions_sfs = lr.predict(X_test.iloc[:, sfs.support_])
print("Linear Regression sfs RMSE: ", np.sqrt(np.mean((lr_predictions_sfs - y_test)**2)))

Linear Regression sfs RMSE:  3106835.355218363
