In [104]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the raw table; the relative path is resolved against the working directory.
data = pd.read_csv(filepath_or_buffer="Real estate.csv")

In [105]:
# Separate the target from the features by column name instead of by position.
# The step is guarded so that re-running this cell on the already-trimmed frame
# is a no-op; the positional version would bind `y` to the last feature column
# and then raise KeyError in the drop.
target_col = 'Y house price of unit area'
if target_col in data.columns:
    y = data[target_col].copy()
    # 'No' is removed together with the target — presumably a row counter; confirm.
    data = data.drop(columns=[target_col, 'No'])

# Integer seed passed as `random_state` by later cells (an int, not a Generator).
rng = 42

In [106]:
# Preview the first five rows of the feature frame.
data.head(n=5)

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude
0,2012.917,32.0,84.87882,10,24.98298,121.54024
1,2012.917,19.5,306.5947,9,24.98034,121.53951
2,2013.583,13.3,561.9845,5,24.98746,121.54391
3,2013.5,13.3,561.9845,5,24.98746,121.54391
4,2012.833,5.0,390.5684,5,24.97937,121.54245


In [107]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model, features=None, target=None, cv=10):
    """Return the mean cross-validation score of ``model``.

    Parameters
    ----------
    model : estimator
        Object forwarded unchanged to ``cross_val_score``.
    features, target : optional
        Data to validate on. When omitted, the notebook-level ``X`` and ``y``
        are used, so existing ``cross_val(model)`` calls behave as before.
    cv : int, default 10
        Passed to ``cross_val_score`` as ``cv`` (the original hard-coded 10).

    Returns
    -------
    float
        Mean of the per-fold scores. No ``scoring`` argument is passed, so
        whatever default scikit-learn applies for the estimator is used.
    """
    # NOTE(review): no visible cell defines `X`; the feature frame appears to be
    # `data`. Pass `features=` explicitly or define `X` before relying on the default.
    features = X if features is None else features
    target = y if target is None else target
    fold_scores = cross_val_score(model, features, target, cv=cv)
    return fold_scores.mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R2 for ``predicted`` against ``true``.

    The numbers come from ``evaluate`` so the two helpers cannot drift apart
    and each metric is computed only once. Output is one line per metric
    followed by a ``______`` separator; nothing is returned.
    """
    mae, mse, rmse, r2_square = evaluate(true, predicted)
    print('MAE:', mae)
    print('MSE:', mse)
    print('RMSE:', rmse)
    print('R2 Square', r2_square)
    print('______')
    
def evaluate(true, predicted):
    """Compute regression metrics for ``predicted`` against ``true``.

    Returns a 4-tuple ``(mae, mse, rmse, r2)`` where ``rmse`` is the square
    root of ``mse``.
    """
    abs_err = metrics.mean_absolute_error(true, predicted)
    sq_err = metrics.mean_squared_error(true, predicted)
    root_sq_err = np.sqrt(sq_err)
    r2 = metrics.r2_score(true, predicted)
    return abs_err, sq_err, root_sq_err, r2

In [108]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the shared seed is passed as random_state.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=rng,
)

In [109]:
from sklearn.preprocessing import MinMaxScaler
# Fit the scaler on the training split only, then apply the same transform to
# both splits. Wrapping the arrays restores the original column labels and index.
scaler = MinMaxScaler().fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [110]:
from sklearn.ensemble import IsolationForest
# Work on copies so the scaled training split itself stays untouched.
X_train_1 = X_train.copy()
y_train_1 = pd.DataFrame(y_train.copy())

# Fit the detector on the training features and record its label per row.
clf = IsolationForest(n_estimators=100, random_state=rng, contamination=0.1)
clf.fit(X_train_1)
X_train_1['anomaly'] = clf.predict(X_train_1)
# Index-aligned assignment: each target row receives its feature row's label.
y_train_1['anomaly'] = X_train_1['anomaly']

# Keep only rows labelled 1 (scikit-learn's inlier label). The 'anomaly'
# column is left in place on both frames.
keep_features = X_train_1['anomaly'] == 1
keep_targets = y_train_1['anomaly'] == 1
X_train_1 = X_train_1.loc[keep_features]
y_train_1 = y_train_1.loc[keep_targets]



In [111]:
# Remove the helper 'anomaly' column and collapse the target back to a Series.
# Both steps are guarded so this cell can be re-run: the original in-place drop
# and the column lookup each raised KeyError on a second execution.
X_train_1 = X_train_1.drop(columns=['anomaly'], errors='ignore')
if isinstance(y_train_1, pd.DataFrame):
    y_train_1 = y_train_1['Y house price of unit area']
y_train_1

192     42.7
234     23.9
5       32.1
45      38.3
245     40.8
       ...  
71      40.8
106     47.1
270    117.5
348     53.7
102     54.4
Name: Y house price of unit area, Length: 298, dtype: float64

In [112]:
from sklearn.ensemble import RandomForestRegressor

# Train the forest on the outlier-filtered training rows.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=rng)
rf_reg.fit(X_train_1, y_train_1)

# Predict for the held-out split and for the rows the model was fitted on.
test_pred = rf_reg.predict(X_test)
train_pred = rf_reg.predict(X_train_1)

# Report the test metrics first, then the training metrics.
print('Testing set evaluation:\n______')
print_evaluate(y_test, test_pred)

print('==*******==')

print('Training set evaluation:\n_____')
print_evaluate(y_train_1, train_pred)

Testing set evaluation:
______
MAE: 3.843339156626506
MSE: 31.543081753490604
RMSE: 5.616322796411421
R2 Square 0.8119746674406556
______
==*******==
Training set evaluation:
_____
MAE: 1.9011203020134229
MSE: 9.47842959174274
RMSE: 3.0787058306604647
R2 Square 0.944328935169913
______
