In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

In [36]:
data_path = './data/sale-estates-data-set.csv'

In [37]:
df = pd.read_csv(data_path)

In [38]:
prize_mean = df.prize.mean()
prize_std = df.prize.std()

area_mean = df.Area.mean()
area_std = df.Area.std()


In [39]:
df['prize'] = df['prize'].apply(lambda x : (x - prize_mean) / prize_std)
df['Area'] = df['Area'].apply(lambda x : (x - area_mean) / area_std)

In [40]:
df.head()

Unnamed: 0,Area,prize,Garage,Environments,Bedrooms,"location_Abasto, Capital Federal","location_Agronomía, Capital Federal","location_Almagro, Capital Federal","location_Balvanera, Capital Federal","location_Barracas, Capital Federal",...,"location_Villa Lugano, Capital Federal","location_Villa Luro, Capital Federal","location_Villa Ortuzar, Capital Federal","location_Villa Pueyrredón, Capital Federal","location_Villa Real, Capital Federal","location_Villa Riachuelo, Capital Federal","location_Villa Santa Rita, Capital Federal","location_Villa Soldati, Capital Federal","location_Villa Urquiza, Capital Federal","location_Villa del Parque, Capital Federal"
0,-0.489299,0.050145,1.0,5.0,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.375861,-0.140227,3.0,5.0,4.0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,-0.46229,-0.372904,0.0,4.0,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.024743,-0.457514,0.0,4.0,3.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.454692,0.9597,0.0,5.0,7.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
df.columns

Index(['Area', 'prize', 'Garage', 'Environments', 'Bedrooms',
       'location_Abasto, Capital Federal',
       'location_Agronomía, Capital Federal',
       'location_Almagro, Capital Federal',
       'location_Balvanera, Capital Federal',
       'location_Barracas, Capital Federal',
       'location_Barrio Norte, Capital Federal',
       'location_Barrio Parque, Palermo', 'location_Belgrano C, Belgrano',
       'location_Belgrano Chico, Belgrano', 'location_Belgrano R, Belgrano',
       'location_Belgrano, Capital Federal', 'location_Boedo, Capital Federal',
       'location_Botánico, Palermo', 'location_Caballito Norte, Caballito',
       'location_Caballito Sur, Caballito',
       'location_Caballito, Capital Federal',
       'location_Centro / Microcentro, Capital Federal',
       'location_Chacarita, Capital Federal',
       'location_Cid Campeador, Caballito',
       'location_Coghlan, Capital Federal',
       'location_Colegiales, Capital Federal',
       'location_Congreso, Ca

In [42]:
X = df.loc[:, df.columns != 'prize'].values
Y = df.loc[:,'prize'].values.reshape(-1, 1)

In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [44]:
X.shape

(4879, 80)

In [45]:
Y_train.shape

(3903, 1)

In [46]:
mlp_regressor = MLPRegressor(
    hidden_layer_sizes=[78, 78, 78, 78, 78, 78, 78, 78],
    activation='relu',
    max_iter=500,
    solver='sgd',
    batch_size=32,
    learning_rate_init = 0.005,
    learning_rate = 'adaptive',
    shuffle=True,
    early_stopping=True,
    tol = 1e-7,
    n_iter_no_change=10)

In [49]:
mlp_regressor.fit(X_train, Y_train.ravel())

MLPRegressor(batch_size=32, early_stopping=True,
             hidden_layer_sizes=[78, 78, 78, 78, 78, 78, 78, 78],
             learning_rate='adaptive', learning_rate_init=0.005, max_iter=500,
             solver='sgd', tol=1e-07)

In [50]:
mlp_loss = mlp_regressor.loss_

mlp_train_score = mlp_regressor.score(X_train, Y_train)

mlp_test_score = mlp_regressor.score(X_test, Y_test)

print('MLP Loss: ' + str(mlp_loss))
print('MLP Score Train: ' + str(mlp_train_score))
print('MLP Score Test: ' + str(mlp_test_score))

MLP Loss: 0.09319970945503328
MLP Score Train: 0.8028740390181036
MLP Score Test: 0.7803179694935978


In [51]:
random_forest = RandomForestRegressor(max_depth=10)

In [52]:
random_forest.fit(X_train, Y_train.ravel())

RandomForestRegressor(max_depth=10)

In [53]:
random_forest_test_score = random_forest.score(X_test, Y_test)
random_forest_train_score = random_forest.score(X_train, Y_train)

print('RF Train score: ' + str(random_forest_train_score))
print('RF Test score: ' + str(random_forest_test_score))

RF Train score: 0.8590782619063839
RF Test score: 0.744561253903864


In [71]:
neigh = KNeighborsRegressor(n_neighbors=10)

In [72]:
neigh.fit(X_train, Y_train.ravel())

KNeighborsRegressor(n_neighbors=10)

In [73]:
neigh_test_score = neigh.score(X_test, Y_test)
neigh_train_score = neigh.score(X_train, Y_train)

print('Neigh Train score: ' + str(neigh_train_score))
print('Neigh Test score: ' + str(neigh_test_score))

Neigh Train score: 0.56759664294515
Neigh Test score: 0.44819572901994875


In [74]:
print('MLP Train Score: ' + str(mlp_train_score))
print('RF Train score: ' + str(random_forest_train_score))
print('Neigh Train score: ' + str(neigh_train_score))
print('\n')
print('MLP TestScore: ' + str(mlp_test_score))
print('RF Test score: ' + str(random_forest_test_score))
print('Neigh Test score: ' + str(neigh_test_score))

MLP Train Score: 0.8028740390181036
RF Train score: 0.8590782619063839
Neigh Train score: 0.56759664294515


MLP TestScore: 0.7803179694935978
RF Test score: 0.744561253903864
Neigh Test score: 0.44819572901994875


In [75]:
columns = df.columns.values


In [79]:
columns[columns != 'prize'][4:]

array(['location_Abasto, Capital Federal',
       'location_Agronomía, Capital Federal',
       'location_Almagro, Capital Federal',
       'location_Balvanera, Capital Federal',
       'location_Barracas, Capital Federal',
       'location_Barrio Norte, Capital Federal',
       'location_Barrio Parque, Palermo', 'location_Belgrano C, Belgrano',
       'location_Belgrano Chico, Belgrano',
       'location_Belgrano R, Belgrano',
       'location_Belgrano, Capital Federal',
       'location_Boedo, Capital Federal', 'location_Botánico, Palermo',
       'location_Caballito Norte, Caballito',
       'location_Caballito Sur, Caballito',
       'location_Caballito, Capital Federal',
       'location_Centro / Microcentro, Capital Federal',
       'location_Chacarita, Capital Federal',
       'location_Cid Campeador, Caballito',
       'location_Coghlan, Capital Federal',
       'location_Colegiales, Capital Federal',
       'location_Congreso, Capital Federal',
       'location_Constitución, C

In [111]:
city = 'Palermo'
locality = 'Capital Federal'

In [112]:
city_index = np.where(columns[columns != 'prize'][4:] == 'location_' + city + ', ' + locality)[0][0]
print(city_index)

47


In [116]:
area = 290
garage = 2
environments = 4
bedrooms = 4

area_normalized = (area - area_mean) / area_std

location = np.zeros(76)
location[city_index] = 1

X_pred = np.concatenate((np.array([area_normalized, garage, environments, bedrooms]), location)).reshape(-1, 1)

In [117]:
Y_mlp_pred = mlp_regressor.predict(X_pred.T)
Y_random_forest_pred = random_forest.predict(X_pred.T)
Y_neigh_pred = neigh.predict(X_pred.T)

In [118]:
print("MLP: ", str(Y_mlp_pred * prize_std + prize_mean))
print("RF: ", str(Y_random_forest_pred * prize_std + prize_mean))
print("NEIGH: ", str(Y_neigh_pred * prize_std + prize_mean))



MLP:  [745155.4340491]
RF:  [629965.61230624]
NEIGH:  [864500.]
