In [16]:
import pandas as pd
from sklearn import linear_model 
import numpy as np
import statistics

# Locations of the base feature tables for the training and evaluation splits.
TRAIN_PATH = '../../res/ftr/base_data_train.csv'
EVALUATION_PATH = '../../res/ftr/base_data_evaluation.csv'

# Show up to 30 columns when a frame is displayed.
pd.set_option('display.max_columns', 30)

# Load each split and discard its 'Unnamed: 0' column straight away
# (presumably the index written when the CSV was saved -- confirm with the
# script that produced these files).
training_set = pd.read_csv(TRAIN_PATH).drop(columns=['Unnamed: 0'])
evaluation_set = pd.read_csv(EVALUATION_PATH).drop(columns=['Unnamed: 0'])

evaluation_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos
0,4941,29.0,3.0,,4.0,300.0,,,15.906,0,0,0,0,0
1,51775,,1.0,1.0,1.0,67.0,67.0,113851.0,16.732,0,0,0,0,0
2,115253,0.0,2.0,1.0,2.0,87.0,100.0,23620.0,16.585,0,0,0,0,1
3,299321,2.0,2.0,2.0,2.0,86.0,86.0,129347.0,16.527,0,0,0,0,0
4,173570,10.0,2.0,1.0,1.0,80.0,76.0,57125.0,15.932,0,0,0,1,1


In [17]:
def load_features(train_df, evaluation_df, features_list, features_dir='../../res/ftr/'):
    """Attach pre-computed feature tables to the training and evaluation frames.

    For every name in ``features_list`` two CSV files are read,
    ``<features_dir><name>_train.csv`` and ``<features_dir><name>_evaluation.csv``,
    and inner-joined onto the matching frame using the ``id`` column.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; must contain an ``id`` column.
    evaluation_df : pandas.DataFrame
        Evaluation frame; must contain an ``id`` column.
    features_list : iterable of str
        Base names of the feature files to load. An empty iterable returns
        both frames unchanged.
    features_dir : str, optional
        Directory prefix of the feature files. It is concatenated verbatim
        with the file name, so it must end with a path separator.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The enriched ``(train_df, evaluation_df)``. The inputs are not
        modified in place.

    Notes
    -----
    The join is an inner join: rows whose ``id`` is absent from a feature
    file are dropped from the result, for the evaluation frame as well.
    """
    index_column = 'Unnamed: 0'

    for feature in features_list:
        train_ftr = pd.read_csv(features_dir + feature + '_train.csv')
        evaluation_ftr = pd.read_csv(features_dir + feature + '_evaluation.csv')

        # Remove the 'Unnamed: 0' column from the feature table *before*
        # merging: if both sides carried it, the merge would rename the pair
        # to 'Unnamed: 0_x' / 'Unnamed: 0_y' and a later drop would fail.
        # errors='ignore' tolerates files that do not have the column at all.
        train_df = (
            train_df
            .merge(train_ftr.drop(columns=[index_column], errors='ignore'), on='id', how='inner')
            .drop(columns=[index_column], errors='ignore')
        )
        evaluation_df = (
            evaluation_df
            .merge(evaluation_ftr.drop(columns=[index_column], errors='ignore'), on='id', how='inner')
            .drop(columns=[index_column], errors='ignore')
        )

    return (train_df, evaluation_df)

# Enrich both splits with the surface-derived and murder-rate feature tables.
training_set, evaluation_set = load_features(
    training_set,
    evaluation_set,
    ['surface_features', 'murder_rate_of_entity'],
)

training_set.describe()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,metrostotalesporhabitacion,metroscubiertosporhabitacion,metroscubiertossobretotales,metrosdescubiertos,murder_rate_of_entity
count,240000.0,196445.0,217529.0,202235.0,213779.0,222600.0,188533.0,211379.0,240000.0,240000.0,240000.0,240000.0,240000.0,240000.0,240000.0,170403.0,209785.0,171133.0,171133.0,239845.0
mean,149969.382092,8.116114,2.902326,1.546874,2.132417,174.016774,176.765145,2423468.0,16.527429,0.062475,0.055092,0.087383,0.444142,0.396533,2530838.0,59.089277,59.588282,1.0828,-4.444011,14.436264
std,86634.579744,9.55383,0.896894,0.853507,0.912546,98.15295,94.427328,10567940.0,0.515122,0.242017,0.22816,0.282397,0.496871,0.489179,2152552.0,31.454123,29.97814,0.465917,66.646722,11.66227
min,1.0,0.0,1.0,0.0,1.0,15.0,15.0,22.0,15.34,0.0,0.0,0.0,0.0,0.0,310000.0,2.0,2.0,0.0375,-411.0,2.0
25%,74930.75,0.0,2.0,1.0,1.0,90.0,102.0,24890.0,16.142,0.0,0.0,0.0,0.0,0.0,952772.5,36.666667,36.666667,0.857143,-35.0,9.666667
50%,149875.5,5.0,3.0,2.0,2.0,153.0,155.0,56383.0,16.633,0.0,0.0,0.0,0.0,0.0,1850000.0,51.0,53.0,1.0,0.0,10.333333
75%,225016.5,10.0,3.0,2.0,3.0,240.0,238.0,87838.0,16.981,0.0,0.0,0.0,1.0,1.0,3390000.0,74.25,75.666667,1.238095,21.0,15.666667
max,299999.0,80.0,10.0,3.0,4.0,439.0,439.0,50004000.0,17.166,1.0,1.0,1.0,1.0,1.0,12525000.0,435.0,438.0,25.4,416.0,71.833333


In [32]:
# Columns whose missing values are replaced by the training-set median.
median_cols = ['antiguedad', 'habitaciones', 'banos', 'metroscubiertosporhabitacion', 'metroscubiertossobretotales']
# Column groupings and lookup tables that this cell defines but does not use;
# kept so any other cell referring to them keeps working.
tipo_median_cols = ['metroscubiertos', 'metrostotales']
ciudad_median_cols = ['idzona', 'murder_rate_of_entity']

median_by_tipodepropiedad = {}
mean_by_ciudad = {}
median_by_ciudad = {}

# Missing values in these columns are replaced by 0.
zero_fill_cols = ['garages', 'idzona']
# Every column that is imputed with its own median.
all_median_cols = [
    'metroscubiertos',
    'metrostotales',
    'metrostotalesporhabitacion',
    'metrosdescubiertos',
    'murder_rate_of_entity',
] + median_cols

# Build the fill value of every column once, from the training set only and
# before anything is modified. Each column uses its OWN median (previously
# 'metrostotales' was filled with the median of 'metroscubiertos').
fill_values = {column: 0 for column in zero_fill_cols}
fill_values.update(training_set[all_median_cols].median().to_dict())

# Apply the same training-derived values to both splits so the evaluation set
# is imputed consistently with the data the model is fitted on, and so that
# `median_cols` no longer stay NaN in the evaluation set.
for column, fill_value in fill_values.items():
    training_set[column] = training_set[column].fillna(fill_value)
    evaluation_set[column] = evaluation_set[column].fillna(fill_value)

training_set.head()

Unnamed: 0,id,antiguedad,habitaciones,garages,banos,metroscubiertos,metrostotales,idzona,fecha,gimnasio,usosmultiples,piscina,escuelascercanas,centroscomercialescercanos,precio,metrostotalesporhabitacion,metroscubiertosporhabitacion,metroscubiertossobretotales,metrosdescubiertos,murder_rate_of_entity
0,254099,5.0,2.0,1.0,2.0,80.0,80.0,23533.0,16.67,0,0,0,0,0,2273000,40.0,40.0,1.0,0.0,9.666667
1,53461,10.0,3.0,2.0,2.0,268.0,180.0,24514.0,15.884,0,0,0,1,1,3600000,60.0,89.333333,1.488889,-88.0,9.666667
2,247984,5.0,3.0,2.0,2.0,144.0,166.0,48551.0,16.725,0,0,0,0,0,1200000,55.333333,48.0,0.86747,22.0,13.5
3,209067,1.0,2.0,1.0,1.0,63.0,67.0,53666.0,15.408,0,0,0,1,1,650000,33.5,31.5,0.940299,4.0,10.333333
4,185997,10.0,2.0,1.0,1.0,95.0,95.0,47835.0,16.959,0,0,0,0,0,1150000,47.5,47.5,1.0,0.0,13.5


In [33]:
# Print each column's dtype and non-null count for the training frame.
training_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240000 entries, 0 to 239999
Data columns (total 20 columns):
id                              240000 non-null int64
antiguedad                      240000 non-null float64
habitaciones                    240000 non-null float64
garages                         240000 non-null float64
banos                           240000 non-null float64
metroscubiertos                 240000 non-null float64
metrostotales                   240000 non-null float64
idzona                          240000 non-null float64
fecha                           240000 non-null float64
gimnasio                        240000 non-null int64
usosmultiples                   240000 non-null int64
piscina                         240000 non-null int64
escuelascercanas                240000 non-null int64
centroscomercialescercanos      240000 non-null int64
precio                          240000 non-null int64
metrostotalesporhabitacion      240000 non-null float64
metrosc

In [None]:
# Target vector: the 'precio' column. Feature matrix: every other column.
# NOTE(review): only 'precio' is removed, so the 'id' column is part of the
# feature matrix -- confirm that this is intended.
training_set_Y = training_set['precio'].values
training_set_X = training_set.drop(columns=['precio']).values

# Ordinary least-squares fit on the imputed training data.
# NOTE(review): the `normalize` argument was removed from LinearRegression in
# scikit-learn 1.2; this line needs updating if the environment is upgraded.
regressor = linear_model.LinearRegression(normalize=True)
regressor.fit(training_set_X, training_set_Y)