In [1]:
import pandas as pd
import numpy as np

# Column dtypes for the listing CSVs. Only four of these columns are actually
# loaded below (see selected_columns).
types = dict(
    id=int,
    titulo=str,
    descripcion=str,
    tipodepropiedad=str,
    direccion=str,
    ciudad=str,
    provincia=str,
    antiguedad=np.float32,
    habitaciones=np.float32,
    garages=np.float32,
    idzona=np.float32,
    lat=np.float32,
    lng=np.float32,
    gimnasio=int,
    usosmultiples=int,
    piscina=int,
    escuelascercanas=int,
    centroscomercialescercanos=int,
    precio=int,
)

pd.set_option('display.max_columns', 30)

# Both files are read with the same column subset and dtype map.
selected_columns = ['id', 'antiguedad', 'provincia', 'fecha']
training_data = pd.read_csv('../../data/TP2/train.csv', usecols=selected_columns, dtype=types)
evaluation_data = pd.read_csv('../../data/TP2/test.csv', usecols=selected_columns, dtype=types)

training_data.head()

Unnamed: 0,id,provincia,antiguedad,fecha
0,254099,Distrito Federal,,2015-08-23 00:00:00
1,53461,Distrito Federal,10.0,2013-06-28 00:00:00
2,247984,Jalisco,5.0,2015-10-17 00:00:00
3,209067,Edo. de México,1.0,2012-03-09 00:00:00
4,185997,Jalisco,10.0,2016-06-07 00:00:00


In [2]:
# Impute missing 'antiguedad' values.
# Step 1: per-province statistic computed on the training set only. Note that it
# is the *median*, even though the helper column is called 'mean_ant' (the name
# is kept because a later cell drops the column by that name).
med_prov_ant = (
    training_data.groupby('provincia')
    .agg({'antiguedad': 'median'})
    .reset_index()
    .rename(columns={'antiguedad': 'mean_ant'})
)
# Global fallback (training mean) for rows whose province has no statistic.
mean_ant = training_data.antiguedad.mean()

# Step 2: attach the per-province value to every row of both frames.
training_data = pd.merge(training_data, med_prov_ant, on='provincia', how='left')
evaluation_data = pd.merge(evaluation_data, med_prov_ant, on='provincia', how='left')

# Step 3: fill the gaps. The results are assigned back instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which pandas warns about and which no longer updates the frame under
# Copy-on-Write.
training_data['mean_ant'] = training_data['mean_ant'].fillna(mean_ant)
evaluation_data['mean_ant'] = evaluation_data['mean_ant'].fillna(mean_ant)

training_data['antiguedad'] = training_data['antiguedad'].fillna(training_data['mean_ant'])
evaluation_data['antiguedad'] = evaluation_data['antiguedad'].fillna(evaluation_data['mean_ant'])


training_data.head()

Unnamed: 0,id,provincia,antiguedad,fecha,mean_ant
0,254099,Distrito Federal,10.0,2015-08-23 00:00:00,10.0
1,53461,Distrito Federal,10.0,2013-06-28 00:00:00,10.0
2,247984,Jalisco,5.0,2015-10-17 00:00:00,5.0
3,209067,Edo. de México,1.0,2012-03-09 00:00:00,10.0
4,185997,Jalisco,10.0,2016-06-07 00:00:00,5.0


In [3]:
# Number of training rows with a missing 'fecha'.
training_data['fecha'].isna().sum()

0

In [4]:
# Parse the listing date so that its year can be extracted.
training_data["fecha"] = pd.to_datetime(training_data['fecha'])
evaluation_data["fecha"] = pd.to_datetime(evaluation_data['fecha'])

# Add the years elapsed between the listing date and 2016 to the reported age.
# Each frame must use its OWN 'fecha': the evaluation frame previously borrowed
# the training dates, which pandas aligned by row index, so every evaluation
# row was shifted by the offset of an unrelated training row.
training_data["antiguedad"] = training_data["antiguedad"] + (2016 - training_data["fecha"].dt.year)
evaluation_data["antiguedad"] = evaluation_data["antiguedad"] + (2016 - evaluation_data["fecha"].dt.year)

In [5]:
# Bucket the age into five bins: [0, 1], (1, 5], (5, 15], (15, 45], (45, 85].
# labels=False makes pd.cut return the integer bin index instead of Interval
# categories. It has to be passed for BOTH frames, otherwise the evaluation bins
# are Interval objects that do not match the integer codes used for training.
# Ages outside the bin edges come out as NaN.
antig_bins = [0, 1, 5, 15, 45, 85]
training_data['antig'] = pd.cut(training_data.antiguedad, antig_bins, labels=False, precision=1, include_lowest=True)
evaluation_data['antig'] = pd.cut(evaluation_data.antiguedad, antig_bins, labels=False, precision=1, include_lowest=True)

In [6]:
# Preview the first five training rows, now including the 'antig' bin column.
training_data.head(n=5)

Unnamed: 0,id,provincia,antiguedad,fecha,mean_ant,antig
0,254099,Distrito Federal,11.0,2015-08-23,10.0,2
1,53461,Distrito Federal,13.0,2013-06-28,10.0,2
2,247984,Jalisco,6.0,2015-10-17,5.0,2
3,209067,Edo. de México,5.0,2012-03-09,10.0,1
4,185997,Jalisco,10.0,2016-06-07,5.0,2


In [7]:
# Economic zone name -> province names (spelled as in the 'provincia' column)
# that belong to it.
zonas_economicas = {
    'noroeste': [
        'Baja California Norte', 'Baja California Sur', 'Chihuahua',
        'Durango', 'Sinaloa', 'Sonora',
    ],
    'noreste': ['Coahuila', 'Nuevo León', 'Tamaulipas'],
    'occidente': ['Colima', 'Jalisco', 'Michoacán', 'Nayarit'],
    'oriente': ['Hidalgo', 'Puebla', 'Tlaxcala', 'Veracruz'],
    'centronorte': [
        'Aguascalientes', 'Guanajuato', 'Querétaro',
        'San luis Potosí', 'Zacatecas',
    ],
    'centrosur': ['Edo. de México', 'Distrito Federal', 'Morelos'],
    'suroeste': ['Chiapas', 'Guerrero', 'Oaxaca'],
    'sureste': ['Campeche', 'Quintana Roo', 'Tabasco', 'Yucatán'],
}

In [8]:
def zone(data):
    """Return the economic zone of a row, or 'sindatos' when none matches.

    Parameters
    ----------
    data : row of a listings frame (as passed by ``DataFrame.apply(axis=1)``)
        Must expose the province under the ``'provincia'`` label.

    Returns
    -------
    str
        The key of ``zonas_economicas`` whose province list contains the row's
        province; ``'sindatos'`` if the province is missing or not listed.
    """
    # Look the province up by label rather than by position: data[1] only worked
    # while 'provincia' happened to be the second column, and integer keys on a
    # label-indexed Series are deprecated.
    provincia = data['provincia']
    for zona, provincias in zonas_economicas.items():
        if provincia in provincias:
            return zona
    return 'sindatos'

In [9]:
# Tag every row of both frames with its economic zone (row-wise call to zone()).
for frame in (training_data, evaluation_data):
    frame['zona'] = frame.apply(zone, axis=1)

In [10]:
# These inputs have been folded into 'antig' and 'zona'; remove them in place.
consumed_columns = ['provincia', 'antiguedad', 'fecha', 'mean_ant']
for frame in (training_data, evaluation_data):
    frame.drop(consumed_columns, axis=1, inplace=True)

In [11]:
#from sklearn.feature_extraction import FeatureHasher

#training_data['antig'] = training_data['antig'].apply(lambda x: str(x))
#evaluation_data['antig'] = evaluation_data['antig'].apply(lambda x: str(x))

#feature_hasher = FeatureHasher(n_features = 40, input_type='string')

In [12]:
# Turn the age bin into text so it can be concatenated with the zone name.
training_data['antig'] = training_data['antig'].astype(str)
if evaluation_data['antig'].dtype.name == 'category':
    # pd.cut called without labels=False returns an Interval categorical; fall
    # back to its integer codes so the evaluation bins are spelled the same way
    # as the integer bin indices of the training frame (a missing bin becomes -1).
    evaluation_data['antig'] = evaluation_data['antig'].cat.codes
evaluation_data['antig'] = evaluation_data['antig'].astype(str)

# Combined "<zona><bin>" key, e.g. 'centrosur2'. Each frame must combine its OWN
# columns: the evaluation key used to be built from training_data['antig'],
# which pandas aligned by row index, pairing each evaluation zone with the age
# bin of an unrelated training row.
training_data['antig_zone'] = training_data['zona'] + training_data['antig']
evaluation_data['antig_zone'] = evaluation_data['zona'] + evaluation_data['antig']

training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 240000 entries, 0 to 239999
Data columns (total 4 columns):
id            240000 non-null int64
antig         240000 non-null object
zona          240000 non-null object
antig_zone    240000 non-null object
dtypes: int64(1), object(3)
memory usage: 19.2+ MB


In [13]:
# Collect every distinct zone/age key that appears in either frame.
training_zones = training_data[['antig_zone']]
evaluation_zones = evaluation_data[['antig_zone']]
# axis=None flattens both single-column frames into one 1-D array before
# np.unique sorts and de-duplicates it.
stacked_zones = np.concatenate((training_zones, evaluation_zones), axis=None)
all_zones = np.unique(stacked_zones)

In [14]:
# Show the element dtype of the key array; the array itself is left untouched.
all_zones.dtype

dtype('O')

In [15]:
# One-hot encode the zone/age key: one 0/1 column per distinct key, created in
# the same order in both frames so that their columns line up.
# The loop variable is deliberately NOT called `zone`: that name belongs to the
# zone() function defined earlier, and rebinding it to a string makes any later
# re-run of the cell that applies zone() fail.
for zone_label in all_zones:
    # Vectorised comparison instead of a Python-level lambda per row.
    training_data['zone_' + zone_label] = (training_data['antig_zone'] == zone_label).astype(int)
    evaluation_data['zone_' + zone_label] = (evaluation_data['antig_zone'] == zone_label).astype(int)


In [16]:
# The key and its two components are now represented by the zone_* indicator
# columns, so remove all three from both frames.
encoded_away = ['antig_zone', 'antig', 'zona']
training_data = training_data.drop(columns=encoded_away)
evaluation_data = evaluation_data.drop(columns=encoded_away)

Unnamed: 0,id,antig,zona,zone_centronorte0,zone_centronorte1,zone_centronorte2,zone_centronorte3,zone_centronorte4,zone_centrosur0,zone_centrosur1,zone_centrosur2,zone_centrosur3,zone_centrosur4,zone_noreste0,zone_noreste1,...,zone_oriente4,zone_sindatos0,zone_sindatos1,zone_sindatos2,zone_sindatos3,zone_sureste0,zone_sureste1,zone_sureste2,zone_sureste3,zone_sureste4,zone_suroeste0,zone_suroeste1,zone_suroeste2,zone_suroeste3,zone_suroeste4
0,254099,2,centrosur,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,53461,2,centrosur,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,247984,2,occidente,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,209067,1,centrosur,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,185997,2,occidente,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Both exported files must share exactly the same columns in the same order.
# Index.equals yields a single boolean and copes with indexes of different
# length, whereas the element-wise `==` only displayed an array of booleans and
# never stopped the notebook on a mismatch.
assert evaluation_data.columns.equals(training_data.columns), \
    'training and evaluation feature columns differ'

Unnamed: 0,id,zone_centronorte0,zone_centronorte1,zone_centronorte2,zone_centronorte3,zone_centronorte4,zone_centrosur0,zone_centrosur1,zone_centrosur2,zone_centrosur3,zone_centrosur4,zone_noreste0,zone_noreste1,zone_noreste2,zone_noreste3,...,zone_oriente4,zone_sindatos0,zone_sindatos1,zone_sindatos2,zone_sindatos3,zone_sureste0,zone_sureste1,zone_sureste2,zone_sureste3,zone_sureste4,zone_suroeste0,zone_suroeste1,zone_suroeste2,zone_suroeste3,zone_suroeste4
0,4941,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,51775,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,115253,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,299321,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,173570,0,0,0,0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
# Persist the encoded features, one CSV per frame (the row index is written too,
# as with the default to_csv settings).
exports = (
    (training_data, '../../res/ftr/encoded_zone_antiguedad_train.csv'),
    (evaluation_data, '../../res/ftr/encoded_zone_antiguedad_evaluation.csv'),
)
for frame, destination in exports:
    frame.to_csv(destination)

In [20]:
# TODO: the column dtypes still need to be cast