In [62]:
import pandas as pd
import os

In [29]:
df = pd.read_csv("../raw_data/ar_properties.csv")

In [30]:
df.columns

Index(['id', 'ad_type', 'start_date', 'end_date', 'created_on', 'lat', 'lon',
       'l1', 'l2', 'l3', 'l4', 'l5', 'l6', 'rooms', 'bedrooms', 'bathrooms',
       'surface_total', 'surface_covered', 'price', 'currency', 'price_period',
       'title', 'description', 'property_type', 'operation_type'],
      dtype='object')

In [31]:
df = df.rename(columns={
    "lat":"latitude",
    "lon":"longitude",
    "l1":"country", 
    "l2":"province", 
    "l3":"city",
    "l4":"district",
    "l5":"estate"
})

In [32]:
df.columns

Index(['id', 'ad_type', 'start_date', 'end_date', 'created_on', 'latitude',
       'longitude', 'country', 'province', 'city', 'district', 'estate', 'l6',
       'rooms', 'bedrooms', 'bathrooms', 'surface_total', 'surface_covered',
       'price', 'currency', 'price_period', 'title', 'description',
       'property_type', 'operation_type'],
      dtype='object')

In [33]:
df.l6.value_counts()

Series([], Name: count, dtype: int64)

In [34]:
df = df.drop(columns=["l6"])

In [38]:
df.ad_type.value_counts()

AttributeError: 'DataFrame' object has no attribute 'ad_type'

In [37]:
df = df.drop(columns=["ad_type"])

KeyError: "['ad_type'] not found in axis"

In [39]:
df.operation_type.value_counts()

operation_type
Venta                782122
Alquiler             183927
Alquiler temporal     33951
Name: count, dtype: int64

In [40]:
df["operation_type"] = df["operation_type"].replace(regex={
    "Venta":"Sale",
    "Alquiler temporal":"Temporary Rent",
    "Alquiler":"Rent"})

In [41]:
df.operation_type.value_counts()

operation_type
Sale              782122
Rent              183927
Temporary Rent     33951
Name: count, dtype: int64

In [42]:
df.property_type.value_counts()

property_type
Departamento       373376
Otro               239720
Casa               190023
Lote                83792
PH                  35217
Local comercial     34493
Oficina             26365
Cochera              8908
Depósito             6502
Casa de campo        1604
Name: count, dtype: int64

In [43]:
df["property_type"] = df["property_type"].replace(regex={
    "Departamento":"Apartment",
    "Local comercial":"Commercial Premises",
    "Oficina":"Office",
    "Depósito":"Warehouse",
    "Otro": "Other",
    "Casa de campo":"Village House",
    "Casa":"House",
    "Cochera":"Garage",
    "Lote":"Terrain",
    "PH": "Horizontal Property"
    })

In [44]:
df.property_type.value_counts()

property_type
Apartment              373376
Other                  239720
House                  190023
Terrain                 83792
Horizontal Property     35217
Commercial Premises     34493
Office                  26365
Garage                   8908
Warehouse                6502
Village House            1604
Name: count, dtype: int64

In [45]:
cond_rent = df["operation_type"] == "Rent"
cond_sale = df["operation_type"] == "Sale"
cond_curr = df["currency"] == "USD"
df[cond_sale & cond_curr].price.mean(), \
    df[cond_rent & cond_curr].price.mean()

(231140.34107096842, 7379.891555198346)

In [46]:
df.price_period.value_counts()

price_period
Mensual    362978
Semanal        29
Diario          8
Name: count, dtype: int64

In [47]:
df["price_period"] = df["price_period"].replace(regex={
    "Mensual":"Monthly",
    "Semanal":"Weekly",
    "Diario":"Daily"})

In [48]:
df.price_period.value_counts()

price_period
Monthly    362978
Weekly         29
Daily           8
Name: count, dtype: int64

In [49]:
df.head()

Unnamed: 0,id,start_date,end_date,created_on,latitude,longitude,country,province,city,district,...,bathrooms,surface_total,surface_covered,price,currency,price_period,title,description,property_type,operation_type
0,wdQ5hWhv8P14T7Sh9g4QCg==,2020-12-25,9999-12-31,2020-12-25,-32.716652,-68.642692,Argentina,Mendoza,,,...,,350.0,350.0,,,,Excelentes Lotes Sobre Ruta 34,Corredor Responsable: VICTOR E. MONTIVERO - C....,Terrain,Sale
1,nnMBYZ4RMRY+vm753EtA+g==,2020-12-25,9999-12-31,2020-12-25,-24.797723,-65.467514,Argentina,Salta,,,...,,1541.0,1541.0,,,Monthly,TERRENO + VENTA + JARDINES DE SAN LORENZO +150...,Corredor Responsable: Pablo Castañeda - C.U.C....,Terrain,Sale
2,+dnVA1K6JxzL1zAjOEQ1pA==,2020-12-25,2020-12-29,2020-12-25,-34.919373,-58.020591,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,La Plata,...,,1000.0,1000.0,,,Monthly,Lote en Venta de 1000 m2 en La Plata,Corredor Responsable: Rico Sebastián - Martill...,Terrain,Sale
3,dLHXKN5/sRZpm9Yk0yI2nA==,2020-12-25,2020-12-29,2020-12-25,-34.919455,-58.024807,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,La Plata,...,,1000.0,1000.0,,,Monthly,Lote en Venta de 1000 m2 en La Plata,Corredor Responsable: Rico Sebastián - Martill...,Terrain,Sale
4,wtw/k887EPipd37UYHKb1Q==,2020-12-25,9999-12-31,2020-12-25,-34.364924,-58.783143,Argentina,Bs.As. G.B.A. Zona Norte,Escobar,Belén de Escobar,...,,18164.0,18164.0,,,Monthly,PANAMERICANA 47300,Nave principal 66 x 90 m: 6005 m2 cubiertos...,Other,Sale


In [50]:
df.country.value_counts()

country
Argentina         985725
Uruguay            13471
Estados Unidos       705
Brasil                99
Name: count, dtype: int64

In [51]:
columns_to_drop = df[df["country"] == "Estados Unidos"]
df = df.drop(columns_to_drop.index)

In [53]:
df.country.value_counts()

country
Argentina    985725
Uruguay       13471
Brasil           99
Name: count, dtype: int64

In [54]:
df.dtypes

id                  object
start_date          object
end_date            object
created_on          object
latitude           float64
longitude          float64
country             object
province            object
city                object
district            object
estate              object
rooms              float64
bedrooms           float64
bathrooms          float64
surface_total      float64
surface_covered    float64
price              float64
currency            object
price_period        object
title               object
description         object
property_type       object
operation_type      object
dtype: object

In [22]:
# df["start_date"] = df["start_date"].astype("datetime64[s]")
# df["end_date"] = df["end_date"].astype("datetime64[s]")
# df["created_on"] = df["created_on"].astype("datetime64[s]")
# df["country"] = df["country"].astype("string")
# df["province"] = df["province"].astype("string")
# df["city"] = df["city"].astype("string")
# df["country"] = df["country"].astype("string")
# df["district"] = df["district"].astype("string")
# df["estate"] = df["estate"].astype("string")
# df["currency"] = df["currency"].astype("string")
# df["price_period"] = df["price_period"].astype("string")
# df["title"] = df["title"].astype("string")
# df["description"] = df["description"].astype("string")
# df["property_type"] = df["property_type"].astype("string")
# df["operation_type"] = df["operation_type"].astype("string")

In [55]:
df.dtypes

id                  object
start_date          object
end_date            object
created_on          object
latitude           float64
longitude          float64
country             object
province            object
city                object
district            object
estate              object
rooms              float64
bedrooms           float64
bathrooms          float64
surface_total      float64
surface_covered    float64
price              float64
currency            object
price_period        object
title               object
description         object
property_type       object
operation_type      object
dtype: object

In [56]:
df.head()

Unnamed: 0,id,start_date,end_date,created_on,latitude,longitude,country,province,city,district,...,bathrooms,surface_total,surface_covered,price,currency,price_period,title,description,property_type,operation_type
0,wdQ5hWhv8P14T7Sh9g4QCg==,2020-12-25,9999-12-31,2020-12-25,-32.716652,-68.642692,Argentina,Mendoza,,,...,,350.0,350.0,,,,Excelentes Lotes Sobre Ruta 34,Corredor Responsable: VICTOR E. MONTIVERO - C....,Terrain,Sale
1,nnMBYZ4RMRY+vm753EtA+g==,2020-12-25,9999-12-31,2020-12-25,-24.797723,-65.467514,Argentina,Salta,,,...,,1541.0,1541.0,,,Monthly,TERRENO + VENTA + JARDINES DE SAN LORENZO +150...,Corredor Responsable: Pablo Castañeda - C.U.C....,Terrain,Sale
2,+dnVA1K6JxzL1zAjOEQ1pA==,2020-12-25,2020-12-29,2020-12-25,-34.919373,-58.020591,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,La Plata,...,,1000.0,1000.0,,,Monthly,Lote en Venta de 1000 m2 en La Plata,Corredor Responsable: Rico Sebastián - Martill...,Terrain,Sale
3,dLHXKN5/sRZpm9Yk0yI2nA==,2020-12-25,2020-12-29,2020-12-25,-34.919455,-58.024807,Argentina,Bs.As. G.B.A. Zona Sur,La Plata,La Plata,...,,1000.0,1000.0,,,Monthly,Lote en Venta de 1000 m2 en La Plata,Corredor Responsable: Rico Sebastián - Martill...,Terrain,Sale
4,wtw/k887EPipd37UYHKb1Q==,2020-12-25,9999-12-31,2020-12-25,-34.364924,-58.783143,Argentina,Bs.As. G.B.A. Zona Norte,Escobar,Belén de Escobar,...,,18164.0,18164.0,,,Monthly,PANAMERICANA 47300,Nave principal 66 x 90 m: 6005 m2 cubiertos...,Other,Sale


In [64]:
if not os.path.exists("csv"):
    os.mkdir("csv")

In [61]:
for _, i in enumerate(range(0,1000001,5000)):
    df[i:i+5000].to_csv(f"csv/preprocessed_{_}.csv")