## Importing Libraries

In [1]:
import pandas as pd
import numpy as np

## Loading DataSet

In [2]:
# Load the raw OLX housing listings; the path is relative to the notebook's working directory.
df = pd.read_csv('../DataSets/OLX_Housing.csv')

In [3]:
df.head()

Unnamed: 0,Title,Status,# Rooms,Area,# Car Spots,Condominium Fee,City,Neighborhood,Price,Description,URL
0,House_AD,For rent,3.0,86m²,2.0,R$ 0,Recife,Boa Viagem,R$ 4.000.,"House in Boa Viagem, Recife",https://pe.olx.com.br/grande-recife/imoveis/ap...
1,House_AD,,,600m²,,R$ 0,Camaragibe,Aldeia dos Camarás,R$ 220.000.,"House in Aldeia dos Camarás, Camaragibe",https://pe.olx.com.br/grande-recife/terrenos/t...
2,House_AD,For sale,3.0,97m²,2.0,,Recife,Dois Irmãos,R$ 777.000.,"House in Dois Irmãos, Recife",https://pe.olx.com.br/grande-recife/imoveis/vm...
3,House_AD,For sale,4.0,134m²,3.0,R$ 655,Jaboatão dos Guararapes,Candeias,R$ 940.000.,"House in Candeias, Jaboatão dos Guararapes",https://pe.olx.com.br/grande-recife/imoveis/ap...
4,House_AD,For sale,4.0,124m²,2.0,R$ 0,Recife,Graças,R$ 1.080.980.,"House in Graças, Recife",https://pe.olx.com.br/grande-recife/imoveis/ap...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1734 entries, 0 to 1733
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Title            1734 non-null   object 
 1   Status           1530 non-null   object 
 2   # Rooms          1356 non-null   float64
 3   Area             1677 non-null   object 
 4   # Car Spots      641 non-null    float64
 5   Condominium Fee  1378 non-null   object 
 6   City             1734 non-null   object 
 7   Neighborhood     1734 non-null   object 
 8   Price            1734 non-null   object 
 9   Description      1734 non-null   object 
 10  URL              1734 non-null   object 
dtypes: float64(2), object(9)
memory usage: 162.6+ KB


# Discarding irrelevant data

For this analysis, the `Title`, `Description` and `URL` columns are not useful, so we drop them.

In [5]:
# Title, Description and URL are removed; every other column is kept unchanged
df = df.drop(['Title', 'Description', 'URL'], axis='columns')

# Adapting string columns

In [6]:
# very useful :)
def isnan(val):
    """Return True when ``val`` is NaN, False otherwise.

    NaN compares unequal to itself, so the self-inequality check detects it
    without calling a numeric-only function; strings and other non-numeric
    values therefore simply yield False instead of raising.
    """
    # bool(...) makes the non-NaN case an explicit False rather than an
    # implicit None from falling off the end of the function.
    return bool(val != val)

In [7]:
# needed: str.isdigit() reports '²' as a digit, but int() cannot parse it
def remove_sqrd_symbol(text):
    """Return ``text`` with every '²' character removed; NaN input yields ``np.nan``."""
    if not isnan(text):
        return text.replace('²','')
    return np.nan

#### Since every numeric value in the dataset is an integer, a single digit-extraction function can be reused for all of these columns

In [8]:
# gets text as input and returns an int composed by all digits found in order
def extract_numbers(text):
    """Concatenate the decimal digits found in ``text`` and return them as an int.

    Parameters
    ----------
    text : str or missing value
        Raw cell content such as ``'R$ 4.000.'`` or ``'86m²'``.

    Returns
    -------
    int or float
        The integer built from the digits in their original order, or
        ``np.nan`` when ``text`` is not a string (NaN, None, numbers, ...) or
        contains no digits at all.
    """
    # Missing values and any other non-string input carry no digits to extract.
    if not isinstance(text, str):
        return np.nan

    # str.isdecimal() instead of str.isdigit(): isdigit() is also True for
    # characters such as '²', which int() cannot parse.
    digits = ''.join(ch for ch in text if ch.isdecimal())

    try:
        return int(digits)
    except ValueError:
        # Raised for an empty digit string, i.e. the text held no number.
        return np.nan

#### Converting those columns to numeric values

In [9]:
# '²' is stripped first because str.isdigit() accepts it but int() cannot parse it;
# the remaining digits become the numeric area (e.g. '86m²' -> 86).
df['Area'] = df['Area'].apply(remove_sqrd_symbol).apply(extract_numbers)

In [10]:
# e.g. 'R$ 655' -> 655; missing fees stay NaN
df['Condominium Fee'] = df['Condominium Fee'].apply(extract_numbers)

In [11]:
# e.g. 'R$ 4.000.' -> 4000 (the dots are discarded along with every other non-digit character)
df['Price'] = df['Price'].apply(extract_numbers)

#### Renaming columns to represent the unit in which they are registered

In [12]:
# Append the unit of measure to the name of each converted column
df = df.rename({'Area':'Area (m²)',
                'Condominium Fee':'Condominium Fee (BRL)',
                'Price':'Price (BRL)'},
               axis='columns')

In [13]:
df.columns

Index(['Status', '# Rooms', 'Area (m²)', '# Car Spots',
       'Condominium Fee (BRL)', 'City', 'Neighborhood', 'Price (BRL)'],
      dtype='object')

### Using the dataset in this state for the EDA will bring us more meaningful plots

In [14]:
df.head()

Unnamed: 0,Status,# Rooms,Area (m²),# Car Spots,Condominium Fee (BRL),City,Neighborhood,Price (BRL)
0,For rent,3.0,86.0,2.0,0.0,Recife,Boa Viagem,4000.0
1,,,600.0,,0.0,Camaragibe,Aldeia dos Camarás,220000.0
2,For sale,3.0,97.0,2.0,,Recife,Dois Irmãos,777000.0
3,For sale,4.0,134.0,3.0,655.0,Jaboatão dos Guararapes,Candeias,940000.0
4,For sale,4.0,124.0,2.0,0.0,Recife,Graças,1080980.0


In [15]:
# Save the cleaned frame for the EDA step.
# index_label=False writes the header row without a field for the index column.
df.to_csv('../DataSets/EDA_OLX_housing.csv',index_label=False)

---------------------------------------

# After EDA

To feed the Machine Learning model with adequate data, we need a few extra pre-processing steps.

In [16]:
# NOTE(review): this file is not written anywhere in the part of the notebook visible here
# (the export cell above writes 'EDA_OLX_housing.csv'); presumably it is produced by the
# separate EDA/profiling step — confirm before running top to bottom.
df=pd.read_csv('../DataSets/EDA_OLX_housing_AfterProf.csv')

#### Converting string columns to int, so they can be used by ML algorithms

In [17]:
# this function takes a dataset as input and creates a dictionary to convert all string columns to int
# there are better ways to do that if you want to derive information from the ordering of the numbers, but, in this case
# we will stick to a simpler approach
def create_dictioraries(df):
    """Build a ``{label: integer code}`` mapping for every string column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose string-valued columns should be encoded. It is not modified.

    Returns
    -------
    dict
        ``{column name: {label: code}}``. Codes are assigned 0, 1, 2, ... in
        order of first appearance within the column, so they carry no ordinal
        meaning. Every mapping also contains ``np.nan -> np.nan`` so missing
        values are left untouched when the mapping is applied. Non-string
        columns are omitted.
    """
    conversion_dictionaries = {}

    for col in df.columns:
        dtype = df[col].dtype

        # Accept plain object columns as well as pandas' dedicated string
        # dtype; comparing against 'object' alone would skip the latter.
        if not (pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)):
            continue

        col_dict = {np.nan: np.nan}

        # dropna() discards missing values (NaN/None); unique() keeps the
        # remaining labels in order of first appearance.
        labels = df[col].dropna().unique()
        col_dict.update((label, code) for code, label in enumerate(labels))

        conversion_dictionaries[col] = col_dict

    return conversion_dictionaries

In [18]:
# One {label: integer code} mapping per string column of the reloaded frame
conv_dicts = create_dictioraries(df)

In [19]:
# applying the conversion: each string column's labels are swapped for their integer codes
for col, mapping in conv_dicts.items():
    df[col] = df[col].replace(mapping)

In [20]:
# deleting price per meter, since we are predicting price
# NOTE(review): presumably this column was derived from the price during the EDA step, so
# keeping it would leak the target into the features — confirm.
del df['Price per m²']

In [21]:
df.head()

Unnamed: 0,# Rooms,Area (m²),# Car Spots,Condominium Fee (BRL),City,Neighborhood,Price (BRL)
0,4.0,134.0,3.0,655.0,0,0,940000.0
1,4.0,124.0,2.0,0.0,1,1,1080980.0
2,3.0,100.0,0.0,670.0,0,2,340000.0
3,,34.0,0.0,0.0,2,3,199000.0
4,2.0,50.0,0.0,435.0,1,4,350000.0


### We will feed Machine Learning models with this version of the DataSet

In [22]:
# Save the fully numeric frame for the modelling step.
# index_label=False writes the header row without a field for the index column.
df.to_csv('../DataSets/MLproc_OLX_housing.csv',index_label=False)