## Processamento dos dados

### Importando bibliotecas e carregando a base de dados

In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))
import setup

In [2]:
from processamento_dados import load_data, rename_columns, remove_columns, treats_null_values, create_column, save_dataframe_csv, encode_categorical

In [3]:
# Location of the raw dataset, built relative to the notebook's working directory.
PATH = os.path.join("..", "data")
FILE = "teste_indicium_precificacao.csv"
DATAFRAME_PATH = os.path.join(PATH, FILE)

# Load the CSV into `df` using the `load_data` helper from `processamento_dados`.
df = load_data(DATAFRAME_PATH)

In [4]:
# Preview the first rows to confirm the file loaded and to inspect the raw column names.
df.head()

Unnamed: 0,id,nome,host_id,host_name,bairro_group,bairro,latitude,longitude,room_type,price,minimo_noites,numero_de_reviews,ultima_review,reviews_por_mes,calculado_host_listings_count,disponibilidade_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
2,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
3,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
4,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [5]:
# Map the dataset's Portuguese column names to English equivalents so the rest
# of the notebook works with a single naming language. Columns that are not
# listed here (e.g. id, host_id, room_type, price) keep their original names.
standard_language_columns = {
    "nome": "name",
    "bairro_group": "neighborhood_group",
    "bairro": "neighborhood",
    "minimo_noites": "minimum_nights",
    "numero_de_reviews": "number_of_reviews",
    "ultima_review": "last_review",
    "reviews_por_mes": "reviews_per_month",
    "calculado_host_listings_count": "calculated_host_listings_count",
    "disponibilidade_365": "availability_365",
}
df = rename_columns(df, standard_language_columns)

In [6]:
# Confirm that every column now carries its English name.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighborhood_group',
       'neighborhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

### Iniciando o tratamento de dados

Removendo as colunas name e host_name, pois serão irrelevantes para o modelo

In [7]:
# Drop the listing title and the host name; they are not used as model features.
cols = ["name", "host_name"]
df = remove_columns(df, cols)

In [8]:
# Confirm that both columns are gone.
df.columns

Index(['id', 'host_id', 'neighborhood_group', 'neighborhood', 'latitude',
       'longitude', 'room_type', 'price', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

Criando uma série binária has_review para indicar se um imóvel tem ou não review

In [9]:
# Binary flag: 1 when the listing has a last_review value, 0 when it is missing.
# This must run before the nulls in last_review are filled further down;
# once they are replaced by a placeholder, every row would be flagged as 1.
data = df['last_review'].notna().astype(int)
name = 'has_review'
df = create_column(df, name, data)

In [None]:
# Check that has_review was added and agrees with the presence of last_review.
df.head()

Unnamed: 0,id,host_id,neighborhood_group,neighborhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,has_review
0,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,1
1,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365,0
2,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,1
3,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,1
4,5099,7322,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,1


Tratando valores nulos nos campos de reviews_per_month e last_review

In [11]:
# Replacement values for the columns that still contain nulls: a missing
# reviews_per_month becomes 0 and a missing last_review date becomes the
# "No review" marker.
# The mapping is named `null_fill_values` instead of `dict` so that the
# built-in `dict` type is not shadowed for the rest of the notebook session
# (a later `dict(...)` call would otherwise fail with "'dict' object is not callable").
null_fill_values = {
    "reviews_per_month": 0,
    "last_review": "No review",
}
df = treats_null_values(df, null_fill_values)

In [12]:
# Count the remaining missing values per column; every count should be 0.
df.isna().sum()

id                                0
host_id                           0
neighborhood_group                0
neighborhood                      0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
has_review                        0
dtype: int64

### Encoding de dados categóricos

Encode em neighborhood_group

In [13]:
# Add an integer-coded version of neighborhood_group (stored in the new
# neighborhood_group_encode column); the original text column is kept for now.
# NOTE(review): the codes shown in the outputs look alphabetical by label —
# confirm against encode_categorical in processamento_dados.
df = encode_categorical(df, 'neighborhood_group')

In [14]:
# Inspect the new neighborhood_group_encode column next to the original labels.
df.head()

Unnamed: 0,id,host_id,neighborhood_group,neighborhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,has_review,neighborhood_group_encode
0,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,1,2
1,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,No review,0.0,1,365,0,2
2,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,1,1
3,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,1,2
4,5099,7322,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,1,2


In [15]:
# Frequency of each original label, used to cross-check the encoded column.
df['neighborhood_group'].value_counts()

neighborhood_group
Manhattan        21661
Brooklyn         20103
Queens            5666
Bronx             1091
Staten Island      373
Name: count, dtype: int64

In [16]:
# Frequency of each code; the counts should match the per-label counts one to one.
df['neighborhood_group_encode'].value_counts()

neighborhood_group_encode
2    21661
1    20103
3     5666
0     1091
4      373
Name: count, dtype: int64

In [17]:
# Lookup table with one row per distinct (label, code) pair, in order of first appearance.
neighborhood_group_encodes = (
    df[['neighborhood_group', 'neighborhood_group_encode']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [18]:
# Display the label-to-code mapping for neighborhood_group.
neighborhood_group_encodes

Unnamed: 0,neighborhood_group,neighborhood_group_encode
0,Manhattan,2
1,Brooklyn,1
2,Queens,3
3,Staten Island,4
4,Bronx,0


In [19]:
# Persist the neighborhood_group mapping, presumably so the integer codes can be
# translated back to labels later.
# NOTE(review): the output directory and file extension are decided inside
# save_dataframe_csv — confirm where the file ends up.
name = 'neighborhood_group_labels'
save_dataframe_csv(neighborhood_group_encodes, name)

Encode em room_type

In [20]:
# Add an integer-coded version of room_type (stored in the new room_type_encode
# column); the original text column is kept for now.
df = encode_categorical(df, 'room_type')

In [21]:
# Inspect the new room_type_encode column next to the original labels.
df.head()

Unnamed: 0,id,host_id,neighborhood_group,neighborhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,has_review,neighborhood_group_encode,room_type_encode
0,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355,1,2,0
1,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,No review,0.0,1,365,0,2,1
2,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194,1,1,0
3,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0,1,2,0
4,5099,7322,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129,1,2,0


In [22]:
# Frequency of each original room type, used to cross-check the encoded column.
df['room_type'].value_counts()

room_type
Entire home/apt    25409
Private room       22325
Shared room         1160
Name: count, dtype: int64

In [23]:
# Frequency of each code; the counts should match the per-label counts one to one.
df['room_type_encode'].value_counts()

room_type_encode
0    25409
1    22325
2     1160
Name: count, dtype: int64

In [24]:
# Lookup table with one row per distinct (label, code) pair, in order of first appearance.
room_type_encodes = (
    df[['room_type', 'room_type_encode']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [25]:
# Display the label-to-code mapping for room_type.
room_type_encodes

Unnamed: 0,room_type,room_type_encode
0,Entire home/apt,0
1,Private room,1
2,Shared room,2


In [26]:
# Persist the room_type mapping, presumably so the integer codes can be
# translated back to labels later.
# NOTE(review): the output directory and file extension are decided inside
# save_dataframe_csv — confirm where the file ends up.
name = 'room_type_labels'
save_dataframe_csv(room_type_encodes, name)

Encode em neighborhood

In [27]:
# Add an integer-coded version of neighborhood (stored in the new
# neighborhood_encode column); the original text column is kept for now.
# NOTE(review): neighborhood has many distinct values, and plain integer codes
# impose an arbitrary order on them. That is fine for tree-based models but
# may mislead linear or distance-based ones — confirm it suits the model used downstream.
df = encode_categorical(df, 'neighborhood')

In [28]:
# Lookup table with one row per distinct (label, code) pair, in order of first appearance.
neighborhood_encodes = (
    df[['neighborhood', 'neighborhood_encode']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [29]:
# Persist the neighborhood mapping, presumably so the integer codes can be
# translated back to labels later.
# NOTE(review): the output directory and file extension are decided inside
# save_dataframe_csv — confirm where the file ends up.
name = 'neighborhood_labels'
save_dataframe_csv(neighborhood_encodes, name)

### Ajustes finais e salvando os dados processados

In [30]:
# Drop the columns that are no longer needed: the text categoricals (replaced by
# their *_encode counterparts), the id/host_id identifiers, and last_review
# (its presence is already captured by has_review).
cols = ["neighborhood_group", "room_type", "id", "host_id", "last_review", "neighborhood"]
df = remove_columns(df, cols)

In [31]:
# Row and column count before filtering on price.
df.shape

(48894, 12)

In [32]:
# Keep only listings with a strictly positive price, then report the remaining
# size so the number of discarded rows can be read off against the previous shape.
df = df.loc[df['price'].gt(0)]
df.shape

(48883, 12)

In [33]:
# Final look at the processed, fully numeric feature table before saving.
df.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,has_review,neighborhood_group_encode,room_type_encode,neighborhood_encode
0,40.75362,-73.98377,225,1,45,0.38,2,355,1,2,0,127
1,40.80902,-73.9419,150,3,0,0.0,1,365,0,2,1,94
2,40.68514,-73.95976,89,1,270,4.64,1,194,1,1,0,41
3,40.79851,-73.94399,80,10,9,0.1,1,0,1,2,0,61
4,40.74767,-73.975,200,3,74,0.59,1,129,1,2,0,137


In [34]:
# Save the processed dataset under a name distinct from the raw file.
# NOTE(review): the output directory, file extension and whether the index is
# written are decided inside save_dataframe_csv — confirm before consuming the file.
name = 'teste_indicium_precificacao_final'
save_dataframe_csv(df, name)