### Introdução Machine Learning

In [1]:
# importar bibliotecas

import pandas as pd
import numpy as np
import seaborn
import sidetable as stb
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#### Carregando os dados

In [2]:
houses = pd.read_csv('houses_sp.csv')
houses.head()

Unnamed: 0,address,district,area,bedrooms,garage,type,rent,total
0,Rua Herval,Belenzinho,21,1,0,Studio e kitnet,2400,2939
1,Avenida São Miguel,Vila Marieta,15,1,1,Studio e kitnet,1030,1345
2,Rua Oscar Freire,Pinheiros,18,1,0,Apartamento,4000,4661
3,Rua Júlio Sayago,Vila Ré,56,2,2,Casa em condomínio,1750,1954
4,Rua Barata Ribeiro,Bela Vista,19,1,0,Studio e kitnet,4000,4654


In [3]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11657 entries, 0 to 11656
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   address   11657 non-null  object
 1   district  11657 non-null  object
 2   area      11657 non-null  int64 
 3   bedrooms  11657 non-null  int64 
 4   garage    11657 non-null  int64 
 5   type      11657 non-null  object
 6   rent      11657 non-null  int64 
 7   total     11657 non-null  int64 
dtypes: int64(5), object(3)
memory usage: 728.7+ KB


In [4]:
houses.shape

(11657, 8)

####  Removendo dados não importantes

In [5]:
houses = houses.drop(['address', 'district'], axis = 1)

In [6]:
houses.head()

Unnamed: 0,area,bedrooms,garage,type,rent,total
0,21,1,0,Studio e kitnet,2400,2939
1,15,1,1,Studio e kitnet,1030,1345
2,18,1,0,Apartamento,4000,4661
3,56,2,2,Casa em condomínio,1750,1954
4,19,1,0,Studio e kitnet,4000,4654


#### Label Enconder

In [7]:
label_encoder = LabelEncoder()

houses['type'] = label_encoder.fit_transform(houses['type'])

In [8]:
houses.head()

Unnamed: 0,area,bedrooms,garage,type,rent,total
0,21,1,0,3,2400,2939
1,15,1,1,3,1030,1345
2,18,1,0,0,4000,4661
3,56,2,2,2,1750,1954
4,19,1,0,3,4000,4654


#### Dividindo em Treino e Teste

In [9]:
X = houses.drop(['rent', 'total'], axis = 1)
y = houses['rent']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8159, 4), (3498, 4), (8159,), (3498,))

In [12]:
cat_features = ['type']
numeric_features = ['area', 'bedrooms', 'garage']

#### Método Sem Pipeline e ColumnsTransformer

In [13]:
one_hot_encoder = OneHotEncoder()
one_hot_encoder

In [14]:
X_train_cat = one_hot_encoder.fit_transform(X_train[cat_features])
X_test_cat = one_hot_encoder.transform(X_test[cat_features])

In [15]:
X_train_processed = pd.concat([pd.DataFrame(X_train_cat.toarray()), X_train[numeric_features].reset_index()], axis = 1)
X_test_processed = pd.concat([pd.DataFrame(X_test_cat.toarray()), X_test[numeric_features].reset_index()], axis = 1)

In [16]:
X_train_processed

Unnamed: 0,0,1,2,3,index,area,bedrooms,garage
0,0.0,0.0,1.0,0.0,9315,92,2,2
1,0.0,0.0,1.0,0.0,7425,130,2,1
2,1.0,0.0,0.0,0.0,3326,72,3,1
3,1.0,0.0,0.0,0.0,7950,64,2,1
4,1.0,0.0,0.0,0.0,6542,59,2,1
...,...,...,...,...,...,...,...,...
8154,1.0,0.0,0.0,0.0,4859,144,3,2
8155,1.0,0.0,0.0,0.0,3264,42,2,0
8156,0.0,0.0,0.0,1.0,9845,18,1,0
8157,0.0,1.0,0.0,0.0,10799,160,2,3


In [17]:
X_test_processed

Unnamed: 0,0,1,2,3,index,area,bedrooms,garage
0,1.0,0.0,0.0,0.0,10750,70,2,1
1,1.0,0.0,0.0,0.0,9613,56,2,2
2,0.0,1.0,0.0,0.0,11032,129,3,4
3,0.0,1.0,0.0,0.0,5792,125,2,3
4,0.0,1.0,0.0,0.0,4231,100,2,2
...,...,...,...,...,...,...,...,...
3493,0.0,0.0,1.0,0.0,987,35,1,0
3494,1.0,0.0,0.0,0.0,9537,83,2,1
3495,1.0,0.0,0.0,0.0,4488,37,1,0
3496,0.0,0.0,0.0,1.0,2570,33,2,0


#### Modelo Preditivo

In [18]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [19]:
xgb = XGBRegressor()
xgb

In [20]:
# Treino o modelo no dataset de treino 

xgb.fit(X_train_processed, y_train)

In [21]:
# aplico o treino no teste

pred = xgb.predict(X_test_processed)

In [22]:
mse = mean_squared_error(y_test, pred)
print(mse)

2980708.322807289


#### Como seria com o Column Transformer?

In [23]:
from sklearn.compose import ColumnTransformer

In [24]:
cat_features = ['type']
numeric_features = ['area', 'bedrooms', 'garage']

transformations = [
    ('ohe', OneHotEncoder(drop = 'first'), cat_features),
    ('scaler', StandardScaler(), numeric_features)
]

In [27]:
preprocessor = ColumnTransformer(transformers = transformations)

In [29]:
X_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed = preprocessor.transform(X_test)

In [30]:
xgb.fit(X_train_transformed, y_train)

In [32]:
predict = xgb.predict(X_test_transformed)
mse = mean_squared_error(y_test, predict)
print(f'Mean Squared Error : {mse}')

Mean Squared Error : 3420449.671622455
