In [2]:
# Importando as bibliotecas

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

Lendo e analisando os dados

In [3]:
casas_df = pd.read_csv("housing.csv")
display(casas_df.head(5))
casas_df.info()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


# TRATANDO OS DADOS

In [4]:
# CRIANDO CATEGORIAS

# Criando coluna "Pessoas por casa"
casas_df["person per house"] = (casas_df["population"] / casas_df["households"]).round(3)
# Tratando alguns erros dessa coluna
casas_df.loc[casas_df["person per house"] > 14] = casas_df["person per house"].mean()

# Criando a coluna "Quartos por casa"
casas_df["bedrooms per house"] = (casas_df["total_bedrooms"] / casas_df["households"]).round(3)
# Tratando alguns problemas
casas_df = casas_df[casas_df["bedrooms per house"] <= 10]

# Criando a coluna "Cômodos por casa"
casas_df["rooms per house"] = (casas_df["total_rooms"] / casas_df["households"]).round(3)

# Tratado um erro da coluna "ocean_proximity"
casas_df = casas_df[casas_df["ocean_proximity"].isin(["<1H OCEAN", "INLAND", "ISLAND", "NEAR BAY", "NEAR OCEAN"])]

In [5]:
# Criando os grupos sociais para estretificar(dividido em classe baixa, classe média e classe alta)

# Organizando por ordem 
casas_organizadas_por_renda = casas_df["median_income"].sort_values().reset_index(drop=True)

# Definindo o índice do ultimo participante da classe baixa e media
renda_base = np.min(casas_df["median_income"])
um_porcento_da_renda = (np.max(casas_df["median_income"]) - np.min(casas_df["median_income"])) / 100

classe_baixa_limite = renda_base + 28 * um_porcento_da_renda # 4.559956
classe_media_limite = renda_base + 81 * um_porcento_da_renda # 12.245061999999999

indice_ultima_casa_classe_baixa = np.argmax(casas_organizadas_por_renda >= classe_baixa_limite).astype(int) + 1
indice_ultima_casa_classe_media = np.argmax(casas_organizadas_por_renda >= classe_media_limite).astype(int) + 1

casas_df["class"] = pd.cut(
    casas_df["median_income"],
    bins = [
        0,
        casas_organizadas_por_renda[indice_ultima_casa_classe_baixa],
        casas_organizadas_por_renda[indice_ultima_casa_classe_media],
        np.inf
        ],
    labels = ["Lower class", "Middle class", "Upper class"]
)

# Observando as correlações

In [6]:
display(casas_df[["longitude",
          "latitude",
          "housing_median_age", 
          "total_rooms", 
          "total_bedrooms", 
          "population", 
          "households", 
          "median_income", 
          "median_house_value", 
          "person per house", 
          "bedrooms per house", 
          "rooms per house"]
          ].corr().round(3)
)

valor_das_casas_de_acordo_com_a_proximidade_do_oceano = casas_df.groupby("ocean_proximity")["median_house_value"].mean().round(2).reset_index()
valor_das_casas_de_acordo_com_a_proximidade_do_oceano = valor_das_casas_de_acordo_com_a_proximidade_do_oceano.sort_values("median_house_value", ascending=False)
display(valor_das_casas_de_acordo_com_a_proximidade_do_oceano)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,person per house,bedrooms per house,rooms per house
longitude,1.0,-0.925,-0.109,0.045,0.069,0.101,0.057,-0.015,-0.045,0.157,0.02,-0.036
latitude,-0.925,1.0,0.012,-0.036,-0.067,-0.109,-0.071,-0.08,-0.145,-0.15,0.079,0.12
housing_median_age,-0.109,0.012,1.0,-0.361,-0.321,-0.298,-0.303,-0.119,0.106,-0.004,-0.11,-0.196
total_rooms,0.045,-0.036,-0.361,1.0,0.93,0.862,0.919,0.198,0.133,-0.109,0.048,0.178
total_bedrooms,0.069,-0.067,-0.321,0.93,1.0,0.883,0.98,-0.008,0.05,-0.144,0.071,0.004
population,0.101,-0.109,-0.298,0.862,0.883,1.0,0.912,0.004,-0.025,0.178,-0.075,-0.076
households,0.057,-0.071,-0.303,0.919,0.98,0.912,1.0,0.013,0.065,-0.126,-0.057,-0.086
median_income,-0.015,-0.08,-0.119,0.198,-0.008,0.004,0.013,1.0,0.689,-0.062,-0.087,0.432
median_house_value,-0.045,-0.145,0.106,0.133,0.05,-0.025,0.065,0.689,1.0,-0.25,-0.073,0.193
person per house,0.157,-0.15,-0.004,-0.109,-0.144,0.178,-0.126,-0.062,-0.25,1.0,-0.079,-0.052


Unnamed: 0,ocean_proximity,median_house_value
2,ISLAND,380440.0
3,NEAR BAY,259294.4
4,NEAR OCEAN,249193.98
0,<1H OCEAN,240263.71
1,INLAND,124720.9


# Resultado da análise:

Muitos dados não estão correlacionados com os preços das casas(o que é ruim)

Um dos principais features relacionados ao valor da casa é a media da renda, Pessoas que ganham mais tendem a viverem
em casas mais caras

observe a ordem de proximidade com o oceano:
 
 1 - ISLAND
 
 2 - NEAR OCEAN
 
 3 - NEAR BAY	

 4 - <1H OCEAN	

 5 - INLAND	

Oberva-se que, no geral, o preço das casas tende a aumentar de acordo com a proximidade do Oceano

In [None]:
# Separando os dados treino e teste

divisao = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for treino_index, teste_index in divisao.split(casas_df, casas_df["class"]):
    dados_treino = casas_df.iloc[treino_index] # 16.324 itens
    dados_teste  = casas_df.iloc[teste_index ] #  4.081 itens