In [1]:
## Importação dos pacotes
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import random

In [2]:
## Leitura da base de dados housing (pública)
housing_dataset = pd.read_csv('housing.csv')
housing_dataset.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Aplicação em uma amostra de 100 linhas
variaveis = ['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households']
inputNumbers =range(0,len(housing_dataset['longitude'])+1)
indices = random.sample(inputNumbers, 100)
data = housing_dataset[housing_dataset.index.isin(indices)][variaveis].copy()

In [4]:
## Aplicação do modelo de clusterização
kmeans = KMeans(n_clusters=5, random_state=0, n_init='auto').fit(data)

In [5]:
## Geração da base com a variável gerada
data['cluster'] = kmeans.labels_
data = data.sort_values(by='cluster')
data

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,cluster
5,52.0,919.0,213.0,413.0,193.0,0
6149,26.0,1399.0,277.0,1285.0,276.0,0
6280,35.0,1341.0,233.0,898.0,216.0,0
7146,45.0,1307.0,283.0,967.0,254.0,0
7383,39.0,1346.0,380.0,1520.0,356.0,0
...,...,...,...,...,...,...
5165,39.0,1701.0,428.0,1468.0,411.0,4
11426,17.0,1997.0,340.0,952.0,341.0,4
11820,18.0,1931.0,380.0,1271.0,377.0,4
14037,25.0,1908.0,513.0,956.0,467.0,4


In [6]:
centroides = kmeans.cluster_centers_
centroides = pd.DataFrame(centroides, columns = variaveis)
centroides = centroides.reset_index()
centroides = centroides.rename(columns = {'index': 'cluster'})
centroides

Unnamed: 0,cluster,housing_median_age,total_rooms,total_bedrooms,population,households
0,0,33.357143,1173.642857,247.785714,780.071429,237.607143
1,1,15.454545,6158.272727,1176.545455,3015.181818,1095.454545
2,2,29.4375,3545.1875,616.0,1646.4375,581.625
3,3,31.444444,2222.666667,612.666667,2336.888889,559.555556
4,4,33.222222,2179.138889,451.277778,1080.583333,411.75


In [7]:
# Aplicação em uma amostra de 100 linhas
variaveis = ['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_house_value']
inputNumbers =range(0,len(housing_dataset['longitude'])+1)
indices = random.sample(inputNumbers, 100)
data = housing_dataset[housing_dataset.index.isin(indices)][variaveis].copy()
data

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_house_value
0,41.0,880.0,129.0,322.0,126.0,452600.0
47,43.0,1007.0,312.0,558.0,253.0,137500.0
49,40.0,946.0,375.0,700.0,352.0,112500.0
457,42.0,1756.0,465.0,2184.0,422.0,371400.0
558,52.0,1049.0,185.0,374.0,176.0,248500.0
...,...,...,...,...,...,...
19728,32.0,1650.0,313.0,802.0,284.0,98200.0
19761,24.0,1631.0,340.0,1042.0,333.0,59000.0
19768,27.0,852.0,176.0,464.0,148.0,58200.0
20469,23.0,4550.0,762.0,2301.0,744.0,205300.0


In [8]:
## Calcular a variável dependente classe de preço: baixo, médio e alto
media = data['median_house_value'].mean()
print(media)
erro = data['median_house_value'].std() + np.sqrt(len(data['median_house_value']))
print(erro)
data['classe'] = data['median_house_value'].apply(
    lambda x : 'abaixo' if x < (media - erro) else (
    'acima' if x > (media + erro) else 'media'
)
)
data

209288.9292929293
121033.37413218484


Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_house_value,classe
0,41.0,880.0,129.0,322.0,126.0,452600.0,acima
47,43.0,1007.0,312.0,558.0,253.0,137500.0,media
49,40.0,946.0,375.0,700.0,352.0,112500.0,media
457,42.0,1756.0,465.0,2184.0,422.0,371400.0,acima
558,52.0,1049.0,185.0,374.0,176.0,248500.0,media
...,...,...,...,...,...,...,...
19728,32.0,1650.0,313.0,802.0,284.0,98200.0,media
19761,24.0,1631.0,340.0,1042.0,333.0,59000.0,abaixo
19768,27.0,852.0,176.0,464.0,148.0,58200.0,abaixo
20469,23.0,4550.0,762.0,2301.0,744.0,205300.0,media


In [9]:
data.groupby('classe').mean()

Unnamed: 0_level_0,housing_median_age,total_rooms,total_bedrooms,population,households,median_house_value
classe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abaixo,28.333333,1943.266667,392.8,969.466667,366.533333,68306.666667
acima,29.722222,3091.944444,588.444444,1314.888889,530.222222,412250.222222
media,31.818182,2435.075758,537.590909,1451.757576,500.515152,185977.272727


In [10]:
data.groupby('classe')['population'].count()

classe
abaixo    15
acima     18
media     66
Name: population, dtype: int64

In [11]:
## Aplicação do modelo de regressão logística com a variável dependente classe

from sklearn.linear_model import LogisticRegression
variaveis = ['housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
'households', 'classe']
data = data[variaveis].copy().reset_index()
X_Train = data.drop(columns=['classe'], axis=1)
X_Test = data.drop(columns=['classe'], axis=1)
y_Train = data['classe']
y_Test = data['classe']
sc_x = StandardScaler()
X_Train = sc_x.fit_transform(X_Train)
X_Test = sc_x.fit_transform(X_Test)
logreg = LogisticRegression(solver="lbfgs", max_iter=500)
logreg.fit(X_Train, y_Train)

In [12]:
# Aplicação do modelo na base de predição
pred_logreg = logreg.predict(X_Test)
pred_logreg

array(['media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'acima', 'media', 'media', 'acima', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'acima', 'media', 'media', 'media',
       'media', 'media', 'abaixo', 'media', 'acima', 'abaixo', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'acima', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media', 'media', 'media', 'media', 'media', 'media', 'media',
       'media'], d

In [13]:
# Aplicação na base de predição com a geração das probabilidades de cada classe
pred_proba = logreg.predict_proba(X_Test)
pred_proba

array([[0.22817101, 0.26555169, 0.5062773 ],
       [0.16832823, 0.19075096, 0.64092081],
       [0.16088884, 0.1324776 , 0.70663355],
       [0.02738735, 0.03351802, 0.93909463],
       [0.15536281, 0.29725664, 0.54738055],
       [0.06718311, 0.26697993, 0.66583696],
       [0.17137804, 0.32438596, 0.50423601],
       [0.11625959, 0.28191801, 0.6018224 ],
       [0.32922207, 0.1910127 , 0.47976523],
       [0.16852429, 0.21264021, 0.6188355 ],
       [0.26788989, 0.25230071, 0.47980941],
       [0.1669944 , 0.24291524, 0.59009036],
       [0.12990348, 0.14329059, 0.72680593],
       [0.28949928, 0.15594687, 0.55455386],
       [0.14755684, 0.22344917, 0.62899399],
       [0.20268336, 0.22688859, 0.57042805],
       [0.17421692, 0.42438781, 0.40139527],
       [0.18446977, 0.21897577, 0.59655446],
       [0.18988148, 0.25614009, 0.55397844],
       [0.08025279, 0.51506415, 0.40468306],
       [0.14413008, 0.14369689, 0.71217303],
       [0.1874635 , 0.28126752, 0.53126898],
       [0.

In [14]:
## Geração da base com a variável gerada
data["previsao"] = pred_logreg
data

Unnamed: 0,index,housing_median_age,total_rooms,total_bedrooms,population,households,classe,previsao
0,0,41.0,880.0,129.0,322.0,126.0,acima,media
1,47,43.0,1007.0,312.0,558.0,253.0,media,media
2,49,40.0,946.0,375.0,700.0,352.0,media,media
3,457,42.0,1756.0,465.0,2184.0,422.0,acima,media
4,558,52.0,1049.0,185.0,374.0,176.0,media,media
...,...,...,...,...,...,...,...,...
94,19728,32.0,1650.0,313.0,802.0,284.0,media,media
95,19761,24.0,1631.0,340.0,1042.0,333.0,abaixo,media
96,19768,27.0,852.0,176.0,464.0,148.0,abaixo,media
97,20469,23.0,4550.0,762.0,2301.0,744.0,media,media


In [15]:
## Geração da base com as variáveis de probabilidades de cada classe
lista_proba = pred_proba.tolist()
lista_proba = pd.DataFrame(
    lista_proba,
    columns = ['abaixo', 'acima', 'media']
)
data = pd.merge(data, lista_proba, left_index=True, right_index=True)
data

Unnamed: 0,index,housing_median_age,total_rooms,total_bedrooms,population,households,classe,previsao,abaixo,acima,media
0,0,41.0,880.0,129.0,322.0,126.0,acima,media,0.228171,0.265552,0.506277
1,47,43.0,1007.0,312.0,558.0,253.0,media,media,0.168328,0.190751,0.640921
2,49,40.0,946.0,375.0,700.0,352.0,media,media,0.160889,0.132478,0.706634
3,457,42.0,1756.0,465.0,2184.0,422.0,acima,media,0.027387,0.033518,0.939095
4,558,52.0,1049.0,185.0,374.0,176.0,media,media,0.155363,0.297257,0.547381
...,...,...,...,...,...,...,...,...,...,...,...
94,19728,32.0,1650.0,313.0,802.0,284.0,media,media,0.202692,0.150616,0.646692
95,19761,24.0,1631.0,340.0,1042.0,333.0,abaixo,media,0.203399,0.097252,0.699349
96,19768,27.0,852.0,176.0,464.0,148.0,abaixo,media,0.293776,0.117527,0.588697
97,20469,23.0,4550.0,762.0,2301.0,744.0,media,media,0.080128,0.184630,0.735242
