In [15]:
# Imports
import math
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import linear_model
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")
%matplotlib inline 

In [2]:
# Load the Boston housing data into a DataFrame: one column per feature, with the
# regression target appended as the last column.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell needs scikit-learn < 1.2.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(target=boston.target)

In [3]:
# Dataset shape: rows are observations, columns are variables (target column included)
print("Boston housing dataset tem {} observações com {} variáveis cada uma.".format(df.shape[0], df.shape[1]))

Boston housing dataset tem 506 observações com 14 variáveis cada uma.


In [4]:
# Preview the first five rows to sanity-check column names and value ranges
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [5]:
# Number of observations (rows) and the predictor names (every column except the
# last one, which holds the target)
observations = df.shape[0]
variables = df.columns[: df.shape[1] - 1]

In [6]:
# Predictor matrix: every column except the last one (the target)
X = df[df.columns[:-1]]
# Response vector as a plain NumPy array
y = df['target'].to_numpy()

### Analisando os Atributos com StatsModels

In [7]:
# statsmodels does not add an intercept on its own, so prepend a constant column first
Xc = sm.add_constant(X)
# Ordinary least squares of the target on all predictors plus the intercept
modelo = sm.OLS(endog=y, exog=Xc)
modelo_v1 = modelo.fit()
# Last expression of the cell: renders the full regression summary table
modelo_v1.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Mon, 12 Jun 2023",Prob (F-statistic):,6.72e-135
Time:,19:30:31,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4595,5.103,7.144,0.000,26.432,46.487
CRIM,-0.1080,0.033,-3.287,0.001,-0.173,-0.043
ZN,0.0464,0.014,3.382,0.001,0.019,0.073
INDUS,0.0206,0.061,0.334,0.738,-0.100,0.141
CHAS,2.6867,0.862,3.118,0.002,0.994,4.380
NOX,-17.7666,3.820,-4.651,0.000,-25.272,-10.262
RM,3.8099,0.418,9.116,0.000,2.989,4.631
AGE,0.0007,0.013,0.052,0.958,-0.025,0.027
DIS,-1.4756,0.199,-7.398,0.000,-1.867,-1.084

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0


* O R2 (R-squared) indica quanto da variabilidade de y é explicado pelas variáveis preditoras. Nosso resultado foi de 74%.
* Há evidências de que uma variável está relacionada com o valor previsto se o valor-p for menor que 0,05. No nosso caso, as variáveis INDUS e AGE apresentaram valor-p > 0,05, ou seja, não há evidência de associação significativa entre essas variáveis preditoras e a variável resposta.

### Seleção de Atributos Com o R2

In [8]:
# Create the model.
# `normalize` is no longer passed: it was deprecated in scikit-learn 1.0 and removed
# in 1.2, and its default (False) is what the notebook used anyway.
modelo = linear_model.LinearRegression(fit_intercept=True)


def r2_est(X, y, estimator=None):
    """Fit a linear regression on (X, y) and return its in-sample R2.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Predictor matrix.
    y : array-like of shape (n_samples,)
        Response values.
    estimator : scikit-learn regressor, optional
        Estimator to fit (it is refit in place). When omitted, a fresh
        ``LinearRegression`` is created so the result does not depend on whatever
        the notebook-global ``modelo`` happens to be bound to at call time.

    Returns
    -------
    float
        R2 of the fitted model evaluated on the same data it was fit on.
    """
    if estimator is None:
        estimator = linear_model.LinearRegression(fit_intercept=True)
    return r2_score(y, estimator.fit(X, y).predict(X))


# Passing `modelo` keeps the original side effect: `modelo` ends up fitted on (X, y).
print('Coeficiente R2: %0.3f' % r2_est(X, y, modelo))

Coeficiente R2: 0.741


In [9]:
# Impact of each attribute on R2: how much the in-sample R2 drops when that single
# attribute is removed from the model.

# The full-model R2 is the same for every attribute, so compute it once instead of
# refitting it on each iteration.
baseline_r2 = r2_est(X, y)

# Drop columns by name (rather than by position in df) so each score is always
# paired with the attribute that was actually removed.
r2_impact = [
    (baseline_r2 - r2_est(X.drop(columns=col), y), col)
    for col in X.columns
]

# Largest drop first: the attributes the model relies on most
for imp, varname in sorted(r2_impact, reverse=True):
    print('%6.3f %s' % (imp, varname))

 0.056 LSTAT
 0.044 RM
 0.029 DIS
 0.028 PTRATIO
 0.011 NOX
 0.011 RAD
 0.006 B
 0.006 ZN
 0.006 CRIM
 0.006 TAX
 0.005 CHAS
 0.000 INDUS
 0.000 AGE


    Com base neste resultado, serão selecionadas as variáveis preditoras LSTAT, RM, DIS e PTRATIO.

### Fazendo Previsões com o Modelo de Regressão Linear

In [10]:
# Rebuild X and y, keeping only the four attributes with the largest R2 impact
# as explanatory variables
X = df.loc[:, ['LSTAT', 'RM', 'DIS', 'PTRATIO']]
y = df['target'].to_numpy()

In [11]:
# Split into training (70%) and test (30%) data; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the model.
# `normalize` is no longer passed: it was deprecated in scikit-learn 1.0 and removed
# in 1.2, and its default (False) is what the notebook used anyway.
modelo = LinearRegression(fit_intercept=True)

# Train the model once on the training split
modelo = modelo.fit(X_train, y_train)

# R2 on the held-out test split, reusing the model fitted above instead of refitting it
r2_score(y_test, modelo.predict(X_test))

0.661383670138217

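O R2 (coeficiente de determinação) mede a proporção da variabilidade de y explicada pelo modelo. Em geral fica entre 0 e 1, mas pode ser negativo em dados de teste quando o modelo é pior do que simplesmente prever a média. Nosso modelo apresentou R2 de aproximadamente 0,66 nos dados de teste, ou seja, explica cerca de 66% da variabilidade da variável resposta.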

### Avaliando o modelo

In [13]:
# Mean Squared Error (MSE) of the fitted model on the test split.
# `mean_squared_error` is imported in the notebook's import cell, so this cell
# has no import of its own.
y_pred = modelo.predict(X_test)
MSE = mean_squared_error(y_test, y_pred)
print("MSE: ", MSE)

MSE:  25.23135094783142


In [20]:
# Root Mean Square Error (RMSE): square root of the previously computed MSE, which
# expresses the error in the same units as the target instead of squared units.
RMSE = math.sqrt(MSE)
print("RMSE:", RMSE)

RMSE: 5.023081817752068


In [23]:
# MAPE
def mape(actual, pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    actual : array-like
        Observed values.
    pred : array-like
        Predicted values; must broadcast against ``actual``.

    Returns
    -------
    float
        ``mean(|actual - pred| / |actual|) * 100``.

    Notes
    -----
    The denominator is floored at machine epsilon, so a zero in ``actual`` no
    longer yields ``inf``/``nan``: an exact hit on a zero target contributes 0 and
    a miss contributes a very large but finite value. Results for non-zero
    targets are unchanged.
    """
    actual = np.asarray(actual, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # Guard against division by zero when a target value is exactly 0
    denominator = np.maximum(np.abs(actual), np.finfo(np.float64).eps)
    return np.mean(np.abs(actual - pred) / denominator) * 100
# MAPE of the model's predictions on the test split; the value is a percentage.
mape(y_test, y_pred)

# Rule-of-thumb reading of a MAPE value (a convention, not a statistical test):

# Below 10%   = Excellent
# 10% to 20%  = Good
# 20% to 50%  = Reasonable
# Above 50%   = Inaccurate

18.53760969327174

### Fazendo as Previsões

In [21]:
# Attribute values of the new house to score
LSTAT = 4
RM = 6.5
DIS = 6
PTRATIO = 15

# List with the attribute values, in the same order as the training columns
dados_nova_casa = [LSTAT, RM, DIS, PTRATIO]

# Build a one-row DataFrame with the training column names instead of a bare ndarray.
# The model was fitted on a named DataFrame, so this keeps each value explicitly bound
# to its attribute and avoids scikit-learn's feature-name mismatch warning.
Xp = pd.DataFrame([dados_nova_casa], columns=['LSTAT', 'RM', 'DIS', 'PTRATIO'], dtype=float)

# Prediction
# NOTE(review): the Boston target is documented as MEDV (median home value in $1000s),
# so the "Taxa Média de Ocupação" label below looks inaccurate; confirm before
# changing the user-facing text.
print("Taxa Média de Ocupação Para a Casa:", modelo.predict(Xp))

Taxa Média de Ocupação Para a Casa: [31.32932624]


### Conclusão
O modelo de regressão linear teve um resultado relativamente bom em prever a taxa média de ocupação das casas.