##Importando bibliotecas

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd

##Separando Dados
 - essa base representa valores de venda de casa em boston

In [2]:
x, y = load_boston(return_X_y=True)

print(x.shape)
print(y.shape) #o valor das casas estão na faixa de k de dolares

(506, 13)
(506,)


##Transformando dados de entradas em Dataframes
 - apenas para facilitar nosso trabalho
 - esses nomes foras retirados da documentação da propria base descrita no scikit-learn.org

In [3]:
df_x = pd.DataFrame(data = x , columns=['CRIM','ZN','INDUS','CHAS','NOX','RM',
                                        'AGE','DIS','RAD','TAX','PTRATIO', 
                                        'B 1000(Bk - 0.63)^2', 'LSTAT %'])
df_y = pd.DataFrame(data = y, columns=['MEDV'])

df_x.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
CRIM,0.00632,0.02731,0.02729,0.03237,0.06905,0.02985,0.08829,0.14455,0.21124,0.17004
ZN,18.0,0.0,0.0,0.0,0.0,0.0,12.5,12.5,12.5,12.5
INDUS,2.31,7.07,7.07,2.18,2.18,2.18,7.87,7.87,7.87,7.87
CHAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOX,0.538,0.469,0.469,0.458,0.458,0.458,0.524,0.524,0.524,0.524
RM,6.575,6.421,7.185,6.998,7.147,6.43,6.012,6.172,5.631,6.004
AGE,65.2,78.9,61.1,45.8,54.2,58.7,66.6,96.1,100.0,85.9
DIS,4.09,4.9671,4.9671,6.0622,6.0622,6.0622,5.5605,5.9505,6.0821,6.5921
RAD,1.0,2.0,2.0,3.0,3.0,3.0,5.0,5.0,5.0,5.0
TAX,296.0,242.0,242.0,222.0,222.0,222.0,311.0,311.0,311.0,311.0


##Separando poucos dados para primeira analise

In [4]:
X = df_x[['AGE', 'DIS', 'ZN']] #valor de entrada usaremos o ano, o tamanho e distancia da casa ao centro
Y = df_y['MEDV'] #valor de venda sera o nosso target

X.head().T

Unnamed: 0,0,1,2,3,4
AGE,65.2,78.9,61.1,45.8,54.2
DIS,4.09,4.9671,4.9671,6.0622,6.0622
ZN,18.0,0.0,0.0,0.0,0.0


##Primeira Analise

In [5]:
#separando dados de treino e teste
np.random.seed(100)

x_treino, x_teste, y_treino, y_teste = train_test_split(X, Y, test_size = 0.25)

print("foi separados %d dados de teino e %d dados de teste" % (len(x_treino), len(x_teste)))

foi separados 379 dados de teino e 127 dados de teste


In [6]:
from sklearn.linear_model import LinearRegression

modelo = LinearRegression()

modelo.fit(x_treino, y_treino)

previsao = modelo.predict(x_teste)

#calculando o MSE - É a média do quadrado da diferença entre os valores originais e os valores previstos.
mse = np.mean((previsao - y_teste)**2) #quando menor a variação desse erro o estimador pode aproximar valores de uma forma melhor

print("Erro quadratico Médio: %.3f " % mse)

Erro quadratico Médio: 83.907 


In [7]:
#calculando coeficientes e r-squared

#São os pesos atribuídos ao atributos para calcular o valor predito.
coef = pd.DataFrame(x_treino.columns)
coef['Coeficientes'] = pd.Series(modelo.coef_)

coef

Unnamed: 0,0,Coeficientes
0,AGE,-0.124293
1,DIS,-1.055009
2,ZN,0.120038


In [8]:
#calculando coeficientes e r-squared

#valor percentual de quando os dados estao proximos da medicao 
#esse percentual explica o quanto o modelo consegue explicar os valores observados

print("percentual de r-squared: %.3f"  %(modelo.score(x_teste, y_teste)*100))

percentual de r-squared: 14.965


##Segunda Analise
 - vamos atribuir mais valores

In [9]:
#verificando valores nulos
#é importante fazer esse tipo de verificação
#uma grande quantidade de valores nulos reduz o percentual de estimativa do nosso modelo


df_x['PTRATIO'].isnull().sum()

#caso exista valores nulos encontrados é possivel substituilos pela media total da colula
# df_x['Item_Weight'].fillna((df_x['Item_Weight'].mean()), inplace=True)

0

In [10]:
X = df_x[['AGE', 'DIS', 'ZN', 'PTRATIO']]
Y = df_y['MEDV']

X.head().T

Unnamed: 0,0,1,2,3,4
AGE,65.2,78.9,61.1,45.8,54.2
DIS,4.09,4.9671,4.9671,6.0622,6.0622
ZN,18.0,0.0,0.0,0.0,0.0
PTRATIO,15.3,17.8,17.8,18.7,18.7


In [11]:
#separando dados de treino e teste
np.random.seed(100)

x_treino, x_teste, y_treino, y_teste = train_test_split(X, Y, test_size = 0.25)

print("foi separados %d dados de teino e %d dados de teste" % (len(x_treino), len(x_teste)))

foi separados 379 dados de teino e 127 dados de teste


In [12]:
modelo.fit(x_treino, y_treino)

previsao = modelo.predict(x_teste)

#novo MSE
mse = np.mean((previsao - y_teste)**2)

print("novo Erro quadratico Médio: %.3f " % mse)

novo Erro quadratico Médio: 67.989 


In [13]:
#calculando coeficientes e r-squared
coef = pd.DataFrame(x_treino.columns)
coef['Coeficientes'] = pd.Series(modelo.coef_)

coef

Unnamed: 0,0,Coeficientes
0,AGE,-0.109916
1,DIS,-0.882585
2,ZN,0.05846
3,PTRATIO,-1.691518


In [14]:
#calculando coeficientes e r-squared
print("percentual do novo r-squared: %.3f"  %(modelo.score(x_teste, y_teste)*100)) #obtivemos um ganho percentual

percentual do novo r-squared: 31.097


##Preprocessamento de dados
 - é preciso trocar os dados com valor null
 - trocar os valores não numericos
 - e trocar os valores 0 

In [15]:
#como a nossa base é numerica e não possui valores null 
#só previsamos trocar os valores zerados

df_x['ZN'] = df_x['ZN'].replace(0,np.mean(df_x['ZN']))

df_x.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
CRIM,0.00632,0.02731,0.02729,0.03237,0.06905,0.02985,0.08829,0.14455,0.21124,0.17004
ZN,18.0,11.363636,11.363636,11.363636,11.363636,11.363636,12.5,12.5,12.5,12.5
INDUS,2.31,7.07,7.07,2.18,2.18,2.18,7.87,7.87,7.87,7.87
CHAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NOX,0.538,0.469,0.469,0.458,0.458,0.458,0.524,0.524,0.524,0.524
RM,6.575,6.421,7.185,6.998,7.147,6.43,6.012,6.172,5.631,6.004
AGE,65.2,78.9,61.1,45.8,54.2,58.7,66.6,96.1,100.0,85.9
DIS,4.09,4.9671,4.9671,6.0622,6.0622,6.0622,5.5605,5.9505,6.0821,6.5921
RAD,1.0,2.0,2.0,3.0,3.0,3.0,5.0,5.0,5.0,5.0
TAX,296.0,242.0,242.0,222.0,222.0,222.0,311.0,311.0,311.0,311.0


caso fosse preciso transformar dados não numericos poderia ser feito dessa forma 

from sklearn import preprocessing

le = preprocessing.LabelEncoder()


  def generate_labelencoder(atts):    
    
    for attr in atts:
      df[attr] = le.fit_transform(df[attr])
    return df

df = generate_labelencoder(['nome_da_coluna1', 'nome_da_coluna2'])

##Terceira analise
 - dessa vez usando todos as colunas

In [16]:
x = df_x.drop(['CHAS'], axis=1) #x irá receber todsas as colunas de df_X menos CHAS
y = df_y

x.head().T

Unnamed: 0,0,1,2,3,4
CRIM,0.00632,0.02731,0.02729,0.03237,0.06905
ZN,18.0,11.363636,11.363636,11.363636,11.363636
INDUS,2.31,7.07,7.07,2.18,2.18
NOX,0.538,0.469,0.469,0.458,0.458
RM,6.575,6.421,7.185,6.998,7.147
AGE,65.2,78.9,61.1,45.8,54.2
DIS,4.09,4.9671,4.9671,6.0622,6.0622
RAD,1.0,2.0,2.0,3.0,3.0
TAX,296.0,242.0,242.0,222.0,222.0
PTRATIO,15.3,17.8,17.8,18.7,18.7


In [17]:
#separando dados de treino e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.25)

print("foi separados %d dados de teino e %d dados de teste" % (len(x_treino), len(x_teste)))

foi separados 379 dados de teino e 127 dados de teste


In [18]:
modelo = LinearRegression()

modelo.fit(x_treino, y_treino)

previsao = modelo.predict(x_teste)

#novo MSE 
mse = np.mean((previsao - y_teste)**2)

print("novo Erro quadratico Médio: %.3f " % mse)

novo Erro quadratico Médio: 30.539 


In [19]:
# r-squared
print("percentual do novo r-squared: %.3f"  %(modelo.score(x_teste, y_teste)*100)) 

percentual do novo r-squared: 68.333
