In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [5]:
# Load the housing dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='USA_Housing.csv')

# Show the column labels of the loaded frame
df.columns


Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
      dtype='object')

In [8]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


In [9]:
# Inspect the dtype pandas assigned to each column
df.dtypes

Avg. Area Income                float64
Avg. Area House Age             float64
Avg. Area Number of Rooms       float64
Avg. Area Number of Bedrooms    float64
Area Population                 float64
Price                           float64
Address                          object
dtype: object

In [10]:
# Remove the 'Address' column before the arithmetic on `df` in the next cell.
# Rebinding `df` (instead of dropping in place) together with errors='ignore'
# keeps this cell idempotent: running it a second time is a no-op rather than
# raising KeyError because the column is already gone.
df = df.drop(columns=['Address'], errors='ignore')

In [11]:
# Min-max scale every column: (value - column min) / (column max - column min)
col_min = df.min()
col_span = df.max() - col_min
normalized_df = (df - col_min) / col_span
# NOTE(review): none of the later cells visible here read normalized_df; X and y
# are taken from df itself. Confirm whether that is intended.

In [13]:
# Predictor columns used as model inputs; the 'Price' column is the target
feature_columns = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = df[feature_columns]
y = df['Price']

In [14]:
# Preview the first five rows of the predictor frame
X.head(n=5)

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
0,79545.458574,5.682861,7.009188,4.09,23086.800503
1,79248.642455,6.0029,6.730821,3.09,40173.072174
2,61287.067179,5.86589,8.512727,5.13,36882.1594
3,63345.240046,7.188236,5.586729,3.26,34310.242831
4,59982.197226,5.040555,7.839388,4.23,26354.109472


In [15]:
# Preview the first five target values
y.head(n=5)

0    1.059034e+06
1    1.505891e+06
2    1.058988e+06
3    1.260617e+06
4    6.309435e+05
Name: Price, dtype: float64

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)

In [18]:
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the training split in one expression;
# fit() returns the estimator itself, so the name is bound to the fitted model
modelopreditor = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the last expression so the cell displays it
modelopreditor


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
# Tabulate the fitted coefficients, one row per predictor column
coeff = pd.DataFrame(
    data=modelopreditor.coef_,
    index=X.columns,
    columns=['Coeficiente'],
)
coeff


Unnamed: 0,Coeficiente
Avg. Area Income,21.525435
Avg. Area House Age,166415.114396
Avg. Area Number of Rooms,119802.717039
Avg. Area Number of Bedrooms,1783.166098
Area Population,15.38754


In [20]:
# Run the fitted model on the held-out predictors
predictions = modelopreditor.predict(X_test)
# Preview the first five rows of the test predictors
X_test.head(n=5)


Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population
3431,50570.864807,5.828143,4.851423,4.36,40580.092291
2042,65314.720112,6.918945,6.245656,2.42,36565.029831
79,64419.252638,6.954422,8.51616,6.16,39318.170755
4663,60390.502855,5.195406,8.368913,6.27,37921.720586
3640,73068.518101,7.271422,5.685408,3.14,42929.876157


In [21]:
# Show the predicted values produced for the test split
predictions



array([ 631305.0584258 , 1231990.56161572, 1539664.80551512, ...,
       1251986.62330105, 1036432.95159371, 1553580.80137947])

In [22]:
# Actual target values that the predictions are meant to match
y_test.head(n=5)

3431    6.129387e+05
2042    1.347083e+06
79      1.492011e+06
4663    1.223915e+06
3640    1.368692e+06
Name: Price, dtype: float64

In [24]:
from sklearn import metrics

# Error metrics on the held-out split. RMSE is the square root of MSE, so the
# MSE value is computed once and reused.
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', rmse)


MAE: 80728.9338454242
MSE: 10077066685.875519
RMSE: 100384.5938671643
