In [113]:
#importação das bibliotecas
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [26]:
#importação da base de dados
data = pd.read_csv('HousingData.csv')
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [27]:
#verificando se existem dados nulos
data.isnull().sum()

CRIM       20
ZN         20
INDUS      20
CHAS       20
NOX         0
RM          0
AGE        20
DIS         0
RAD         0
TAX         0
PTRATIO     0
B           0
LSTAT      20
MEDV        0
dtype: int64

In [20]:
data["CRIM"].describe()

count    486.000000
mean       3.611874
std        8.720192
min        0.006320
25%        0.081900
50%        0.253715
75%        3.560263
max       88.976200
Name: CRIM, dtype: float64

In [28]:
#tratando valores nulos com o valor das medianas
medianCRIM = data["CRIM"].median(skipna = True)
medianCRIM

0.253715

In [31]:
data.fillna({"CRIM":medianCRIM},inplace = True)

In [32]:
medianZN = data["ZN"].median(skipna = True)
medianZN

0.0

In [34]:
data.fillna({"ZN":medianZN},inplace = True)

In [35]:
medianINDUS = data["INDUS"].median(skipna = True)
medianINDUS

9.69

In [36]:
data.fillna({"INDUS":medianINDUS},inplace = True)

In [37]:
medianCHAS = data["CHAS"].median(skipna = True)
medianCHAS

0.0

In [38]:
data.fillna({"CHAS":medianCHAS},inplace = True)

In [39]:
medianAGE = data["AGE"].median(skipna = True)
medianAGE

76.80000000000001

In [41]:
data.fillna({"AGE":medianAGE},inplace = True)

In [42]:
medianLSTAT = data["LSTAT"].median(skipna = True)
medianLSTAT

11.43

In [44]:
data.fillna({"LSTAT":medianLSTAT},inplace = True)

In [45]:
data.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [61]:
#verificação da descrição das colunas para o tratamento de outliers
for col in data.columns:
    print(data[col].describe())
    print("------------------------------")

count    506.000000
mean       3.479140
std        8.570832
min        0.006320
25%        0.083235
50%        0.253715
75%        2.808720
max       88.976200
Name: CRIM, dtype: float64
------------------------------
count    506.000000
mean      10.768775
std       23.025124
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max      100.000000
Name: ZN, dtype: float64
------------------------------
count    506.000000
mean      11.028893
std        6.704679
min        0.460000
25%        5.190000
50%        9.690000
75%       18.100000
max       27.740000
Name: INDUS, dtype: float64
------------------------------
count    506.000000
mean       0.067194
std        0.250605
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        1.000000
Name: CHAS, dtype: float64
------------------------------
count    506.000000
mean       0.554695
std        0.115878
min        0.385000
25%        0.449000
50%        0.538000
75%      

In [63]:
#tratamento de outliers
#a partir da analise feita sobre as colunas, no meu entendimento a única passível de conter outliers seria CRIM(referente a taxa de criminalidade per capita)
#definicão do Q1,Q3 e IQR
Q1 = data["CRIM"].quantile(0.25)
Q3 = data["CRIM"].quantile(0.75)
IQR = Q3 - Q1

In [67]:
#tratamento dos outliers com base no quartiles e intervalo quartil
data.loc[(data["CRIM"] < (Q1 - 1.5 * IQR)) | (data["CRIM"] > (Q3 + 1.5 * IQR)), "CRIM"] = medianCRIM

In [68]:
data["CRIM"].describe()

count    506.000000
mean       0.746769
std        1.410680
min        0.006320
25%        0.083235
50%        0.253715
75%        0.411213
max        6.801170
Name: CRIM, dtype: float64

In [72]:
#separação dos dados em features e target
features = data.iloc[:,:-1].values
target = data.iloc[:,-1].values

In [74]:
#normalização dos dados
scaler = StandardScaler()
features = scaler.fit_transform(features)

In [76]:
#antes da normalização
data.iloc[0,:-1]

CRIM         0.00632
ZN          18.00000
INDUS        2.31000
CHAS         0.00000
NOX          0.53800
RM           6.57500
AGE         65.20000
DIS          4.09000
RAD          1.00000
TAX        296.00000
PTRATIO     15.30000
B          396.90000
LSTAT        4.98000
Name: 0, dtype: float64

In [75]:
#depois da normalização
features[0]

array([-0.52540697,  0.31436884, -1.30170603, -0.26839132, -0.14421743,
        0.41367189, -0.1327705 ,  0.1402136 , -0.98284286, -0.66660821,
       -1.45900038,  0.44105193, -1.09619345])

In [94]:
#matriz de correlação
data.corr()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
CRIM,1.0,-0.218494,0.401993,0.08887,0.453641,-0.071589,0.281339,-0.29357,0.43196,0.437014,0.125001,-0.144918,0.125418,-0.082141
ZN,-0.218494,1.0,-0.507304,-0.032992,-0.498619,0.312295,-0.535341,0.632428,-0.300061,-0.304385,-0.394622,0.170125,-0.398838,0.362292
INDUS,0.401993,-0.507304,1.0,0.054693,0.738387,-0.377978,0.614248,-0.698621,0.592735,0.716267,0.385366,-0.35484,0.564508,-0.476394
CHAS,0.08887,-0.032992,0.054693,1.0,0.070867,0.106797,0.074984,-0.092318,-0.003339,-0.035822,-0.109451,0.050608,-0.047279,0.183844
NOX,0.453641,-0.498619,0.738387,0.070867,1.0,-0.302188,0.711864,-0.76923,0.611441,0.668023,0.188933,-0.380051,0.57304,-0.427321
RM,-0.071589,0.312295,-0.377978,0.106797,-0.302188,1.0,-0.239518,0.205246,-0.209847,-0.292048,-0.355501,0.128069,-0.604323,0.69536
AGE,0.281339,-0.535341,0.614248,0.074984,0.711864,-0.239518,1.0,-0.724354,0.447088,0.498408,0.261826,-0.268029,0.575022,-0.377572
DIS,-0.29357,0.632428,-0.698621,-0.092318,-0.76923,0.205246,-0.724354,1.0,-0.494588,-0.534432,-0.232471,0.291512,-0.483244,0.249929
RAD,0.43196,-0.300061,0.592735,-0.003339,0.611441,-0.209847,0.447088,-0.494588,1.0,0.910228,0.464741,-0.444413,0.467765,-0.381626
TAX,0.437014,-0.304385,0.716267,-0.035822,0.668023,-0.292048,0.498408,-0.534432,0.910228,1.0,0.460853,-0.441808,0.524156,-0.468536


In [109]:
#estrutura de repetição para a vizualização da correlação entre as colunas de features e targets
corr_matrix = data.corr()

for i in range(len(data.columns)-1):
    corr_value = corr_matrix.iloc[i, -1]
    print(f"a correlação da coluna {data.columns[i]} com a coluna target {data.columns[-1]} é de {corr_value:.4f}")

a correlação da coluna CRIM com a coluna target MEDV é de -0.0821
a correlação da coluna ZN com a coluna target MEDV é de 0.3623
a correlação da coluna INDUS com a coluna target MEDV é de -0.4764
a correlação da coluna CHAS com a coluna target MEDV é de 0.1838
a correlação da coluna NOX com a coluna target MEDV é de -0.4273
a correlação da coluna RM com a coluna target MEDV é de 0.6954
a correlação da coluna AGE com a coluna target MEDV é de -0.3776
a correlação da coluna DIS com a coluna target MEDV é de 0.2499
a correlação da coluna RAD com a coluna target MEDV é de -0.3816
a correlação da coluna TAX com a coluna target MEDV é de -0.4685
a correlação da coluna PTRATIO com a coluna target MEDV é de -0.5078
a correlação da coluna B com a coluna target MEDV é de 0.3335
a correlação da coluna LSTAT com a coluna target MEDV é de -0.7231


In [111]:
#estrutura de repetição para verificação de multicolinearidade

for i in range(len(corr_matrix.columns)):
    for j in range(i+1, len(corr_matrix.columns)):  # Para evitar comparar a mesma combinação duas vezes
        corr_value = corr_matrix.iloc[i, j]
        if abs(corr_value) > 0.8:  # Verifica a correlação absoluta
            print(f"A correlação entre {corr_matrix.columns[i]} na posicao {i} e {corr_matrix.columns[j]} na posicao {j} é de {corr_value:.4f}")

A correlação entre RAD na posicao 8 e TAX na posicao 9 é de 0.9102


In [103]:
#exclusao da coluna 8(RAD) por possuir uma correlaçao muito alta com TAX e por ter uma menor correlação com o TARGET(MEDV)
NewFeatures = np.delete(features, 8, axis = 1)

In [104]:
features[0]

array([-0.52540697,  0.31436884, -1.30170603, -0.26839132, -0.14421743,
        0.41367189, -0.1327705 ,  0.1402136 , -0.98284286, -0.66660821,
       -1.45900038,  0.44105193, -1.09619345])

In [106]:
NewFeatures[0]

array([-0.52540697,  0.31436884, -1.30170603, -0.26839132, -0.14421743,
        0.41367189, -0.1327705 ,  0.1402136 , -0.66660821, -1.45900038,
        0.44105193, -1.09619345])

In [156]:
#separação dos dados em treino e teste
x_train,x_test,y_train,y_test = train_test_split(NewFeatures, target,test_size = 0.2)

In [157]:
#criação e treinamento do modelo
RegressaoLinear = LinearRegression()
RegressaoLinear.fit(x_train,y_train)

In [158]:
#predição dos valores
forecast = RegressaoLinear.predict(x_test)
forecast

array([20.6387875 , 20.83179793, 29.29353471, 15.61983744, 29.9301209 ,
       26.56027436, 39.571393  , 27.67553094, 26.8198913 , 25.07984446,
       34.01795432, 18.25682545, 19.05590568, 32.48272995, 18.25685573,
       20.2263209 , 28.25967742, 21.457196  , 22.98964086, 24.29055463,
       20.94228017, 25.48528761, 21.26537672,  6.08732635, 35.53577037,
       31.80171172, 19.15952341, 33.74723406, 26.59143315, 20.29504801,
       24.33514845, 16.16992874, 27.92038083, 32.07377938, 31.27037931,
       10.65707557, 16.42348323, 17.55144689, 18.88076649, 21.8482047 ,
       24.71425152, 25.60254327, 17.60385956, 24.47464001, 31.54052541,
       32.98445159, 38.21727707, 28.36156481, 32.07484196,  9.16277106,
       29.38272952, 23.56518937, 23.61787207, 10.53800496, 10.80894899,
       23.46274263, 27.91902345, 24.07677168, 21.21727271, 32.38803292,
       17.90090317, 28.6763197 , 41.36039333, 26.03463707,  8.26218977,
       18.59035537, 25.08854376, 16.29064327, 30.37319511, 17.31

In [176]:
#MAPE
def MAPE(y_test,y_pred):
    return np.mean(np.abs((y_test-y_pred) / y_pred)) * 100

In [177]:
#RMSE
def RMSE(y_test,y_pred):
    return np.sqrt(np.mean(np.power((y_test-y_pred),2)))

In [178]:
#range de erro relative RMSE
def relativeRMSE(RMSE,Vmax,Vmin):
    return (RMSE/(Vmax-Vmin))*100

In [182]:
RMSEValue = RMSE(y_test,forecast)
Vmax = np.max(data["MEDV"])
Vmin = np.min(data["MEDV"])
relativeRMSEValue = relativeRMSE(RMSEValue,Vmax,Vmin)

print(f'O modelo erra em media {relativeRMSEValue:.2f}% da faixa total dos preços')

O modelo erra em media 9.94% da faixa total dos preços


In [166]:
print(MAPE(y_test,forecast))

15.669592583133152


In [167]:
print(RMSE(y_test,forecast))

4.473864083797974


In [161]:
from joblib import dump

dump(RegressaoLinear,'RegressaoLinear.dot')

['RegressaoLinear.dot']