# Regressao

[**House Prices:** Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

In [110]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from subprocess import call
from IPython.display import Image
from sklearn.tree import export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import numpy as np
import sklearn as sk 
from numpy import random
import seaborn as sns
from matplotlib import pyplot as plt
from IPython.display import Image
import os
pd.options.display.max_rows = 2000
%matplotlib inline

In [111]:
df = pd.read_csv(os.path.join(os.path.abspath(''), "data/train.csv"))
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Identificando dados faltando 

In [112]:
missing = df.isna().mean()
missing[missing.gt(0)] * 100

LotFrontage     17.739726
Alley           93.767123
MasVnrType       0.547945
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
PoolQC          99.520548
Fence           80.753425
MiscFeature     96.301370
dtype: float64

Os atributos `Alley`, `PoolQC`, `Fence` e `MiscFeature` possuem uma taxa de dados ausentes alta demais para serem gerados automaticamente. Podemos descartar esses atributos.

In [113]:
df.drop(columns=["Alley", "PoolQC", "Fence", "MiscFeature"], inplace=True)

## Preenchendo atributos de baixa taxa de ausencia com a moda

Atributos cuja taxa de ausencia e menor que 10% serao preenchidos pela moda da coluna correspondente.

In [114]:
missing = df.isna().mean() * 100
missing[missing.gt(0)] 

LotFrontage     17.739726
MasVnrType       0.547945
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
dtype: float64

In [115]:
# Colunas com taxa de ausencia no intervalo (0, 10)
lt_10 = missing[missing.gt(0)][missing.lt(10)].index.values

In [116]:
modes = df[lt_10].mode()
modes

Unnamed: 0,MasVnrType,MasVnrArea,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,GarageType,GarageYrBlt,GarageFinish,GarageQual,GarageCond
0,,0.0,TA,TA,No,Unf,Unf,SBrkr,Attchd,2005.0,Unf,TA,TA


In [117]:
filled = df[lt_10].fillna(modes.iloc[0])
df = pd.merge(df, filled)

### Verificando resultado do preenchimento

In [118]:
missing = df.isna().mean()
(missing[missing.gt(0)] * 100)

LotFrontage    17.758186
FireplaceQu    44.836272
dtype: float64

## Preenchendo atributos com alta taxa de ausencia

Um algoritmo de inferencia como KNN pode ser utilizado para preencher as colunas `LotFrontage` e `FireplaceQu` com os valores de instancias semelhantes.

Para que esse algoritmo funcione e necessario que todos os valores sejam numericos, logo precisamos enumerar os atributos nao numericos

### Listando atributos nao numericos

In [119]:
non_numeric_attrs = df.select_dtypes("object").columns.values
print(non_numeric_attrs)

['MSZoning' 'Street' 'LotShape' 'LandContour' 'Utilities' 'LotConfig'
 'LandSlope' 'Neighborhood' 'Condition1' 'Condition2' 'BldgType'
 'HouseStyle' 'RoofStyle' 'RoofMatl' 'Exterior1st' 'Exterior2nd'
 'MasVnrType' 'ExterQual' 'ExterCond' 'Foundation' 'BsmtQual' 'BsmtCond'
 'BsmtExposure' 'BsmtFinType1' 'BsmtFinType2' 'Heating' 'HeatingQC'
 'CentralAir' 'Electrical' 'KitchenQual' 'Functional' 'FireplaceQu'
 'GarageType' 'GarageFinish' 'GarageQual' 'GarageCond' 'PavedDrive'
 'SaleType' 'SaleCondition']


### Preenchendo NaN com categoria "Desconhecido"

In [122]:
df[non_numeric_attrs] = df[non_numeric_attrs].fillna("Desconhecido")

### Criando mapa de encoders

Cada encoder no mapa sera capaz de encodificar e decodificar seu respectivo atributo categorico.

In [123]:
encoder_map = {}

for attr in non_numeric_attrs:
    encoder_map[attr] = sk.preprocessing.LabelEncoder()

for encoder_key in encoder_map:
    encoder_map[encoder_key].fit(df[encoder_key])

### Encodificando atributos nao numericos

In [124]:
for attr in non_numeric_attrs:
    df[attr] = encoder_map[attr].fit_transform(df[attr])

In [125]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1588 entries, 0 to 1587
Data columns (total 77 columns):
Id               1588 non-null int64
MSSubClass       1588 non-null int64
MSZoning         1588 non-null int64
LotFrontage      1306 non-null float64
LotArea          1588 non-null int64
Street           1588 non-null int64
LotShape         1588 non-null int64
LandContour      1588 non-null int64
Utilities        1588 non-null int64
LotConfig        1588 non-null int64
LandSlope        1588 non-null int64
Neighborhood     1588 non-null int64
Condition1       1588 non-null int64
Condition2       1588 non-null int64
BldgType         1588 non-null int64
HouseStyle       1588 non-null int64
OverallQual      1588 non-null int64
OverallCond      1588 non-null int64
YearBuilt        1588 non-null int64
YearRemodAdd     1588 non-null int64
RoofStyle        1588 non-null int64
RoofMatl         1588 non-null int64
Exterior1st      1588 non-null int64
Exterior2nd      1588 non-null int64
Mas

Todos os atributos agora estao numericos

In [None]:
#Preenche atributo com o valor encontrado atraves de KNN
attribute = "LotFrontage"
# Dataframe com intancias de attribute ausente filtradas
df_2 = df[df[attribute].isna() == False]

# Features serao todos os outros atributos nao ausentes
features = [column for column in df.isna().mean()[df.isna().mean() == 0].index]

# Conjuntos de teste e treinamento
X_train, X_test, y_train, y_test  = sk.model_selection.train_test_split(df_2[features], 
                                                                        df_2[attribute], 
                                                                        test_size=0.33, 
                                                                        random_state=42)

nn = KNeighborsClassifier(n_neighbors=3)

nn.fit(X_train,y_train)

In [None]:



neigh = KNeighborsClassifier(n_neighbors=3)

In [109]:
# Preenche dados categoricos ausentes utilizando do algoritmo KNN
def fill_missing(df,column_name):
    nn = NearestNeighbors(n_neighbors=3)
    
    X = df[].drop_na()
    Y = df[clumn_name].drop_na()
    
    nn.fit(X,Y)

SyntaxError: invalid syntax (<ipython-input-109-bea6d6112615>, line 5)