# Preprocessing + Pipeline building

This notebook builds on where we left in the last notebook

In [1]:
import pandas as pd

train_path = "../input/train.csv"

df = pd.read_csv(train_path)

### Handling missing values

* Drop columns with missing values
* Imputation: Fills in missing values

#### Drop columns with missing values

In [2]:
cols_with_missing_data = [col for col in df.columns if df[col].isnull().any()]
reduced_original_data = df.drop(cols_with_missing_data, axis=1)

Testing our model on this data

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

y = df["SalePrice"]
X = df.drop("SalePrice", axis=1)
# For the sake of keeping the example simple, we'll use only numeric predictors
numeric_predictors = df.select_dtypes(exclude=['object'])

In [4]:
def get_mae(train_X, train_y, test_X, test_y):
    regressor = RandomForestRegressor(random_state=0)
    regressor.fit(train_X, train_y)
    predictions = regressor.predict(test_X)
    error = mean_absolute_error(test_y, predictions)
    return error

In [15]:
train_X, test_X, train_y, test_y = train_test_split(numeric_predictors, y, test_size=0.30, random_state=0)

In [16]:
cols_with_missing = [col for col in train_X.columns if train_X[col].isnull().any()]
reduced_train_X = train_X.drop(cols_with_missing, axis=1)
reduced_test_X = test_X.drop(cols_with_missing, axis=1)

print(get_mae(reduced_train_X, train_y, reduced_test_X, test_y))

923.912785388


### Imputation

In [9]:
from sklearn.preprocessing import Imputer

imputer = Imputer()
imputed_train_X = imputer.fit_transform(train_X)
imputed_test_X = imputer.transform(test_X)

In [10]:
print(get_mae(imputed_train_X, train_y, imputed_test_X, test_y))

1003.7890411


### Handling categorical data

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

def get_mae(X, y):    
    return -1 * cross_val_score(RandomForestRegressor(50), X, y, scoring = 'neg_mean_absolute_error').mean()

predictors_without_categoricals = imputed_train_X.select_dtypes(exclude=["object"])

one_hot_encoded_training_predictors = pd.get_dummies(imputed_train_X)

get_mae(predictors_without_categoricals, y)

AttributeError: 'numpy.ndarray' object has no attribute 'select_dtypes'