In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Load the Melbourne housing data and separate the target from the predictors.
df = pd.read_csv("melb_data.csv")
y = df["Price"]
features = df.drop(columns="Price")
# Keep only the predictor columns whose dtype is not object.
x = features.select_dtypes(exclude=["object"])

In [3]:
# Names of the columns in x that contain at least one missing value.
col_have_null = x.columns[x.isna().any().to_numpy()].tolist()

In [4]:
# Display the names of the columns that contain missing values.
col_have_null

['Car', 'BuildingArea', 'YearBuilt']

In [5]:
# Approach 1: discard every column that has a missing value, then hold out
# 20% of the rows for testing (seeded so the split is repeatable).
x = x.drop(columns=col_have_null)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Fit a small, seeded random forest on the reduced training features.
model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(x_train, y_train)

# Predict on the held-out rows and report the mean absolute error.
preds = model.predict(x_test)
print("MAE from Approach 1 (Drop columns with missing values):")
# Bare last expression so the notebook displays the score.
mean_absolute_error(y_test, preds)

MAE from Approach 1 (Drop columns with missing values):


183550.22137772635

In [7]:
# Reload the raw data and rebuild x from all non-object columns, so the
# imputation approach starts from the columns that still contain missing values.
df = pd.read_csv("melb_data.csv")
y = df.loc[:, "Price"]
features = df.drop(labels=["Price"], axis="columns")
x = features.select_dtypes(exclude="object")

In [8]:
from sklearn.impute import SimpleImputer

# Approach 2: fill in missing entries instead of dropping columns, using
# SimpleImputer with its default settings.
# NOTE(review): the imputer is fitted on every row of x, and the train/test
# split only happens in a later cell, so test rows influence the fill values.
# Consider splitting first and fitting on the training rows only.
my_imputer = SimpleImputer()
filled_values = my_imputer.fit_transform(x)
# Wrap the imputer output in a DataFrame carrying x's column labels.
imputed_x = pd.DataFrame(filled_values, columns=x.columns)

In [9]:
# Seeded 80/20 train/test split of the imputed features.
x_train, x_test, y_train, y_test = train_test_split(
    imputed_x,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Fit a seeded 10-tree random forest on the imputed training features.
model = RandomForestRegressor(n_estimators=10, random_state=0)
model.fit(x_train, y_train)

# Predict on the held-out rows and report the mean absolute error.
preds = model.predict(x_test)
# This cell scores the imputation approach, so label it as such.
print("MAE from Approach 2 (Imputation):")
# Bare last expression so the notebook displays the score.
mean_absolute_error(y_test, preds)

MAE from Approach 1 (Drop columns with missing values):


177681.61305491268

### Practice

In [18]:
# Read the training and test CSVs, using the 'Id' column as the row index.
X_full = pd.read_csv("train.csv", index_col="Id")
X_test_full = pd.read_csv("test.csv", index_col="Id")

In [19]:
# Discard rows whose target is missing, then separate the target from the predictors.
X_full = X_full.dropna(axis=0, subset=["SalePrice"])
y = X_full["SalePrice"]
X_full = X_full.drop(columns=["SalePrice"])

# To keep things simple, only the non-object predictor columns are used.
X = X_full.select_dtypes(exclude=["object"])
X_test = X_test_full.select_dtypes(exclude=["object"])

# Break off a seeded 80/20 validation set from the training data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [20]:
# Dimensions of the training features as (rows, columns).
print(X_train.shape)

# Per-column count of missing entries; only columns with at least one are printed.
missing_val_count_by_column = X_train.isna().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

(1168, 36)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [21]:
# Number of rows in the training data.
num_rows = len(X_train)

# Missing-entry count for each training column.
missing_val = X_train.isna().sum()

# How many training columns have at least one missing entry.
num_cols_with_missing = int((missing_val > 0).sum())

# How many missing entries the training data contains in total.
tot_missing = missing_val.sum()

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation MAE of a random forest trained on the given split.

    A RandomForestRegressor (n_estimators=100, random_state=0) is fitted on
    (X_train, y_train); its predictions for X_valid are then compared against
    y_valid with mean_absolute_error.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [23]:
# Names of the training columns that contain at least one missing value.
col_have_missing = X_train.columns[X_train.isna().any().to_numpy()].tolist()

# Remove those columns from both the training and the validation features.
reduced_X_train = X_train.drop(columns=col_have_missing)
reduced_X_valid = X_valid.drop(columns=col_have_missing)

In [24]:
# Report the validation MAE for the drop-columns approach.
print("MAE (Drop columns with missing values):")
mae_drop_columns = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(mae_drop_columns)

MAE (Drop columns with missing values):
17837.82570776256


In [26]:
from sklearn.impute import SimpleImputer

# Imputation: fit the imputer on the training features, then reuse the fitted
# imputer for the validation features.
my_imputer = SimpleImputer()

# Imputation removes the column names, so each frame is rebuilt with the
# column labels of the data it came from.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train), columns=X_train.columns
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid), columns=X_valid.columns
)


In [27]:
# Report the validation MAE for the imputation approach.
print("MAE (Imputation):")
mae_imputation = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print(mae_imputation)

MAE (Imputation):
18062.894611872147


In [28]:
# Preprocessed training and validation features: median imputation, fitted on
# the training split and then applied unchanged to the validation split.
final_imputer = SimpleImputer(strategy='median')

# The imputer output carries no labels, so restore each frame's row index and
# column names from the data it was computed from.
final_X_train = pd.DataFrame(
    final_imputer.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
final_X_valid = pd.DataFrame(
    final_imputer.transform(X_valid),
    index=X_valid.index,
    columns=X_valid.columns,
)


In [29]:
# Train a seeded 100-tree random forest on the preprocessed training features.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)

# Predict on the validation split and report the mean absolute error.
preds_valid = model.predict(final_X_valid)
valid_mae = mean_absolute_error(y_valid, preds_valid)
print("MAE (Your approach):")
print(valid_mae)

MAE (Your approach):
17791.59899543379


In [30]:
# Preprocess the test data with the imputer that was already fitted on the
# training split. transform (not fit_transform) is used so the test rows are
# filled from the training fit instead of refitting the imputer on test data.
final_X_test = pd.DataFrame(
    final_imputer.transform(X_test),
    index=X_test.index,
    # Reuse the training frame's column labels so the test frame is labelled
    # the same way as the data the model was fitted on.
    # NOTE(review): assumes X_test has the same columns, in the same order,
    # as X_train - confirm against the CSVs.
    columns=final_X_train.columns,
)

# Get test predictions
preds_test = model.predict(final_X_test)

In [31]:
# Write the test predictions to submission.csv, pairing each prediction with
# the corresponding index value of X_test.
output = pd.DataFrame({"Id": X_test.index, "SalePrice": preds_test})
output.to_csv("submission.csv", index=False)