In [1]:
# Import Important Packages
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
X_full = pd.read_csv("C:\\Users\\ENVY 14\\Documents\\Python_Machine_Learning\\Datasets\\train.csv")
X_test_full = pd.read_csv("C:\\Users\\ENVY 14\\Documents\\Python_Machine_Learning\\Datasets\\test.csv")

In [4]:
# Remove the rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset='SalePrice', inplace=True)
y = X_full.SalePrice
X_full_train = X_full.drop('SalePrice', axis=1)

In [6]:
# Convert data type to pandas dataframe
X_full_train = pd.DataFrame(X_full_train)
y = pd.DataFrame(y)

In [7]:
# We use only numerical types
X = X_full_train.select_dtypes(exclude=['object'])
y = y.select_dtypes(exclude=['object'])

In [10]:
X.shape
y.shape

(1460, 1)

In [11]:
# Splitting the dataset into training and test sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

With random_state = 0, we get the same train asnd test sets across different executions

#### Step 1: Preliminary Investigation

In [14]:
# Shape of the traini9ing data
print(X_train.shape)

(1168, 37)


In [15]:
# Number of missing values for each column
Missing_values_by_col = X_train.isna().sum()
Missing_values_by_col = Missing_values_by_col[Missing_values_by_col > 0]
Total_missing = Missing_values_by_col.sum()
print(Missing_values_by_col)
print(Total_missing)

LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64
276


In [86]:
# How many rows are in training set
X_train.shape[0]

1168

##### To compare different values in dealing with missing values

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_test):
    model = RandomForestRegressor(n_estimators=80, random_state=0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return mean_absolute_error(y_valid, y_pred)

#### Step 2: Drop Columns with missing values 

In [24]:
#Get names of columns with missing values
columns_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Drop columns in training and validation data
reduced_X_train = X_train.drop(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], axis=1)
reduced_X_valid = X_valid.drop(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], axis=1)


In [25]:
print("MAE (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE (Drop columns with missing values):


  return fit_method(estimator, *args, **kwargs)


17788.585659246575


#### Step 3: Imputation


In [26]:
from sklearn.impute import SimpleImputer

# Fill in the lines below: imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Fill in the lines below: imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns


In [27]:
print("MAE (Imputation):")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE (Imputation):


  return fit_method(estimator, *args, **kwargs)


18290.52452910959


Given that there are so few missing values in the dataset, we'd expect imputation to perform better than dropping columns entirely. However, we see that dropping columns performs slightly better! While this can probably partially be attributed to noise in the dataset, another potential explanation is that the imputation method is not a great match to this dataset. That is, maybe instead of filling in the mean value, it makes more sense to set every missing value to a value of 0, to fill in the most frequently encountered value, or to use some other method. For instance, consider the GarageYrBlt column (which indicates the year that the garage was built). It's likely that in some cases, a missing value could indicate a house that does not have a garage. Does it make more sense to fill in the median value along each column in this case? Or could we get better results by filling in the minimum value along each column? It's not quite clear what's best in this case, but perhaps we can rule out some options immediately - for instance, setting missing values in this column to 0 is likely to yield horrible results!

In [28]:
# Preprocessed training and validation features
# Imputation
my_imputer = SimpleImputer(strategy='median')
final_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

final_X_train.columns = imputed_X_train.columns
final_X_valid.columns = imputed_X_valid.columns

In [29]:
# Define and fit model
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)

# Get validation predictions and MAE
preds_valid = model.predict(final_X_valid)
print("MAE (Your approach):")
print(mean_absolute_error(y_valid, preds_valid))

  return fit_method(estimator, *args, **kwargs)


MAE (Your approach):
18103.602945205483


In [54]:
# Preprocess test data
X_test_full = pd.DataFrame(X_test_full)
X_test_full.dropna(axis=0, inplace=True)
X_test = X_full_train.select_dtypes(exclude=['object'])
final_imputer = SimpleImputer(strategy='median')
final_X_test = pd.DataFrame(final_imputer.fit_transform(X_test))
final_X_test.columns = X_test.columns


In [53]:
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(final_X_train, y_train)


# Get test predictions
preds_test = model.predict(final_X_test)



  return fit_method(estimator, *args, **kwargs)
