In [13]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [3]:
#load the training and test sets, using each file's Id column as the row index
X, X_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('home-data-for-ml-course/train.csv',
                     'home-data-for-ml-course/test.csv')
)

In [5]:
# Discard training rows that have no SalePrice, then split the target off.
# Both steps modify X in place: after this cell X holds predictors only.
X.dropna(subset=['SalePrice'], inplace=True)
# pop() returns the column and removes it from X in a single step
y = X.pop('SalePrice')

In [6]:
#split every frame by dtype: "obwith" = object-dtype columns, "obless" = everything else
object_dtype = ['object']

# training predictors
X_train_obwith = X.select_dtypes(include=object_dtype)
X_train_obless = X.select_dtypes(exclude=object_dtype)

# test predictors
X_test_obwith = X_test.select_dtypes(include=object_dtype)
X_test_obless = X_test.select_dtypes(exclude=object_dtype)

In [7]:
#drop GarageYrBlt from the non-object frames (same column list for train and test)
cols_delete_obless = ['GarageYrBlt']
reduced_X_train_obless = X_train_obless.drop(columns=cols_delete_obless)
reduced_X_test_obless = X_test_obless.drop(columns=cols_delete_obless)

In [8]:
#fill the remaining nulls in the non-object frames
from sklearn.impute import SimpleImputer
my_imputer = SimpleImputer()  # no arguments: library-default strategy

# Fit on the training frame only, then reuse the fitted imputer on test.
# The imputer output is re-wrapped in a DataFrame with the original column
# names; no index is passed, so the new frames get a default 0..n-1 index.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(reduced_X_train_obless),
    columns=reduced_X_train_obless.columns,
)  #final obless dataset train

imputed_X_test = pd.DataFrame(
    my_imputer.transform(reduced_X_test_obless),
    columns=reduced_X_test_obless.columns,
)  #final obless dataset test

In [9]:
#keep only the object columns whose set of distinct values is identical in train and test
obwith_columns = list(X_train_obwith.columns)

good_label_cols = []
for col in obwith_columns:
    train_values = set(X_train_obwith[col])
    test_values = set(X_test_obwith[col])
    if train_values == test_values:
        good_label_cols.append(col)

# every other object column is dropped from both frames
bad_label_cols = list(set(obwith_columns).difference(good_label_cols))
similar_X_train_obwith = X_train_obwith.drop(columns=bad_label_cols)
similar_X_test_obwith = X_test_obwith.drop(columns=bad_label_cols)

In [10]:
#column lists used by this cell and by the encoding cells further down
cols_delete_obwith = ['Neighborhood']
high_missing_cols = ['Alley','FireplaceQu','GarageType','GarageFinish','GarageCond','Fence']
label_encode_cols = ['MasVnrType','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']

#step 1: drop the high-cardinality / non-required columns
reduced_X_train_obwith = similar_X_train_obwith.drop(columns=cols_delete_obwith)
reduced_X_test_obwith = similar_X_test_obwith.drop(columns=cols_delete_obwith)

# snapshot of the column names at this point (taken before step 2)
obwith_cols_encode = list(reduced_X_train_obwith.columns)

#step 2: drop the columns listed as having many missing values
reduced_X_train_obwith = reduced_X_train_obwith.drop(columns=high_missing_cols)
reduced_X_test_obwith = reduced_X_test_obwith.drop(columns=high_missing_cols)

In [11]:
#replace missing values in the label_encode_cols columns with the placeholder "dm"
# done on the whole column subset at once, for train and test alike
reduced_X_train_obwith[label_encode_cols] = reduced_X_train_obwith[label_encode_cols].fillna("dm")
reduced_X_test_obwith[label_encode_cols] = reduced_X_test_obwith[label_encode_cols].fillna("dm")

In [14]:
#integer-encode the categorical columns
# NOTE: the loop runs over every column of reduced_X_train_obwith, not only
# the ones named in label_encode_cols.
en = LabelEncoder()
for col_name in list(reduced_X_train_obwith.columns):
    # the single encoder is re-fitted for each column on the training values ...
    train_codes = en.fit_transform(reduced_X_train_obwith[col_name])
    # ... and that per-column fit is applied to the matching test column
    test_codes = en.transform(reduced_X_test_obwith[col_name])
    reduced_X_train_obwith[col_name] = train_codes
    reduced_X_test_obwith[col_name] = test_codes

In [17]:
#OH encoding the remaining columns
reduced_cols = list(reduced_X_train_obwith.columns)
# Every column not named in label_encode_cols is one-hot encoded. The list is
# built in the frame's own column order: a set difference has arbitrary order,
# so the feature layout could change from one kernel session to the next.
OH_encode_cols = [col for col in reduced_cols if col not in label_encode_cols]

# NOTE(review): `sparse=False` is the older spelling of this option; newer
# scikit-learn releases call it `sparse_output` - confirm against the installed version.
OH_encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)

# Learn the categories from the training frame only ...
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(reduced_X_train_obwith[OH_encode_cols]))
# ... and apply that same fitted mapping to test. Calling fit_transform here
# would refit the encoder on the test data, so the test columns would follow
# the test set's categories instead of the ones the model is trained on.
OH_cols_test = pd.DataFrame(OH_encoder.transform(reduced_X_test_obwith[OH_encode_cols]))

# The encoder output carries no index; restore the source frames' row labels
OH_cols_train.index = reduced_X_train_obwith.index
OH_cols_test.index = reduced_X_test_obwith.index

In [18]:
#final obwith dataset: the columns that were not one-hot encoded, plus the one-hot columns
# train
left_X_train_obwith = reduced_X_train_obwith.drop(columns=OH_encode_cols)
final_X_train_obwith = pd.concat((left_X_train_obwith, OH_cols_train), axis='columns')

# test
left_X_test_obwith = reduced_X_test_obwith.drop(columns=OH_encode_cols)
final_X_test_obwith = pd.concat((left_X_test_obwith, OH_cols_test), axis='columns')

In [19]:
#joining obwith with obless
# Both halves are derived from the same source rows in the same order, so they
# are lined up by position. drop=True throws the old row labels away instead of
# inserting them as a column - a plain reset_index() would turn the Id index
# into a model feature, and would add yet another column on every re-run of
# this cell.
final_X_train_obwith = final_X_train_obwith.reset_index(drop=True)
final_X_test_obwith = final_X_test_obwith.reset_index(drop=True)

# The obless half is re-indexed here as well, so the positional join holds
# regardless of which index that frame carries.
final_X_train = pd.concat([imputed_X_train.reset_index(drop=True), final_X_train_obwith], axis=1)
final_X_test = pd.concat([imputed_X_test.reset_index(drop=True), final_X_test_obwith], axis=1)

# The one-hot columns are named with integers while every other column has a
# string name; use strings throughout, since newer scikit-learn versions reject
# frames whose column names mix types.
final_X_train.columns = final_X_train.columns.astype(str)
final_X_test.columns = final_X_test.columns.astype(str)

In [20]:
#fit a 100-tree random forest (fixed seed) on the training features, then predict on test
forest_params = {'n_estimators': 100, 'random_state': 0}
model = RandomForestRegressor(**forest_params)
model.fit(final_X_train, y)
preds_test = model.predict(final_X_test)

In [21]:
#write the predictions to submission.csv, one row per test Id
submission_columns = {
    'Id': X_test.index,        # row labels of the test frame
    'SalePrice': preds_test,   # model predictions
}
output = pd.DataFrame(submission_columns)
# index=False: the frame's own row index is left out of the file
output.to_csv('submission.csv', index=False)