In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the competition's train/test tables, using the 'Id' column as the row index.
X_full = pd.read_csv(
    '../input/home-data-for-ml-course/train.csv',
    index_col='Id',
)
X_full_test = pd.read_csv(
    '../input/home-data-for-ml-course/test.csv',
    index_col='Id',
)

# Split the training table into the feature frame and the target column.
X = X_full.drop(columns=['SalePrice'])
y = X_full['SalePrice'].copy()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so the frames loaded earlier stay untouched and this cell can be
# re-run on its own without compounding its effects.
X_train = X.copy()
y_train = y.copy()
X_test = X_full_test.copy()

# Training-set shape noted by the author:
#   rows = 1460, columns = 79 (object columns = 43, numeric columns = 36)

# Column groups. Every list is derived with an order-preserving comprehension
# (rather than set arithmetic) so the feature order handed to the model is the
# same on every kernel restart.
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
obj_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

missing_num_cols = [col for col in num_cols if X_train[col].isnull().any()]
missing_obj_cols = [col for col in obj_cols if X_train[col].isnull().any()]

# Object columns with missing training values are filled with the constant
# 'Absent', except these two, which are filled with their most frequent value.
freq_imputed_exceptions = {'MasVnrType', 'Electrical'}
impute_const_cols = [col for col in missing_obj_cols if col not in freq_imputed_exceptions]
const_col_set = set(impute_const_cols)
impute_freq_cols = [col for col in obj_cols if col not in const_col_set]

# Quality/condition columns sharing the ordered scale defined in `cat` below.
ord_enc_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual',
                'GarageCond', 'PoolQC']
ord_col_set = set(ord_enc_cols)

# Remaining object columns: 8 or more distinct training values -> integer
# (label) encoding, fewer -> one-hot encoding.
non_ord_enc_cols = [col for col in obj_cols if col not in ord_col_set]
label_enc_cols = [col for col in non_ord_enc_cols if X_train[col].nunique() >= 8]
label_col_set = set(label_enc_cols)
oh_enc_cols = [col for col in non_ord_enc_cols if col not in label_col_set]

# Most frequent value of each column, learned from the TRAINING data only and
# captured before any filling. The same values are applied to the test set so
# both frames are imputed consistently.
most_frequent = {col: X_train[col].value_counts().index[0] for col in impute_freq_cols}

# Impute categorical variables in X_train
X_train[impute_const_cols] = X_train[impute_const_cols].fillna(value='Absent')
X_train[impute_freq_cols] = X_train[impute_freq_cols].fillna(value=most_frequent)

# Impute categorical variables in X_test (with the training-set fill values)
X_test[impute_const_cols] = X_test[impute_const_cols].fillna(value='Absent')
X_test[impute_freq_cols] = X_test[impute_freq_cols].fillna(value=most_frequent)

# Ordered quality scale, worst to best; 'Absent' is the constant fill value used above.
cat = ['Absent', 'Po', 'Fa', 'TA', 'Gd', 'Ex']

# Ordinal-encode the quality columns: fit on X_train ...
ord_encoder = OrdinalEncoder(categories=[cat] * len(ord_enc_cols))
X_encoded = pd.DataFrame(
    ord_encoder.fit_transform(X_train[ord_enc_cols]),
    index=X_train.index,
    columns=ord_enc_cols,
)
X_train[ord_enc_cols] = X_encoded

# ... and apply the already-fitted encoder to X_test.
X_encoded_test = pd.DataFrame(
    ord_encoder.transform(X_test[ord_enc_cols]),
    index=X_test.index,
    columns=ord_enc_cols,
)
X_test[ord_enc_cols] = X_encoded_test

# Display the columns that will be label-encoded.
label_enc_cols

In [None]:
from sklearn.preprocessing._encoders import _BaseEncoder
class new_OrdinalEncoder(_BaseEncoder):
    """Integer-encode DataFrame columns, tolerating values unseen during fitting.

    For every selected column, ``fit`` builds a mapping from the column's unique
    values (in ``np.unique`` order) to the integers ``0 .. k-1``. ``transform``
    replaces each value by its code; a value that is not in the mapping is
    encoded as ``k`` (the size of that column's mapping).

    Parameters
    ----------
    cat_index : 'all' or sequence of int, default='all'
        Positional indices of the columns to encode. ``'all'`` selects every
        column of the frame passed to ``fit``.

    Attributes
    ----------
    dicts : dict
        ``{column position: {value: code}}``, rebuilt on every ``fit``.
    cat_index_ : list of int
        Column positions resolved during the last ``fit``.
    """

    def __init__(self, cat_index='all'):
        self.dicts = {}
        # cat_index is the categorical feature index list (or 'all')
        self.cat_index = cat_index

    def _resolve_index(self, df):
        """Return the positional column indices to encode for ``df``."""
        # isinstance guard: comparing an array-like directly to 'all' would be
        # an element-wise comparison with no single truth value.
        if isinstance(self.cat_index, str) and self.cat_index == 'all':
            return list(range(df.shape[1]))
        return list(self.cat_index)

    def fit(self, df, *y):
        """Learn the value -> integer mapping of every selected column.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame whose columns are addressed positionally via ``iloc``.
        *y : ignored
            Accepted so the estimator can be used where a target is passed.

        Returns
        -------
        self
        """
        # Resolve into a fitted attribute instead of overwriting the constructor
        # parameter, so refitting on a frame of another width works correctly.
        self.cat_index_ = self._resolve_index(df)
        self.dicts = {}
        for feat in self.cat_index_:
            values = np.unique(df.iloc[:, feat])
            self.dicts[feat] = {value: code for code, value in enumerate(values)}
        return self

    def fit_transform(self, df, *y):
        """Fit on ``df`` and return a copy of it with the selected columns encoded."""
        return self.fit(df, *y).transform(df)

    def transform(self, df):
        """Return a copy of ``df`` with the fitted columns replaced by their codes.

        Values missing from a column's mapping are encoded as ``len(mapping)``.
        """
        df_output = df.copy()
        for feat in self.cat_index_:
            dic = self.dicts[feat]
            df_output.iloc[:, feat] = df.iloc[:, feat].apply(self.unknown_value, args=(dic,))
        return df_output

    def unknown_value(self, value, dic):
        """Return ``dic[value]``, or ``len(dic)`` as the code for an unknown value."""
        try:
            return dic[value]
        except (KeyError, TypeError):
            # KeyError: value not seen during fit; TypeError: unhashable value.
            return len(dic)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Gradient-boosted regressor used as the final estimator.
model = XGBRegressor(n_estimators=700, learning_rate=0.04, random_state=0)

# Encoders for the two groups of nominal columns.
le_encoder = new_OrdinalEncoder()
oh_encoder = OneHotEncoder(handle_unknown='ignore')

# Column-wise preprocessing: median-impute the numeric columns, integer-encode
# `label_enc_cols`, one-hot encode `oh_enc_cols`; every column not listed here
# is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_imputer', SimpleImputer(strategy='median'), num_cols),
        ('le_enc', le_encoder, label_enc_cols),
        ('oh_enc', oh_encoder, oh_enc_cols),
    ],
    remainder='passthrough',
)

# Chain preprocessing and model so both are fitted together.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

my_pipeline.fit(X_train, y_train)
# Optional 4-fold cross-validated MAE (disabled):
#score = -1 * (cross_val_score(my_pipeline, X_train, y_train, cv = 4, scoring='neg_mean_absolute_error'))
#score.mean()

In [None]:
"""
def get_score(n):
    model = RandomForestRegressor(n_estimators = n, random_state=0)
    my_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    score = -1 * (cross_val_score(my_pipeline, X_train, y_train, cv = 4, scoring='neg_mean_absolute_error'))
    return score.mean()

estimators_list = []
for n in estimators_list:
    print(n)
    print(get_score(n))

"""
pass

In [None]:
"""
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__n_estimators': [500, 600, 700, 800],
    'model__learning_rate': [0.02, 0.04, 0.05, 0.08]
}
grid_search = GridSearchCV(estimator = my_pipeline, cv = 4, param_grid = param_grid, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)
"""
pass

In [None]:
# Predict sale prices for the preprocessed test set with the fitted pipeline.
test_predictions = my_pipeline.predict(X_test)

# Two-column submission table: the test row ids and their predicted prices.
output = pd.DataFrame(
    {
        'Id': X_test.index,
        'SalePrice': test_predictions,
    }
)

# Write without the DataFrame's own index, leaving only the two columns above.
output.to_csv('submission.csv', index=False)