In [1]:
import pandas as pd

In [2]:
# Read the training and test tables, using the 'Id' column as the row index of each.
X, X_test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('./train.csv', './test.csv')
)

In [3]:
# Drop the rows whose target (SalePrice) is missing, then split the target
# column off from the predictors.
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])


In [4]:
# Columns are selected from the training predictors only; the same columns are
# then removed from the test table so both keep an identical layout.
cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [6]:
# Preview the first rows of the training predictors produced by the split.
X_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
619,20,RL,11694,Pave,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,...,108,0,0,260,0,0,7,2007,New,Partial
871,20,RL,6600,Pave,Reg,Lvl,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,0,0,8,2009,WD,Normal
93,30,RL,13360,Pave,IR1,HLS,AllPub,Inside,Gtl,Crawfor,...,0,44,0,0,0,0,8,2009,WD,Normal
818,20,RL,13265,Pave,IR1,Lvl,AllPub,CulDSac,Gtl,Mitchel,...,59,0,0,0,0,0,7,2008,WD,Normal
303,20,RL,13704,Pave,IR1,Lvl,AllPub,Corner,Gtl,CollgCr,...,81,0,0,0,0,0,1,2006,WD,Normal


In [7]:
# Preview the matching training targets (indexed by the same Id values).
y_train.head()

Id
619    314813
871    109500
93     163500
818    271000
303    205000
Name: SalePrice, dtype: int64

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, *, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, X_valid
        Feature tables used to fit the model and to generate predictions.
    y_train, y_valid
        Targets used to fit the model and to evaluate the predictions.
    n_estimators : int, optional
        Forwarded to ``RandomForestRegressor``. Defaults to 100, the value
        this notebook has always used.
    random_state : int, optional
        Forwarded to ``RandomForestRegressor`` so repeated calls are
        comparable. Defaults to 0.

    Returns
    -------
    The value of ``mean_absolute_error(y_valid, predictions)``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [9]:
### Drop columns with categorical data 

In [9]:
# Keep only the non-object (numeric) columns of each split.
drop_X_train, drop_X_valid = (
    split.select_dtypes(exclude=['object'])
    for split in (X_train, X_valid)
)


In [10]:
# Baseline score: model trained on the numeric columns only.
print(
    "MAE from drop columns with categorical data: ",
    score_dataset(drop_X_train, drop_X_valid, y_train, y_valid),
    sep="\n",
)

MAE from drop columns with categorical data: 
17837.82570776256


In [12]:
### Check the Condition2 column: categories seen in training vs. validation data

In [11]:
# Compare the categories each split contains for Condition2; an encoder fitted
# on the training split only knows the training categories.
print(
    "unique values in training data condition2: ",
    X_train['Condition2'].unique(),
)
print(
    "unique values in validation data condition2: ",
    X_valid['Condition2'].unique(),
)

unique values in training data condition2:  ['Norm' 'PosA' 'Feedr' 'PosN' 'Artery' 'RRAe']
unique values in validation data condition2:  ['Norm' 'RRAn' 'RRNn' 'Artery' 'Feedr' 'PosN']


In [12]:
# Treat every column stored with the object dtype as categorical.
object_cols = [
    name
    for name, col_dtype in X_train.dtypes.items()
    if col_dtype == 'object'
]

print("Categorical columns: ", object_cols)
print("Number of categorical columns: ", len(object_cols))

Categorical columns:  ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']
Number of categorical columns:  27


In [13]:
# A categorical column is "good" when every value that occurs in the validation
# split also occurs in the training split.
good_label_cols = [
    col
    for col in object_cols
    if set(X_valid[col]) <= set(X_train[col])
]
print("Good label columns: ", good_label_cols)
print("Number of good label columns: ", len(good_label_cols))

Good label columns:  ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']
Number of good label columns:  24


In [14]:
# Categorical columns that are not "good": the validation split contains at
# least one value the training split never shows. Built with an order-preserving
# comprehension (instead of a set difference) so the list follows the column
# order of object_cols and is identical on every run, regardless of string
# hash randomization.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]
print("Bad label columns: ", bad_label_cols)
print("Number of bad label columns: ", len(bad_label_cols))

Bad label columns:  ['Functional', 'Condition2', 'RoofMatl']
Number of bad label columns:  3


In [15]:
# Lists good_label_cols, the columns handed to OrdinalEncoder in the following cell.
# NOTE(review): the printed label calls these "ordinal", but nothing in this
# notebook establishes that their categories have a natural order — confirm wording.
print("categorical ordinal columns", good_label_cols)

categorical ordinal columns ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleType', 'SaleCondition']


In [16]:
from sklearn.preprocessing import OrdinalEncoder

# Remove the categorical columns whose validation values are not all present in
# the training split; the remaining categorical columns are encoded below.
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Learn the category -> integer mapping from the training split only, then
# apply that same mapping to both splits.
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(label_X_train[good_label_cols])
label_X_train[good_label_cols] = ordinal_encoder.transform(label_X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(label_X_valid[good_label_cols])

print(
    "MAE from label encoding categorical data: ",
    score_dataset(label_X_train, label_X_valid, y_train, y_valid),
    sep="\n",
)

MAE from label encoding categorical data: 
17098.01649543379


In [19]:
import joblib

# Fit the final model on the ordinal-encoded training features.
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(label_X_train, y_train)

# Save the model to a file.
joblib.dump(model, 'model.pkl')
# The model was fitted on features transformed by `ordinal_encoder`, so the
# fitted encoder is saved next to it; without it, raw categorical data cannot
# be converted into the representation the model was trained on.
joblib.dump(ordinal_encoder, 'ordinal_encoder.pkl')
print("Model saved as model.pkl")

Model saved as model.pkl
