In [61]:
import pandas as pd

In [62]:
data = pd.read_csv("../Intro_ML/Data/train.csv",index_col='Id')

In [63]:
data.dropna(subset=['SalePrice'],axis=0)
y = data.SalePrice
X_full = data.drop(['SalePrice'], axis=1)

In [64]:
y.isnull().any()

False

In [65]:
categorical_cols = [cname for cname in X_full.columns
                   if X_full[cname].dtype=='object' and
                   X_full[cname].nunique()<10]
numerical_cols = [cname for cname in X_full.columns
                 if X_full[cname].dtype in ['int64','float64']]
my_cols = categorical_cols + numerical_cols

In [66]:
X = X_full[my_cols].copy()
X.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,548,0,61,0,0,0,0,0,2,2008
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Feedr,Norm,...,460,298,0,0,0,0,0,0,5,2007
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,608,0,42,0,0,0,0,0,9,2008
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,642,0,35,272,0,0,0,0,2,2006
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,Norm,Norm,...,836,192,84,0,0,0,0,0,12,2008


In [67]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [68]:
numerical_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [69]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

In [70]:
def get_score(n_est):
    my_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=n_est, random_state=0))
    ])
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                 cv=5,
                                 scoring='neg_mean_absolute_error')
    return scores.mean()

In [71]:
results = {i:get_score(i) for i in range(50,420,50)}
results

{50: 17820.591684931504,
 100: 17684.78860958904,
 150: 17603.220223744294,
 200: 17598.839119863016,
 250: 17599.66818082192,
 300: 17561.78423059361,
 350: 17563.871090019573,
 400: 17590.130779109586}

In [72]:
min_val = min(results.values())
which_est = [k for k,v in results.items() if v==min_val]
which_est

[300]

In [73]:
## therefore, estimator=400 is the best
the_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(n_estimators=400, random_state=0))
])

In [74]:
X_test_full = pd.read_csv("../Intro_ML/Data/test.csv", index_col='Id')
X_test = X_test_full[my_cols].copy()

In [75]:
pred1 = the_pipeline.fit(X, y)

In [76]:
pred = the_pipeline.predict(X_test)

In [77]:
len(pred)

1459

In [79]:
pred[0:10]

array([127299.4775, 154731.4925, 180390.2475, 182012.68  , 200962.86  ,
       183910.025 , 166126.69  , 175985.725 , 184669.3675, 120432.1525])

In [80]:
X.shape

(1460, 76)

In [81]:
output = pd.DataFrame({'Id':X_test.index,
                      'SalePrice':pred})

In [82]:
output.to_csv('submission.csv', index=False)