In [None]:
# Standard imports
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training table, then separate the target (SalePrice) from the features.
X = pd.read_csv('../input/home-data-for-ml-course/train.csv')
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Load the test features that predictions will be made for.
X_test = pd.read_csv('../input/home-data-for-ml-course/test.csv')

In [None]:
# Numeric feature columns (int64 / float64).
# 'Id' is a row identifier rather than a property of the house; because the
# CSV is read without an index column it would otherwise be picked up here
# and fed to the model as a feature, so it is filtered out explicitly.
numerical_cols = [
    cname
    for cname in X.select_dtypes(include=['int64', 'float64']).columns
    if cname != 'Id'
]

In [None]:
# Categorical feature columns: object-dtype columns with fewer than 10
# distinct values, which keeps the one-hot encoded output narrow.
categorical_cols = [
    col
    for col in X.columns
    if X[col].dtype == "object" and X[col].nunique() < 10
]

In [None]:
# All selected feature names, categorical first and numerical second.
# NOTE(review): my_cols is not referenced again in the visible cells; the
# ColumnTransformer picks its columns from the two lists directly.
my_cols = [*categorical_cols, *numerical_cols]

In [None]:
# Numerical preprocessing: replace missing values with a constant. No
# fill_value is given, so SimpleImputer's default is used (0 for numeric data).
numerical_transformer = SimpleImputer(
    strategy='constant',
)

In [None]:
# Categorical preprocessing, applied in order:
#   1. fill missing entries with the column's most frequent value
#   2. one-hot encode; categories not seen during fit are ignored rather
#      than raising an error at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [None]:
# Route each column group to its own transformer. No `remainder` is given,
# so columns in neither list are left out of the transformed output
# (ColumnTransformer's default).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
# model = CatBoostRegressor()

In [None]:
def score(n_estimators):
    """Return the mean 5-fold cross-validated MAE of a random forest.

    A pipeline is built from the module-level ``preprocessor`` followed by a
    ``RandomForestRegressor`` (``random_state=0``) and evaluated on the
    module-level ``X`` and ``y``.

    Args:
        n_estimators: Number of trees in the forest.

    Returns:
        The mean absolute error averaged over the 5 folds, as a positive
        number.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
    candidate = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', forest),
    ])
    # The scorer returns MAE negated (higher is better), so flip the sign back.
    neg_mae = cross_val_score(
        candidate, X, y, cv=5, scoring='neg_mean_absolute_error'
    )
    fold_mae = -1 * neg_mae
    return fold_mae.mean()
    

In [None]:
# Bundle preprocessing and modeling code in a pipeline
# my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                               ('model', model)
#                              ])

In [None]:
# Multiply by -1 since sklearn calculates *negative* MAE
# scores = -1 * cross_val_score(my_pipeline, X, y,
#                               cv=5,
#                               scoring='neg_mean_absolute_error')

# print("MAE scores:\n", scores)


In [None]:
# results = {}
# for i in range(1,20):
#     results[50*i] = score(50*i)
#     print(results[50*i])

In [None]:
# import matplotlib.pyplot as plt
# %matplotlib inline

# plt.plot(list(results.keys()), list(results.values()))
# plt.show()

In [None]:
# Final model: the shared preprocessor followed by a 400-tree random forest.
# NOTE(review): 400 was presumably picked from the commented-out
# n_estimators sweep in this notebook; confirm before changing it.
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', RandomForestRegressor(n_estimators=400, random_state=0))
                             ])

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

# Report the cross-validation result; without this the scores computed
# above are never shown and the run is wasted.
print("MAE scores:\n", scores)
print("Mean MAE:", scores.mean())

In [None]:
# Fit the whole pipeline on the full training set: the preprocessing steps
# are fitted on X and the model is then trained on the transformed features.
my_pipeline.fit(X, y)

In [None]:
# Predict sale prices for the test set; the pipeline runs X_test through its
# preprocessing steps before handing the result to the model.
final_pred = my_pipeline.predict(X_test)

In [None]:
# Assemble the submission table (test Id alongside its predicted SalePrice)
# and write it to submission.csv without the DataFrame index.
output = pd.DataFrame({
    'Id': X_test['Id'],
    'SalePrice': final_pred,
})
output.to_csv('submission.csv', index=False)