In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score

In [None]:
# Read the train and test CSV files, using the 'Id' column as the row index.
# `df` supplies both features and the SalePrice target further down;
# `df_test` is only used to produce the final predictions.
df = pd.read_csv(
    '../input/home-data-for-ml-course/train.csv',
    index_col='Id',
)
df_test = pd.read_csv(
    '../input/home-data-for-ml-course/test.csv',
    index_col='Id',
)

# Preview the first rows of the training frame.
df.head()

In [None]:
# Scatter plot of SalePrice (y axis) against GrLivArea (x axis).
df.plot.scatter(x='GrLivArea', y='SalePrice')

In [None]:
# Ten columns with the highest correlation to SalePrice (SalePrice itself ranks first).
# Non-numeric columns are dropped explicitly before computing the correlation matrix:
# since pandas 2.0, DataFrame.corr() no longer discards them silently and raises on
# string data, and this frame contains object-dtype columns.
df.select_dtypes(include='number').corr()['SalePrice'].sort_values(ascending=False).head(10)

In [None]:
# Target is the SalePrice column; features are every other column.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Hold out 30% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.shape

In [None]:
# PREPROCESSING

# Percentage of missing values per column (only columns that have any), highest first.
# Printed explicitly: a bare expression in the middle of a cell is evaluated and then
# discarded, so it would never be shown.
null_counts = df.isnull().sum().sort_values(ascending=False)
print(null_counts[null_counts > 0] / len(df) * 100)

# Distinct dtypes present in the frame; they drive the column split below.
print(df.dtypes.unique())

# Separate categorical vs numerical columns.
# Categorical columns are object-dtype columns with at most MAX_CATEGORIES distinct
# values, so that OneHotEncoder does not create a very large number of indicator columns.
MAX_CATEGORIES = 15
categ_cols = [
    col for col in X_train.columns
    if X_train[col].dtype == 'O' and X_train[col].nunique() <= MAX_CATEGORIES
]
num_cols = [
    col for col in X_train.columns
    if X_train[col].dtype in ['int64', 'float64']
]
mycols = categ_cols + num_cols

# Restrict every split to the selected columns (same columns, same order).
# .copy() gives independent frames rather than views of the originals.
X_train = X_train[mycols].copy()
X_valid = X_valid[mycols].copy()
X_test = df_test[mycols].copy()

In [None]:
# Numerical columns: fill missing values with the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the constant 'NA', then one-hot encode.
# handle_unknown='ignore' is passed so categories not seen during fit do not raise
# at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# ColumnTransformer: applies each transformer to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_cols),
        ('categ', categorical_transformer, categ_cols),
    ]
)

In [None]:
# MODELLING

# Baseline: XGBoost regressor with default hyper-parameters, chained after the
# preprocessing step so imputation/encoding are fitted together with the model.
model = XGBRegressor(random_state=0, verbosity=0)
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split, then score on the held-out validation split.
predicts = my_pipeline.fit(X_train, y_train).predict(X_valid)

print('MAE: ', mean_absolute_error(y_valid, predicts))

In [None]:
# Splitting: 5 shuffled folds.
# random_state is fixed so that the fold assignment, and therefore the reported
# score, is the same on every run (same seed as the grid-search splitter).
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Calculate the Mean Absolute Error on each fold.
# scikit-learn reports it negated ('neg_mean_absolute_error'), so flip the sign back.
scores = cross_val_score(
    my_pipeline,
    X_train,
    y_train,
    scoring='neg_mean_absolute_error',
    cv=kfold,
)

print('Average MAE score:', (-scores).mean())


In [None]:
# Use GridSearchCV to find the best hyper-parameters of the XGBoost model.
# The 'model__' prefix routes each parameter to the 'model' step of the pipeline.
# The grid has 6 * 5 * 3 * 4 = 360 combinations, each evaluated on every fold.
param_grid = {'model__n_estimators': [10, 50, 100, 200, 400, 600],
              'model__max_depth': [2, 3, 5, 7, 10],
              'model__min_child_weight': [0.0001, 0.001, 0.01],
              'model__learning_rate': [0.01, 0.1, 0.5, 1]}

kfold = KFold(shuffle=True, random_state=0)  # by default it's 5 folds
gridsearch = GridSearchCV(
    my_pipeline,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=kfold,
    n_jobs=-1,  # n_jobs=-1 --> use all the processors of the machine
)
grid_result = gridsearch.fit(X_train, y_train)

# Report the outcome of the search; without this the result is computed but never shown.
# best_score_ is the negated MAE, so flip the sign for display.
print('Best params:', grid_result.best_params_)
print('Best CV MAE:', -grid_result.best_score_)

In [None]:
# Final model with fixed hyper-parameters.
# NOTE(review): these values are presumably the best combination found by the grid
# search — confirm against its reported result.
final_params = dict(
    n_estimators=400,
    max_depth=3,
    min_child_weight=0.0001,
    learning_rate=0.1,
    verbosity=0,
    random_state=0,
)
final_model = XGBRegressor(**final_params)

# Same preprocessing as the baseline, followed by the tuned regressor.
final_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', final_model),
    ]
)

# Fit on the training split and predict the test set.
final_prediction = final_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
# Build the submission table: one row per test Id with its predicted SalePrice.
output = pd.DataFrame({
    'Id': X_test.index,
    'SalePrice': final_prediction,
})

# File name corrected from the misspelled 'sumbission.csv'.
# index=False because Id is already written as a regular column.
output.to_csv('submission.csv', index=False)