# Import Libraries

In [110]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.stats import skew
from scipy import stats
from sklearn.model_selection import cross_val_score,GridSearchCV

# Import Data

In [111]:
# Read the train and test CSVs of the house-prices dataset into two frames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/house-prices-advanced-regression-techniques/train.csv',
        '../input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [112]:
# Show the (rows, columns) shape of both frames on a single line.
print(*(frame.shape for frame in (train_data, test_data)))

In [113]:
# Preview the first rows of the training frame.
train_data.head()

In [114]:
# Preview the first rows of the test frame.
test_data.head()

# Data Preprocessing

### Exploring Target Variable


In [115]:
# Keep an independent copy of the raw (untransformed) sale prices: the
# 'SalePrice' column of train_data is overwritten with its log1p transform
# later in the notebook, and the before/after plots need the original values.
SalePrice = train_data['SalePrice'].copy()
SalePrice.describe()

In [116]:
# Box plot of the sale price column of the training frame.
ax = sns.boxplot(data=train_data, y='SalePrice')

- The target variable is positively skewed.
- We normalize the target (dependent) variable with a logarithmic transformation.

**Log-transforming the target variable**







In [117]:
# Log-transform the target from the raw prices saved in `SalePrice` rather than
# from the column itself, so re-running this cell does not apply log1p twice.
train_data["SalePrice"] = np.log1p(SalePrice)

In [118]:
# Side-by-side distributions: raw prices on the left, log1p prices on the right.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — consider
# migrating to histplot/displot once the target seaborn version is confirmed.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
raw_axis, log_axis = ax
sns.distplot(SalePrice, ax=raw_axis)
sns.distplot(train_data["SalePrice"], axlabel='Transformation SalePrice', ax=log_axis)

In [119]:
# One probability (QQ) plot per series: first the raw prices, then the
# log-transformed column, each drawn on its own figure.
for prices in (SalePrice, train_data['SalePrice']):
    fig = plt.figure()
    res = stats.probplot(prices, plot=plt)
    plt.show()

### Checking Duplicate 

In [120]:
# An 'Id' column has no duplicates when its distinct count equals its
# non-null count; print that check for the train frame, then the test frame.
for frame in (train_data, test_data):
    ids = frame['Id']
    print(ids.nunique() == ids.count())

In [121]:
# pop() removes 'Id' from each frame in place and returns the removed column,
# so the ids stay available while the frames no longer carry them.
train_ID = train_data.pop('Id')
test_ID = test_data.pop('Id')

# Report both frame sizes now that 'Id' is gone.
for message, frame in (
    ("\nThe train data size after dropping Id feature is : {} ", train_data),
    ("The test data size after dropping Id feature is : {} ", test_data),
):
    print(message.format(frame.shape))

### Missing Data


Important questions when thinking about missing data:

* How prevalent is the missing data?
* Is missing data random or does it have a pattern?

**Concat train dataset and test dataset**

In [122]:
# Remember how many rows belong to each split so the combined frame can be
# cut back into train/test later.
ntrain, ntest = len(train_data), len(test_data)

# Target vector (taken from the train frame before the column is dropped).
Y_train = train_data['SalePrice'].values

# Stack train on top of test with a fresh 0..n-1 index, then remove the target.
all_data = pd.concat([train_data, test_data], ignore_index=True)
all_data = all_data.drop(columns=['SalePrice'])
print("all_data size is : {}".format(all_data.shape))

In [123]:
# Count nulls per column once, then derive both the absolute counts and the
# fraction of rows missing, each sorted from most to least missing.
null_counts = all_data.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / all_data.shape[0]).sort_values(ascending=False)

missing_data = pd.concat([total, percent], keys=['Total', 'Percent'], axis=1)
missing_data.head(20)

**How to deal with missing data**

There are 2 ways we can deal with missing data.

1. Delete variables or observations.

When deleting variables: if more than 30–50% of a variable's data is missing, we should delete that variable. The question to ask is 'will we miss this data?' Probably not: it is likely that these variables (e.g. 'PoolQC', 'MiscFeature') are not important aspects to consider when buying a house.

For the remaining cases, where a smaller percentage of the data is missing, we can delete the observations with missing data. Keep in mind that this reduces the size of the dataset.

2. Impute missing values

There are many options we could consider when replacing a missing value:
- a constant value that has meaning within the domain, such as 0, distinct from all other values
- a value from another randomly selected record
- a mean, median or mode value for a column
- a value estimated by another predictive model


Note: not all algorithms fail when there is missing data.
There are algorithms that can be made robust to missing data, such as k-Nearest Neighbors, which can ignore a column in a distance measure when a value is missing. Naive Bayes can also support missing values when making a prediction.
Sadly, the scikit-learn implementations of naive Bayes, decision trees and k-Nearest Neighbors are not robust to missing values, although this is being considered.




**Deleting variables with missing values**

We are going to delete variables whose ratio of missing values is at least 15%.


In [124]:
# Drop every column whose missing fraction is at least 15%.
# The axis is given via `columns=`: passing it positionally (`drop(labels, 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
high_missing_cols = missing_data.index[missing_data['Percent'] >= 0.15]
all_data = all_data.drop(columns=high_missing_cols)
all_data.head()

**Imputing missing values**








- **GarageType, GarageFinish, GarageQual and GarageCond:** Replacing missing data with None


In [125]:
# Categorical garage columns: a missing value becomes the literal 'None'.
garage_cat_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
all_data[garage_cat_cols] = all_data[garage_cat_cols].fillna('None')


- **GarageYrBlt, GarageArea and GarageCars:** Replacing missing data with 0, since no garage means no cars in such a garage

In [126]:
# Numeric garage columns: a missing value becomes 0.
garage_num_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars']
all_data[garage_num_cols] = all_data[garage_num_cols].fillna(0)

- **BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2:** for all these categorical basement-related features, NaN means that there is no basement.

In [127]:
# Categorical basement columns: a missing value becomes the literal 'None'.
basement_cat_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
all_data[basement_cat_cols] = all_data[basement_cat_cols].fillna('None')

- **MasVnrArea and MasVnrType:** NA most likely means no masonry veneer for these houses. We can fill 0 for the area and None for the type.

In [128]:
# Masonry veneer: a missing type becomes "None" and a missing area becomes 0.
for column, filler in (("MasVnrType", "None"), ("MasVnrArea", 0)):
    all_data[column] = all_data[column].fillna(filler)

- **Electrical :**  It has one NA value. Since this feature has mostly 'SBrkr', we can set that for the missing value.

In [129]:
# Replace missing 'Electrical' entries with the column's most frequent value.
most_common_electrical = all_data['Electrical'].mode()[0]
all_data['Electrical'] = all_data['Electrical'].fillna(most_common_electrical)

- **MSZoning (The general zoning classification):** 'RL' is by far the most common value, so we can fill in missing values with 'RL'.

In [130]:
# Replace missing 'MSZoning' entries with the column's most frequent value.
most_common_zoning = all_data['MSZoning'].mode()[0]
all_data['MSZoning'] = all_data['MSZoning'].fillna(most_common_zoning)

- **BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath** : missing values are likely zero for having no basement

In [131]:
# Numeric basement columns: a missing value becomes 0.
basement_num_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
all_data[basement_num_cols] = all_data[basement_num_cols].fillna(0)

- **BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2** : For all these categorical basement-related features, NaN means that there is no basement.

In [132]:
# NOTE(review): these same five columns are already filled with 'None' by an
# earlier cell of this notebook, so this pass should find nothing left to fill.
repeated_basement_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
all_data[repeated_basement_cols] = all_data[repeated_basement_cols].fillna('None')

- **Functional**: data description says NA means typical

In [133]:
# Missing 'Functional' entries are filled with the "Typ" category.
all_data["Functional"] = all_data["Functional"].fillna("Typ")

- **Exterior1st and Exterior2nd** : Again Both Exterior 1 & 2 have only one missing value. We will just substitute in the most common string

In [134]:
# Each exterior column gets its own most frequent value in place of NaN.
for exterior_col in ('Exterior1st', 'Exterior2nd'):
    most_common_exterior = all_data[exterior_col].mode()[0]
    all_data[exterior_col] = all_data[exterior_col].fillna(most_common_exterior)

- **SaleType** : Fill in again with most frequent which is "WD"

In [135]:
# Replace missing 'SaleType' entries with the column's most frequent value.
most_common_sale_type = all_data['SaleType'].mode()[0]
all_data['SaleType'] = all_data['SaleType'].fillna(most_common_sale_type)

**Is there any remaining missing value?**

- **KitchenQual**: Only one NA value, and same as Electrical, we set 'TA' (which is the most frequent) for the missing value in KitchenQual.

In [136]:
# Replace missing 'KitchenQual' entries with the column's most frequent value.
most_common_kitchen = all_data['KitchenQual'].mode()[0]
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(most_common_kitchen)

- **Utilities** : For this categorical feature all records are "AllPub", except for one "NoSeWa" and 2 NA . Since the house with 'NoSewa' is in the training set, this feature won't help in predictive modelling. We can then safely remove it.

In [137]:
# Remove the 'Utilities' column from the combined frame.
all_data = all_data.drop(columns=['Utilities'])

In [138]:
# Largest per-column null count left in the combined frame (0 means none remain).
all_data.isna().sum().max()

## Handling Outliers

Before discussing how outliers can be handled, note that the best way to detect outliers is to inspect the data visually. All other statistical methodologies are open to making mistakes, whereas visualizing the outliers gives a chance to make a decision with high precision.

Statistical methodologies are less precise, but on the other hand they have the advantage of being fast.

There are two different statistical ways of detecting outliers: using the **standard deviation** and using **percentiles**.

After detecting the outliers, we can decide whether to delete them.

**Examples. Detect outliers by visualizing**

In [139]:
# Box plot of 'LotArea' in the training frame to spot extreme lots visually.
ax = sns.boxplot(data=train_data, y='LotArea')

In [140]:
#bivariate analysis saleprice/LotArea
# var = 'LotArea'
# data = pd.concat([train_data['SalePrice'], train_data[var]], axis=1)
# data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));


Some values with a bigger 'LotArea' seem strange and do not follow the crowd. We can define them as outliers and delete them.

**Examples. Detect outliers with Standard Deviation**

If a value's distance from the average is higher than **x * standard deviation**, it can be assumed to be an outlier. Then what should **x** be?

There is no trivial solution for **x**, but usually a value between 2 and 4 seems practical. The default value used here is 3.

In [141]:
# Flag 'LotArea' values more than x standard deviations above the mean.
x = 3
lot_area_mean = train_data['LotArea'].mean()
lot_area_std = train_data['LotArea'].std()
upper_lim = lot_area_mean + lot_area_std * x
lower_lim = lot_area_mean - lot_area_std * x  # computed but not used by the filter below
print(upper_lim)
outliers = train_data.loc[train_data['LotArea'] > upper_lim, 'LotArea']
print(outliers)

**Examples. Detect outliers with Percentiles**
Another mathematical method to detect outliers is to use percentiles.

We can assume that a certain percentage of the values from the top or the bottom are outliers. The key point here is, once again, choosing the percentage value, and this depends on the distribution of the data.

Additionally, a common mistake is using the percentiles according to the range of the data. In other words, if your data ranges from 0 to 100, your top 5% is not the values between 96 and 100. Top 5% means here the values that are out of the 95th percentile of data.




In [142]:
# Percentile-based limits: the 95th and 5th percentiles of 'LotArea'.
# Nothing is dropped in this cell; it only lists values above the upper limit.
lot_area = train_data['LotArea']
upper_lim = lot_area.quantile(.95)
lower_lim = lot_area.quantile(.05)

print(upper_lim)

outliers = lot_area[lot_area > upper_lim]
print(outliers)

Deleting Outliers

In [143]:
# Delete OutLiers
# train_data = train_data[(train_data['LotArea'] < upper_lim) & (train_data['LotArea'] > lower_lim)]

**Note**

Outlier removal is not always safe. We can decide to delete outliers when they are very extreme and clearly wrong, for example 1000 as the age of a person.

There are probably other outliers in the training data. However, removing all of them may badly affect our models if there are also outliers in the test data. That's why, instead of removing them all, we will just try to make some of our models robust to them.

### Transforming numerical variables that are categorical

In [144]:

# Dwelling type, year sold and month sold are numbers in the raw data but are
# treated as categories, so their values are converted to strings.
for column in ('MSSubClass', 'YrSold', 'MoSold'):
    all_data[column] = all_data[column].astype(str)

# OverallCond is turned into a categorical (string) variable as well.
all_data['OverallCond'] = all_data['OverallCond'].apply(str)


### Adding one more important feature

Since area-related features are very important for determining house prices, we add one more feature: the total area of the basement, first floor and second floor of each house.

In [145]:
# Total square footage = basement + first floor + second floor.
basement_sf, first_floor_sf, second_floor_sf = (
    all_data[part] for part in ('TotalBsmtSF', '1stFlrSF', '2ndFlrSF')
)
all_data['TotalSF'] = basement_sf + first_floor_sf + second_floor_sf

### Label encoding some categorical variables which are ordinal

In [146]:
from sklearn.preprocessing import LabelEncoder

# Columns to replace with integer codes.
cols = (
    'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
    'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'BsmtFinType1',
    'BsmtFinType2', 'Functional', 'BsmtExposure', 'GarageFinish', 'LandSlope',
    'LotShape', 'PavedDrive', 'Street', 'CentralAir', 'MSSubClass', 'OverallCond',
    'YrSold', 'MoSold',
)

# Fit a fresh encoder per column on that column's values and store the codes
# back into the same column.
for c in cols:
    lbl = LabelEncoder()
    all_data[c] = lbl.fit_transform(list(all_data[c].values))

# Frame size after encoding.
print('Shape all_data: {}'.format(all_data.shape))

### Skewed Features

In [147]:
# Every column whose dtype is not "object" counts as numeric here.
is_numeric = all_data.dtypes != "object"
numeric_feats = all_data.dtypes[is_numeric].index


def _column_skew(column):
    """Return the skewness of a column after dropping its missing values."""
    return skew(column.dropna())


# Skewness of each numeric feature, most positively skewed first.
skewed_feats = all_data[numeric_feats].apply(_column_skew).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = skewed_feats.to_frame(name='Skew')
skewness

In [148]:
# NOTE(review): despite the name, this selects every feature listed in
# `skewness` — no skew threshold is applied before the log1p transform.
# Confirm whether only strongly skewed features were meant to be transformed.
skewed_features = skewness.index
all_data[skewed_features] = all_data[skewed_features].apply(np.log1p)

## Handling categorical variables

Create dummy (one-hot) variables for all remaining categorical variables with `pd.get_dummies`.

In [149]:
# One-hot encode the combined frame with pandas' get_dummies, then display it.
all_data = pd.get_dummies(all_data)
all_data

In [150]:
# Cut the combined frame back into its two parts by position: the first
# `ntrain` rows are the training rows, everything after them is the test set.
train_data = all_data.iloc[:ntrain]
test_data = all_data.iloc[ntrain:]

# Model Building



3 Best Metrics to evaluate Regression Model

1. R Square

    R square is a good measure to determine how well the model fits the dependent variables. However, it doesn't take into consideration of overfitting problem.
    <img src="https://latex.codecogs.com/svg.latex?\Large&space; R^2= 1 - \frac{\sum_{i} (y_i-y_{ipred})^2}{\sum_{i} (y_i-y_{mean})^2}"/>
    
2. Mean Square Error / Root Mean Square Error(RMSE)
    
    Mean Square Error is an absolute measure of the goodness for the fit.

    <img src="https://latex.codecogs.com/svg.latex?\Large&space; MSE= \frac{1}{N} \sum^{N}_{i=1}  (y_i-y_{ipred})^2 "/>
    
    RMSE is square root of Mean Square Error

3. Mean Absolute Error(MAE)

    Compare to MSE/RMSE, MAE is more direct representation of sum of error terms.
    MSE gives larger penalisation to big prediction error by square it while MAE treats all error the same
    <img src="https://latex.codecogs.com/svg.latex?\Large&space; MAE= \frac{1}{N} \sum^{N}_{i=1}  |y_i-y_{ipred}| "/>

### Split Data

In [151]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    Y_train,
    test_size=0.2,
    random_state=42,
)

### Automatic Feature Selection

The purpose of feature selection is to improve model performance by removing irrelevant features. It also helps reduce the number of dimensions.

There are three basic strategies: **univariate statistic, model-based selection and iterative selection**

**Univariate Statistics**


In univariate statistics, we compute whether there is a statistically significant relationship between each feature and the target. Then the features that are related with the highest confidence are selected.

Univariate tests are often very fast to compute, and don't require building a model. On the other hand, they are completely independent of the model that you might want to apply after the feature selection.

In [152]:
# from sklearn.feature_selection import SelectPercentile
# select = SelectPercentile(percentile=80)
# select.fit(X_train, y_train)
# X_train = select.transform(X_train)
# X_test = select.transform(X_test)

In [153]:
# Display the (rows, columns) shape of the training split.
X_train.shape

### Linear Regression (Ordinary least squares)

Linear regression is the simplest and most classic linear method for regression. Linear regression finds the parameters w and b that minimize the **mean squared error** between the predictions and the true regression targets, y, on the training set.

The mean squared error is the sum of the squared differences between the predictions and the true values, divided by the number of samples.

In [154]:
# Linear Regression Model
# Fit an ordinary least squares model with default settings on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [155]:
# Prediction: apply the fitted linear model to the held-out split.
y_pred = lr.predict(X_test)

**Evaluate Linear Regression**

In [156]:
# R^2 of the linear model on the held-out split (kept in R2 for the summary
# table) and on the training split for comparison.
R2 = lr.score(X_test, y_test)
train_r2 = lr.score(X_train, y_train)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(R2))

In [157]:
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

In [158]:
# Mean Absolute Error (MAE) between the held-out targets and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)

In [159]:
# Summary row for the comparison table: model name, R^2, RMSE, MAE.
E1 = ['Linear Regression', R2, rmse, mae ]

### Ridge Regression
Ridge regression is also a linear model for regression, so the formula it uses to make predictions is the same one used for ordinary least squares. In ridge regression, however, we also want the magnitude of the coefficients to be as small as possible; in other words, all entries of w should be close to zero. This means each feature should have as little effect on the outcome as possible, while still predicting well. This constraint is an example of regularization: explicitly restricting a model to avoid overfitting. The particular kind used by ridge regression is known as L2 regularization.

In [160]:
# Ridge regression with its default settings, scored on both splits.
ridge = Ridge()
ridge.fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))
# A feature counts as "used" when its coefficient is non-zero.
nonzero_coefs = np.count_nonzero(ridge.coef_)
print("Number of features used: {}".format(nonzero_coefs))

As you can see, the training set score of Ridge is lower than for Linear Regression, while the test set score is higher. A less complex model means worse performance on the training set, but better generalization. As we are only interested in generalization performance, we should choose the Ridge model over the LinearRegression model.


In this example, we used the default parameter alpha = 1.0. The optimum setting of alpha depends on the particular dataset we are using. Increasing alpha forces coefficients to move toward zero, which decreases training set performance but might help generalization.

**Searching for the best parameter for Ridge Regression**

In [161]:
# Choose alpha by 5-fold cross-validation on the training split only.
# Scoring the candidates on (X_test, y_test) would leak the hold-out set into
# model selection and make the test-set metrics reported afterwards optimistic.
# Taking the max over the collected scores also avoids seeding the search with
# best_score = 0, which silently picks nothing when every R^2 is negative.
cv_scores = {
    alpha: cross_val_score(Ridge(alpha=alpha), X_train, y_train, cv=5).mean()
    for alpha in [0.01, 0.1, 1, 10, 100]
}
best_alpha = max(cv_scores, key=cv_scores.get)
best_score = cv_scores[best_alpha]  # mean cross-validated R^2 of the winner
print('Best score: {:.2f}'.format(best_score))
print('Best alpha: {:.2f}'.format(best_alpha))

In [162]:
# Refit Ridge with the alpha chosen by the search cell instead of a hard-coded
# 0.1, so this model always matches the search result.
ridge = Ridge(alpha=best_alpha).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))
# A feature counts as "used" when its coefficient is non-zero.
print("Number of features used: {}".format(np.sum(ridge.coef_ != 0)))

In [163]:
# Prediction: apply the fitted ridge model to the held-out split.
y_pred = ridge.predict(X_test)

**Evaluate Ridge Regression**

In [164]:
# R^2 of the ridge model on the held-out split (kept in R2 for the summary
# table) and on the training split for comparison.
R2 = ridge.score(X_test, y_test)
train_r2 = ridge.score(X_train, y_train)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(R2))

In [165]:
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

In [166]:
# Mean Absolute Error (MAE) between the held-out targets and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)

In [167]:
# Summary row for the comparison table: model name, R^2, RMSE, MAE.
E2 = ['Ridge Regression', R2, rmse, mae ]

### Lasso

Lasso is an alternative to Ridge for regularizing linear regression. As with ridge regression, using the Lasso also restricts coefficients to be close to zero, but in a slightly different way, called L1 regularization. The consequence of L1 regularization is that, when using the Lasso, some coefficients are exactly zero. This means some features are entirely ignored by the model. This can be seen as a form of automatic feature selection. Having some coefficients be exactly zero often makes a model easier to interpret and can reveal the most important features of the model.

In [168]:
# Lasso with alpha=0.01; the raised iteration cap gives the solver room to converge.
lasso001 = Lasso(alpha=0.01, max_iter=100000)
lasso001.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(X_test, y_test)))
# A feature counts as "used" when its coefficient is non-zero.
kept_features = np.count_nonzero(lasso001.coef_)
print("Number of features used: {}".format(kept_features))

In [169]:
# Lasso with a much weaker penalty (alpha=0.0001) for comparison.
lasso00001 = Lasso(alpha=0.0001, max_iter=100000)
lasso00001.fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso00001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso00001.score(X_test, y_test)))
# A feature counts as "used" when its coefficient is non-zero.
kept_features = np.count_nonzero(lasso00001.coef_)
print("Number of features used: {}".format(kept_features))

**Prediction**

In [170]:

# Apply the alpha=0.0001 lasso model to the held-out split.
y_pred = lasso00001.predict(X_test)

**Evaluate Lasso Regression**

In [171]:
# R^2 of the lasso model on the held-out split (kept in R2 for the summary
# table) and on the training split for comparison.
R2 = lasso00001.score(X_test, y_test)
train_r2 = lasso00001.score(X_train, y_train)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(R2))

In [172]:
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

In [173]:
# Mean Absolute Error (MAE) between the held-out targets and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)

In [174]:
# Summary row for the comparison table: model name, R^2, RMSE, MAE.
E3 = ['Lasso Regression', R2, rmse, mae ]

### Decision Tree Regression


In [175]:
# Unconstrained decision tree. The seed is fixed (matching the max_depth=5 tree
# fitted afterwards) so the scores quoted in the text are reproducible: without
# it, ties between equally good splits are broken randomly from run to run.
tree = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
print("Training set score: {:.2f}".format(tree.score(X_train, y_train)))
print("Test set score: {:.2f}".format(tree.score(X_test, y_test)))

As expected, the score on the training set is 100%: because the leaves are pure, the tree was grown deep enough that it could perfectly memorize all the labels in the training data. The test set score is around 81%. This model is overfitting.

If we don't restrict the depth of a decision tree, the tree can become arbitrarily deep and complex, overfitting the training data and not generalizing well to new data. Now let's apply pre-pruning to the tree, which stops developing the tree before it perfectly fits the training data. Here we set max_depth = 5. Limiting the depth of the tree decreases overfitting. This leads to a lower score on the training set, but can improve the score on the test set.

In [176]:
# Pre-pruned tree: depth capped at 5, fixed seed.
tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train, y_train)
train_r2 = tree.score(X_train, y_train)
test_r2 = tree.score(X_test, y_test)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(test_r2))

As we can see, Although the test set score increase to 76% accuracy, it's still low but this model is better than the previous model .

**Prediction**

In [177]:
# Apply the current decision tree to the held-out split.
y_pred = tree.predict(X_test)

**Evaluate Decision Tree Regression**

In [178]:
# R^2 of the decision tree on the held-out split (kept in R2 for the summary
# table) and on the training split for comparison.
R2 = tree.score(X_test, y_test)
train_r2 = tree.score(X_train, y_train)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(R2))

In [179]:
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

In [180]:
# Mean Absolute Error (MAE) between the held-out targets and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)

In [181]:
# Summary row for the comparison table: model name, R^2, RMSE, MAE.
E4 = ['Decision Tree Regression', R2, rmse, mae ]

### Random Forests

The main drawback of decision trees is that they tend to overfit the training data. Random forests are one way to address this problem. A random forest is essentially a collection of decision trees, where each tree is slightly different from the others.

In [182]:
# Small forest (10 trees, fixed seed). The printed "accuracy" is the value of
# the regressor's score() method on each split.
forest = RandomForestRegressor(n_estimators=10, random_state=42).fit(X_train, y_train)
train_score = forest.score(X_train, y_train)
test_score = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_score))
print("Accuracy on test set: {:.3f}".format(test_score))

In [183]:
# Larger forest (100 trees, same seed). The printed "accuracy" is the value of
# the regressor's score() method on each split.
forest = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
train_score = forest.score(X_train, y_train)
test_score = forest.score(X_test, y_test)
print("Accuracy on training set: {:.3f}".format(train_score))
print("Accuracy on test set: {:.3f}".format(test_score))

**Prediction**

In [184]:
# Apply the current random forest to the held-out split.
y_pred = forest.predict(X_test)

**Evaluate Random Forests**

In [185]:
# R^2 of the random forest on the held-out split (kept in R2 for the summary
# table) and on the training split for comparison.
R2 = forest.score(X_test, y_test)
train_r2 = forest.score(X_train, y_train)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(R2))

In [186]:
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

In [187]:
# Mean Absolute Error (MAE) between the held-out targets and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)

In [188]:
# Summary row for the comparison table: model name, R^2, RMSE, MAE.
E5 = ['Random Forests', R2, rmse, mae ]

### XGboost

In [189]:
# NOTE(review): this import lives mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
import xgboost
# Base XGBoost regressor with default settings; tuned by the grid search below.
xgb1 = xgboost.XGBRegressor()

In [190]:
# Hyper-parameter grid for the search: 2 learning rates x 2 depths x 1 tree
# count = 4 candidate combinations.
parameters = {
              'learning_rate': [.01, 0.1], # also known as `eta`
              'max_depth': [4,5],
              'n_estimators': [100]}

In [191]:
# Exhaustive search over `parameters` with 5-fold cross-validation, using all
# available cores and printing progress.
xgb_grid = GridSearchCV(
    estimator=xgb1,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
xgb_grid.fit(X_train, y_train)

In [192]:
# Best mean cross-validated score found by the search, and the parameter
# combination that achieved it.
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

**Prediction**

In [193]:
# Predict on the held-out split through the fitted grid-search object.
y_pred = xgb_grid.predict(X_test)

In [194]:
# Score of the tuned XGBoost model on the held-out split (kept in R2 for the
# summary table) and on the training split for comparison.
R2 = xgb_grid.score(X_test, y_test)
train_r2 = xgb_grid.score(X_train, y_train)
print("Training set score: {:.2f}".format(train_r2))
print("Test set score: {:.2f}".format(R2))

In [195]:
# RMSE is the square root of the MSE, so the MSE is computed once and reused.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error: {:.2f}".format(rmse))

In [196]:
# Mean Absolute Error (MAE) between the held-out targets and the current predictions.
mae = mean_absolute_error(y_test, y_pred)
print('Mean Absolute Error: ', mae)

In [197]:
# Summary row for the comparison table: model name, R^2, RMSE, MAE.
E6 = ['XGboost', R2, rmse, mae ]

# Evaluate models

In [198]:
# One row per model, one column per metric, built from the E1..E6 summary rows.
model_rows = [E1, E2, E3, E4, E5, E6]
metric_columns = ['Model', 'R Square', 'RMSE', 'MAE']
df = pd.DataFrame(data=model_rows, columns=metric_columns)
df

### Submission

In [199]:
# Predict with the alpha=0.0001 lasso model and undo the log1p transform that
# was applied to the target, so prices are back on their original scale.
# The DataFrame itself is passed (not `.values`): the model was fitted on a
# DataFrame, and handing it a bare array drops the column names and triggers
# scikit-learn's "X does not have valid feature names" warning.
result = np.expm1(lasso00001.predict(test_data))
result

In [200]:
# Assemble the submission (saved ids next to the predicted prices) and write it
# out without the index column.
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': result})
sub.to_csv('submission.csv', index=False)