In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
## Display all the columns of the dataframe
# Use the documented top-level option API; `pd.pandas` is not a documented attribute.
pd.set_option('display.max_columns', None)

# Competition data as provided by the Kaggle input mount.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
train.head()

In [None]:
train.info()

Now let's explore the target variable we have to predict.

In [None]:
train['SalePrice'].describe()

In [None]:
print("Skewness = ", train['SalePrice'].skew())
print("Kurtosis = ", train['SalePrice'].kurtosis())

In [None]:
from scipy import stats
from scipy.stats import norm, skew
import seaborn as sns
import matplotlib.pyplot as plt
# Histogram of the raw target with a normal curve fitted by seaborn (fit=norm).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is kept so the figure is unchanged.
sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the non-raw
# literal triggers an invalid-escape warning; the rendered label is identical.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot (drawn on a new figure so it does not overwrite the histogram)
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

The data is clearly right-skewed. Linear regression models generally perform better when the target is approximately normally distributed.
The lowest and highest values make the distribution deviate from normal, so let's apply a variable transform that diminishes this difference.
A log transform seems viable for our purpose here.

In [None]:
# We use the numpy function log1p, which applies log(1+x) to all elements of the column.
# Plain column assignment replaces the column with the float result instead of writing
# floats into the existing column in place through `.loc[:, col] = ...`.
# NOTE: this cell is not idempotent - re-running it log-transforms the target a second time.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Check the new distribution
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept so the figure is unchanged.
sns.distplot(train['SalePrice'], fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Now plot the distribution.
# Raw string: "\m" and "\s" are not valid Python escape sequences, so the non-raw
# literal triggers an invalid-escape warning; the rendered label is identical.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Get also the QQ-plot (drawn on a new figure so it does not overwrite the histogram)
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

The data is now much closer to a normal distribution.

**Removing outliers as suggested by Dean De Cock, author of the Ames housing dataset**

Dean De Cock, author of the Ames housing dataset, recommended removing the 5 outliers, representing unusual sales, with GrLivArea greater than 4000 square feet. Ref: www.amstat.org/publications/jse/v19n3/decock.pdf

In [None]:
# Training rows with more than 4000 sq ft of above-ground living area
train.loc[train['GrLivArea'] > 4000]

In [None]:
# Test rows with more than 4000 sq ft of above-ground living area
test.loc[test['GrLivArea'] > 4000]

In [None]:
plt.grid()
scatter = sns.regplot(x='GrLivArea', y='SalePrice', fit_reg =False, data=train)

Two of the houses are outliers: very large houses priced relatively low. The two at the top of the scatter plot are very large houses with commensurate sale prices. The 2 outliers in the train set will be removed.

In [None]:
# Remove the two low-priced, very large houses (identified by Id) from the train set
outlier_ids = [524, 1299]
outlier_rows = train.index[train['Id'].isin(outlier_ids)]
train = train.drop(index=outlier_rows)


**Combining test and train data**

To one-hot encode the categorical features consistently, we combine the train and test data.

In [None]:
ntrain = train.shape[0]
ntest = test.shape[0]

In [None]:
# Keep the (log-transformed) target of the training rows as a one-column frame
train_y = train['SalePrice'].to_frame()

# Stack train on top of test with a fresh 0..n-1 row index
combined = pd.concat([train, test], ignore_index=True)

# Set the identifier aside; it is re-attached when the frame is split back later
ID = combined['Id']
data = combined.drop(columns='Id')
data.shape

In [None]:
data = data.drop(['SalePrice'], axis = 1)

In [None]:
data.head()

**Now let's handle missing values**

In [None]:
# determine the threshold for missing values
def percent_missing(df):
    """Return the percentage of missing values per column.

    Parameters
    ----------
    df : anything accepted by ``pd.DataFrame``
        The table to inspect.

    Returns
    -------
    dict
        Maps each column label (in column order) to the share of null
        entries in that column, in percent, rounded to 2 decimals.
    """
    frame = pd.DataFrame(df)
    return {
        column: round(frame[column].isnull().mean() * 100, 2)
        for column in list(frame)
    }

missing = percent_missing(data)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:20]

In [None]:
# Handle missing values for features where median/mean or most common value doesn't make sense.
# Every column is filled from its OWN values; the fill value is what the data description
# (or common sense) implies for "NA" in that column.

# Categorical columns: NA -> explicit category label
categorical_fill = {
    # Alley : data description says NA means "no alley access"
    "Alley": "None",
    # BsmtQual etc : data description says NA for basement features is "no basement"
    "BsmtQual": "No",
    "BsmtCond": "No",
    # "No" is already a real BsmtExposure level (no exposure), so a missing basement
    # gets its own label; the ordinal encoding further down ranks 'None' below 'No'.
    "BsmtExposure": "None",
    "BsmtFinType1": "No",
    "BsmtFinType2": "No",
    # CentralAir : NA most likely means No
    "CentralAir": "N",
    # Condition : NA most likely means Normal
    "Condition1": "Norm",
    "Condition2": "Norm",
    # External stuff : NA most likely means average
    "ExterCond": "TA",
    "ExterQual": "TA",
    "Exterior1st": "None",
    "Exterior2nd": "None",
    # Fence : data description says NA means "no fence"
    "Fence": "No",
    # FireplaceQu : data description says NA means "no fireplace"
    "FireplaceQu": "No",
    # Functional : data description says NA means typical
    "Functional": "Typ",
    # GarageType etc : data description says NA for garage features is "no garage"
    "GarageType": "No",
    "GarageFinish": "No",
    "GarageQual": "No",
    "GarageCond": "No",
    # HeatingQC : NA most likely means typical
    "HeatingQC": "TA",
    # KitchenQual : NA most likely means typical
    "KitchenQual": "TA",
    # LotShape : NA most likely means regular
    "LotShape": "Reg",
    # MasVnrType : NA most likely means no veneer
    "MasVnrType": "None",
    # MiscFeature : data description says NA means "no misc feature"
    "MiscFeature": "No",
    # PavedDrive : NA most likely means not paved
    "PavedDrive": "N",
    # PoolQC : data description says NA means "no pool"
    "PoolQC": "No",
    # SaleCondition : NA most likely means normal sale
    "SaleCondition": "Normal",
    # Utilities : NA most likely means all public utilities
    "Utilities": "AllPub",
    # SaleType : NA most likely means Other
    "SaleType": "Oth",
}

# Numeric columns: NA most likely means the feature is absent, i.e. 0
zero_fill_columns = [
    "BedroomAbvGr",
    "BsmtFullBath", "BsmtHalfBath",
    # Each basement area keeps its own values; only its missing entries become 0
    "BsmtUnfSF", "BsmtFinSF1", "BsmtFinSF2", "TotalBsmtSF",
    "EnclosedPorch",
    "Fireplaces",
    "GarageYrBlt", "GarageArea", "GarageCars",
    "HalfBath",
    "KitchenAbvGr",
    "LotFrontage",
    "MasVnrArea",
    "MiscVal",
    "OpenPorchSF",
    "PoolArea",
    "ScreenPorch",
    "TotRmsAbvGrd",
    "WoodDeckSF",
]

for column, fill_value in categorical_fill.items():
    data[column] = data[column].fillna(fill_value)
for column in zero_fill_columns:
    data[column] = data[column].fillna(0)


def _most_frequent(values):
    """Most frequent non-null value of a Series, or NaN when it has none."""
    counts = values.value_counts()
    return counts.index[0] if len(counts) else np.nan


# Electrical: NA not explicitly assigned in the data description; fill with the most common
# value among houses of the same Neighborhood and MSSubClass. `transform` keeps the result
# aligned with data's row index, and the overall mode covers groups with no known value.
electrical_group_mode = (
    data.groupby(['Neighborhood', 'MSSubClass'])['Electrical'].transform(_most_frequent)
)
data['Electrical'] = (
    data['Electrical']
    .fillna(electrical_group_mode)
    .fillna(data['Electrical'].mode()[0])
)
# MSZoning: NA not explicitly assigned in the data description will be filled with the mode
data['MSZoning'] = data['MSZoning'].fillna(data['MSZoning'].mode()[0])

In [None]:
# Let's make sure we handled all the missing values.
# The imputation was applied to the combined frame `data`, so that is the frame to
# re-check; `train` still holds the raw, un-imputed columns.
missing = percent_missing(data)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:10]

**Feature Extraction**

Feature Extraction:
1.) Create new features from existing features.
2.) There are 2 types of categorical variables: nominal and ordinal. Ordinal variables have a natural ranking and will be encoded with numeric values.

In [None]:
# Correlation: the most basic way to measure the linear relation between two quantities.
# Restrict to numeric columns explicitly: on pandas >= 2.0 DataFrame.corr() no longer
# drops string columns silently and raises on them instead.
corr = train.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(15,15))

sns.heatmap(corr,vmax=0.9,square=True)
plt.show()


**Top correlated features with SalePrice are: GrLivArea (highly correlated), OverallQual (highly correlated), TotRmsAbvGrd, GarageYrBlt and YearBuilt, 1stFlrSF and TotalBsmtSF, GarageArea and GarageCars, etc.**

In [None]:
# Correlation between numeric columns only: on pandas >= 2.0 DataFrame.corr() raises on
# string columns instead of dropping them silently.
corr = train.select_dtypes(include=[np.number]).corr()
# Sort by absolute correlation with the target, descending, and keep the top 15 entries.
# .abs() is necessary to capture both strong positive and strong negative correlation.
corr_top = corr['SalePrice'].abs().sort_values(ascending=False)[:15]
# The first entry is SalePrice itself (correlation 1.0), so skip it.
top_features = corr_top.index[1:]

corr_top

In [None]:
# Top features and SalePrice: one scatter panel per feature.
# The number of rows follows top_features, so changing the top-N cut-off above cannot
# cause an index error or leave empty panels. squeeze=False keeps `axes` 2-D even when
# there is a single feature.
fig, axes = plt.subplots(nrows=len(top_features), ncols=1, figsize=(6, 30), squeeze=False)
for ax, feature in zip(axes[:, 0], top_features):
    ax.scatter(x=train[feature], y=train['SalePrice'])
    ax.set_xlabel('%s' % feature)
    ax.set_ylabel('SalePrice')

plt.tight_layout()
plt.savefig('./Top_featuresvsSalePrice.jpg',dpi=300,bbox_inches='tight')
plt.show()

In [None]:
# Hand-picked shortlists of feature names, presumably taken from the correlation ranking above.
# NOTE(review): neither list is referenced again in the visible part of the notebook.
# Continuous measurements (areas, year built).
numeric_cols = ['GrLivArea','GarageArea','TotalBsmtSF','1stFlrSF','YearBuilt']
# NOTE(review): named "nominal", but these look like ordered ratings/counts - confirm intent.
nominal_cols = ['OverallQual','GarageCars','FullBath','TotRmsAbvGrd']

In [None]:
# Encode ordinal (ranked) categorical features as integers; a higher rank means better quality.
# The result is assigned back to the column explicitly: `data[col].replace(..., inplace=True)`
# acts on a temporary Series and is not guaranteed to write back into `data`.
ordinal_maps = {
    'KitchenQual':  {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1},
    'FireplaceQu':  {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No': 1},
    'GarageQual':   {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No': 1},
    'GarageCond':   {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No': 1},
    'PoolQC':       {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'No': 1},
    'ExterQual':    {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1},
    'ExterCond':    {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
    'BsmtQual':     {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No': 1},
    'BsmtCond':     {'Ex': 6, 'Gd': 5, 'TA': 4, 'Fa': 3, 'Po': 2, 'No': 1},
    'BsmtExposure': {'Gd': 5, 'Av': 4, 'Mn': 3, 'No': 2, 'None': 1},
    'HeatingQC':    {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1},
}
for col, ranks in ordinal_maps.items():
    # Values without a rank (including already-encoded integers on a re-run) pass through
    # unchanged, so the cell stays idempotent.
    recoded = data[col].map(lambda value, ranks=ranks: ranks.get(value, value))
    # Force a numeric dtype so the arithmetic feature combinations and the numeric-column
    # selection further down treat these columns as numbers; raises ValueError if an
    # unranked label is left in the column.
    data[col] = pd.to_numeric(recoded)

# transform discrete features to categorical features (so they get one-hot encoded later)
data['MSSubClass'] = data['MSSubClass'].astype(str)
data['YrSold'] = data['YrSold'].astype(str)
data['MoSold'] = data['MoSold'].astype(str)

Some features that are related to each other can be combined.

In [None]:
# Build combined features from related source columns, then drop the source columns.
# All combinations are computed from the original columns before any of them is added.
above_grade_sf = data['1stFlrSF'] + data['2ndFlrSF']  # intermediate only, not kept as a column

engineered = {
    'GarageScale': data['GarageCars'] * data['GarageArea'],
    'GarageOrdinal': data['GarageQual'] + data['GarageCond'],
    'AllPorch': data['OpenPorchSF'] + data['EnclosedPorch'] + data['3SsnPorch'] + data['ScreenPorch'],
    'ExterOrdinal': data['ExterQual'] + data['ExterCond'],
    'KitchenCombined': data['KitchenQual'] * data['KitchenAbvGr'],
    'FireplaceCombined': data['FireplaceQu'] * data['Fireplaces'],
    'BsmtOrdinal': data['BsmtQual'] + data['BsmtCond'],
    'BsmtFinishedAll': data['BsmtFinSF1'] + data['BsmtFinSF2'],
    'OverallCombined': data['OverallQual'] + data['OverallCond'],
    'TotalFullBath': data['BsmtFullBath'] + data['FullBath'],
    'TotalHalfBath': data['HalfBath'] + data['BsmtHalfBath'],
    'TotalSF': above_grade_sf + data['TotalBsmtSF'],
    'YrBltAndRemod': data['YearRemodAdd'] + data['YearBuilt'],
}
for new_column, values in engineered.items():
    data[new_column] = values

# Source columns that are now represented by the combined features above
consumed_columns = [
    'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'ExterQual', 'ExterCond',
    'KitchenQual', 'KitchenAbvGr',
    'FireplaceQu', 'Fireplaces',
    'BsmtQual', 'BsmtCond', 'BsmtFinSF1', 'BsmtFinSF2',
    '1stFlrSF', '2ndFlrSF',
    'OverallQual', 'OverallCond',
    'BsmtFullBath', 'FullBath', 'HalfBath', 'BsmtHalfBath',
    'TotalBsmtSF',
    'YearRemodAdd', 'YearBuilt',
]
data = data.drop(columns=consumed_columns)



**Skewed Features**

In [None]:
# Columns whose dtype is not "object" are treated as numeric features
column_types = data.dtypes
numeric_feats = column_types[column_types != "object"].index


def _column_skew(values):
    """Sample skewness of one column, ignoring missing entries."""
    return skew(values.dropna())


# Skew of every numeric feature, most positively skewed first
skewed_feats = data[numeric_feats].apply(_column_skew).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = skewed_feats.to_frame(name='Skew')
skewness.head(10)

Box Cox Transformation of (highly) skewed features

We use the scipy function boxcox1p which computes the Box-Cox transformation of  1+x .

Note that setting  λ=0  is equivalent to log1p used above for the target variable.

In [None]:
# Keep only the features whose absolute skew exceeds 0.75.
# Indexing the frame with a boolean *frame* (`skewness[abs(skewness) > 0.75]`) only masks
# values to NaN and keeps every row, so all numeric features would be transformed and the
# count below would be wrong; filtering on the 'Skew' column actually drops the rows.
skewness = skewness[skewness['Skew'].abs() > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15  # Box-Cox lambda; 0 would be equivalent to log1p
# NOTE: not idempotent - re-running this cell transforms the same columns again.
for feat in skewed_features:
    data[feat] = boxcox1p(data[feat], lam)

**Categorical encoding** Let's one-hot encode the categorical features now.

In [None]:
data = pd.get_dummies(data)
print(data.shape)

In [None]:
data.head()

In [None]:
# Let's make sure we handled all the missing values
missing = percent_missing(data)
df_miss = sorted(missing.items(), key=lambda x: x[1], reverse=True)
print('Percent of missing data')
df_miss[0:5]

In [None]:
# Features where a single value (typically zero) accounts for almost 100% of the rows can cause overfitting and will be dropped

In [None]:
# Collect the columns whose most frequent value covers more than 99.94% of all rows
row_count = len(data)
overfit = [
    column
    for column in data.columns
    if data[column].value_counts().iloc[0] / row_count * 100 > 99.94
]

# ...and remove them from the feature frame
data = data.drop(overfit, axis=1)

In [None]:
# Split the processed frame back into its train and test parts: the first `ntrain` rows
# are the training rows because train was placed first when the frames were concatenated.
# The Id column is re-attached as the first column, so it is part of the feature matrix
# that is passed to the scaler and the models in the following cells.
train_x = pd.concat([ID[:ntrain], data[:ntrain]], axis = 1)
test_x = pd.concat([ID[ntrain:], data[ntrain:]], axis = 1)
print("Train:", train_x.shape)
print("Test:",test_x.shape)

In [None]:
from sklearn.preprocessing import RobustScaler
# Fit the scaler on the training features only, then overwrite train_x with its scaled version.
# NOTE: not idempotent - re-running this cell re-fits the scaler on already scaled values.
rs = RobustScaler()
rs.fit(train_x)
train_x = rs.transform(train_x)
# NOTE(review): test_x is not transformed in this cell; it has to go through
# rs.transform(test_x) before being passed to a model fitted on the scaled train_x.
# train_x = pd.DataFrame(train_x, columns = index)

In [None]:
from sklearn.metrics import make_scorer, mean_squared_error

def mean_squared_error_(ground_truth, predictions):
    """Root mean squared error between the true and predicted values.

    Despite the name, this returns the square root of sklearn's
    ``mean_squared_error``.
    """
    mse = mean_squared_error(ground_truth, predictions)
    return mse ** 0.5


# Scorer wrapper; greater_is_better=False makes sklearn negate the value so that
# "higher is better" holds during model selection.
RMSE = make_scorer(mean_squared_error_, greater_is_better=False)

**1.) LINEAR REGRESSION**

In [None]:
#LETS MAKE TRAINING AND VALIDATION SET

In [None]:
# Splitting the dataset into training (80%) and validation (20%) sets.
# A fixed random_state makes the split - and every score reported below - reproducible
# across kernel restarts.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size = 0.2, random_state = 42)


In [None]:

#building the model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
linreg.fit(X_train, y_train)

#Accuracy
print("R-Squared Value for Training Set: {:.3f}".format(linreg.score(X_train, y_train)))
print("R-Squared Value for Validation Set: {:.3f}".format(linreg.score(X_val, y_val)))

In [None]:
print('The train log RMSE is {:.4f}'.format(mean_squared_error_(linreg.predict(X_train), y_train)))
print('The validation Log RMSE is {:.4f}'.format(mean_squared_error_(linreg.predict(X_val), y_val)))

**2.) Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)

print('R-squared score (training): {:.3f}'.format(ridge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'.format(ridge.score(X_val, y_val)))

In [None]:
print('The train log RMSE is {:.4f}'.format(mean_squared_error_(ridge.predict(X_train), y_train)))
print('The validation Log RMSE is {:.4f}'.format(mean_squared_error_(ridge.predict(X_val), y_val)))

**3.)LASSO Regression**

In [None]:
from sklearn.linear_model import Lasso

lasso = Lasso(max_iter = 1000)
lasso.fit(X_train, y_train)

print('R-squared score (training): {:.3f}'.format(lasso.score(X_train, y_train)))
print('R-squared score (validation): {:.3f}'.format(lasso.score(X_val, y_val)))

In [None]:
print('The train log RMSE is {:.4f}'.format(mean_squared_error_(lasso.predict(X_train), y_train)))
print('The validation Log RMSE is {:.4f}'.format(mean_squared_error_(lasso.predict(X_val), y_val)))

**4.)Random Forest Regression**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from time import time

# Seeded so that repeated runs of the search give the same scores
clf = RandomForestRegressor(random_state=42)

# 4 * 4 * 3 * 4 = 192 candidate settings, each evaluated with 3-fold cross-validation
param_grid = {'min_samples_split': [2, 10, 50, 100],
              'min_samples_leaf': [1, 10, 50, 100],
              'n_estimators': [100, 500, 1000],
              'max_depth': [1, 5, 10, None]
             }

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, scoring= 'neg_mean_squared_error', n_jobs = 6, verbose=True)
start = time()
# Pass the target as a 1-D Series (as the gradient-boosting search below does) rather than
# a one-column frame, which the forest would have to flatten with a warning.
grid_search.fit(X_train, y_train['SalePrice'])

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))

In [None]:
grid_search.best_score_, grid_search.best_params_

In [None]:
rf = RandomForestRegressor(max_depth = 10, min_samples_split= 2, min_samples_leaf= 1)

In [None]:
rf.fit(X_train, y_train)

In [None]:
print('R-squared score (training): {:.3f}'.format(rf.score(X_train, y_train)))
print('R-squared score (validation): {:.3f}'.format(rf.score(X_val, y_val)))

In [None]:
print('The train log RMSE is {:.4f}'.format(mean_squared_error_(rf.predict(X_train), y_train)))
print('The validation Log RMSE is {:.4f}'.format(mean_squared_error_(rf.predict(X_val), y_val)))

In [None]:
# The forest was fitted on RobustScaler-transformed features, so the test features must go
# through the same fitted scaler before prediction (test_x itself is still unscaled).
y_pred = rf.predict(rs.transform(test_x))

In [None]:
y_pred = np.expm1(y_pred)
y_pred

**5.)Gradient Boost Regression**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

clf = GradientBoostingRegressor()

param_grid = {'learning_rate': [0.1, 0.05, 0.01],
              'max_depth': [3, 7, 10],
              'min_samples_leaf': [1, 20, 50, 100],
              "min_samples_split": [2, 10, 25, 50],
              "n_estimators": [1000],
              "subsample": [0.6, 0.8, 1.0]
             }

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=3, scoring= 'neg_mean_squared_error', n_jobs = 6, verbose=True)
start = time()
grid_search.fit(X_train, y_train['SalePrice'])

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.cv_results_['params'])))

In [None]:
# `gbr` was never defined; use the best model found (and refitted) by the grid search above.
gbr = grid_search.best_estimator_
print('The train Log RMSE is {:.4f}'.format(mean_squared_error_(train_y, gbr.predict(train_x))))

**6.) Polynomial regression**

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline


In [None]:
model_new=Pipeline([("poly",PolynomialFeatures(degree=2)),("linear",LinearRegression(fit_intercept=False))])

In [None]:
model_new.fit(X_train,y_train)

In [None]:
#Accuracy
print("R-Squared Value for Training Set: {:.3f}".format(model_new.score(X_train, y_train)))
print("R-Squared Value for Validation Set: {:.3f}".format(model_new.score(X_val, y_val)))

In [None]:
# The pipeline was fitted on RobustScaler-transformed features, so the test features must go
# through the same fitted scaler before prediction (test_x itself is still unscaled).
y_pred_new = model_new.predict(rs.transform(test_x))
y_pred_new

In [None]:
# The models were trained on log1p(SalePrice), so predictions are inverted with expm1 to
# get prices back before writing the submission. np.ravel accepts both (n, 1) and (n,)
# prediction arrays.
my_submission = pd.DataFrame({'Id': test_x.Id, 'SalePrice': np.expm1(np.ravel(y_pred_new))})
# you could use any filename. We choose submission here
my_submission.to_csv('submission.csv', index=False)

In [None]:
my_submission

In [None]:
my_submission