In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split  

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold,cross_val_score

from sklearn.metrics import mean_squared_error

# Importing Data

In [2]:
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
print(train.shape)

(1460, 81)


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

# Missing Values

In [5]:
train_na = (train.isnull().sum() / len(train)) * 100      
train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :train_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio
PoolQC,99.520548
MiscFeature,96.30137
Alley,93.767123
Fence,80.753425
FireplaceQu,47.260274
LotFrontage,17.739726
GarageType,5.547945
GarageYrBlt,5.547945
GarageFinish,5.547945
GarageQual,5.547945


**Drop the columns with more than 50% missing values**

In [6]:
train.drop(['PoolQC','MiscFeature','Alley','Fence','Id'],axis=1,inplace=True)
train.shape

(1460, 76)

In [7]:
X = train.drop(['SalePrice'], axis=1)
y = train[['SalePrice']].values.ravel()

# Data processing 

In [8]:
categorical_features = X.select_dtypes(include="object").columns   
integer_features = X.select_dtypes(exclude="object").columns    

In [9]:
for c in categorical_features:      
    lbl = LabelEncoder() 
    lbl.fit(list(X[c].values)) 
    X[c] = lbl.transform(list(X[c].values))

integer_transformer = Pipeline(steps = [
   ('imputer', SimpleImputer(strategy = 'mean')),   
   ('scaler', StandardScaler())])                  

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent'))  
])

In [10]:
preprocessor = ColumnTransformer(                    
   transformers=[
       ('ints', integer_transformer, integer_features),
       ('cat', categorical_transformer, categorical_features)])

In [11]:
X = preprocessor.fit_transform(X)

# Data splitting

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size=0.20, random_state=42,shuffle=False)

In [13]:
print('X_train:',X_train.shape,'y_train:',y_train.shape,'\nX_valid :',X_valid.shape,'y_valid:',y_valid.shape)

X_train: (1168, 75) y_train: (1168,) 
X_valid : (292, 75) y_valid: (292,)


In [14]:
def display_scores(scores):
   print("Scores:", scores)
   print("Mean:", scores.mean())
   print("Standard deviation:", scores.std())

# Implementing Gradient-Boosting-Regressor Model

In [15]:
kf = KFold(n_splits =10, shuffle = True, random_state = 100) 

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)


In [16]:
GBoost.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.05, loss='huber', max_depth=4,
                          max_features='sqrt', min_samples_leaf=15,
                          min_samples_split=10, n_estimators=3000,
                          random_state=5)

In [17]:
scores = cross_val_score(GBoost,X_train, y_train, scoring="neg_mean_squared_error", cv = kf)   
scores = np.sqrt(-scores)

display_scores(scores)

Scores: [23678.64158229 17761.32887507 23687.20215773 25953.04375067
 17758.45469889 40607.11186402 31007.00761562 18897.34671982
 33905.36521331 22736.56396083]
Mean: 25599.206643825055
Standard deviation: 7120.131327371225


In [18]:
y_prediction = GBoost.predict(X_valid)
final_mse = mean_squared_error(y_valid,y_prediction)
final_rmse = np.sqrt(final_mse)
final_rmse

33080.358998846976

The performance will usually be slightly worse than what you measured using cross-validation

# Submission

In [19]:
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv') 
test_ID = test['Id'] 
test.drop(['PoolQC','MiscFeature','Alley','Fence','Id'],axis=1,inplace=True)  


for c in categorical_features:
    lbl = LabelEncoder() 
    lbl.fit(list(test[c].values)) 
    test[c] = lbl.transform(list(test[c].values))

X_test = preprocessor.fit_transform(test)   

prediction = GBoost.predict(X_test)

print(prediction)

[116739.4700387  161770.22593902 180050.6048024  ... 176212.50725458
 125159.96361017 213940.17153616]


In [20]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = prediction
sub.to_csv('submission.csv',index=False)
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,116739.470039
1,1462,161770.225939
2,1463,180050.604802
3,1464,196805.720604
4,1465,170342.189383
