In [1]:
import pandas as pd

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test_ids = test['Id']

In [2]:
obj = train.isnull().sum()
for key,value in obj.items():
    print(key,"\t",value)

Id 	 0
MSSubClass 	 0
MSZoning 	 0
LotFrontage 	 259
LotArea 	 0
Street 	 0
Alley 	 1369
LotShape 	 0
LandContour 	 0
Utilities 	 0
LotConfig 	 0
LandSlope 	 0
Neighborhood 	 0
Condition1 	 0
Condition2 	 0
BldgType 	 0
HouseStyle 	 0
OverallQual 	 0
OverallCond 	 0
YearBuilt 	 0
YearRemodAdd 	 0
RoofStyle 	 0
RoofMatl 	 0
Exterior1st 	 0
Exterior2nd 	 0
MasVnrType 	 872
MasVnrArea 	 8
ExterQual 	 0
ExterCond 	 0
Foundation 	 0
BsmtQual 	 37
BsmtCond 	 37
BsmtExposure 	 38
BsmtFinType1 	 37
BsmtFinSF1 	 0
BsmtFinType2 	 38
BsmtFinSF2 	 0
BsmtUnfSF 	 0
TotalBsmtSF 	 0
Heating 	 0
HeatingQC 	 0
CentralAir 	 0
Electrical 	 1
1stFlrSF 	 0
2ndFlrSF 	 0
LowQualFinSF 	 0
GrLivArea 	 0
BsmtFullBath 	 0
BsmtHalfBath 	 0
FullBath 	 0
HalfBath 	 0
BedroomAbvGr 	 0
KitchenAbvGr 	 0
KitchenQual 	 0
TotRmsAbvGrd 	 0
Functional 	 0
Fireplaces 	 0
FireplaceQu 	 690
GarageType 	 81
GarageYrBlt 	 81
GarageFinish 	 81
GarageCars 	 0
GarageArea 	 0
GarageQual 	 81
GarageCond 	 81
PavedDrive 	 0
WoodDeckSF

In [3]:
from sklearn.preprocessing import LabelEncoder

def clean(data):
    # Removing cols
    data.drop(columns=['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'FireplaceQu', 'MasVnrType', 'Utilities', 'Street', 'RoofMatl'] ,inplace=True)
    
    # Label Encoding
    categorical_columns = data.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for col in categorical_columns:
        data[col] = le.fit_transform(data[col])
        
    # Fill missing values
    data = data.apply(lambda x: x.fillna(x.median()), axis=0)
    
    return data

train = clean(train)
test = clean(test)

In [4]:
from scipy import stats

z_threshold = 3
z_scores = stats.zscore(train)

outlier_rows = (z_scores > z_threshold).any(axis=1)

train = train[~outlier_rows]

In [5]:
correlation = train.corr()['SalePrice'].drop('SalePrice')

for key,value in correlation.items():
    print(key,"\t",value)

MSSubClass 	 -0.05519172261091853
MSZoning 	 -0.2722276988900108
LotFrontage 	 0.39946168254953485
LotArea 	 0.3774304616650363
LotShape 	 -0.23784203361820844
LandContour 	 0.025047934956784108
LotConfig 	 -0.04860021726331374
LandSlope 	 nan
Neighborhood 	 0.21316756999532543
Condition1 	 0.1920740539402635
Condition2 	 nan
BldgType 	 -0.06678433348230782
HouseStyle 	 0.19462595573645605
OverallQual 	 0.8195085469126724
OverallCond 	 -0.18778347335281534
YearBuilt 	 0.6502905220277794
YearRemodAdd 	 0.5690966110101872
RoofStyle 	 0.1413340238952136
Exterior1st 	 0.17285938743506668
Exterior2nd 	 0.18643337151677356
MasVnrArea 	 0.42645902175425954
ExterQual 	 -0.6731612173768157
ExterCond 	 0.1832086419369374
Foundation 	 0.5225608493924087
BsmtQual 	 -0.6515476505298372
BsmtCond 	 0.029431372304816256
BsmtExposure 	 -0.27285530198035757
BsmtFinType1 	 -0.04143026058996591
BsmtFinSF1 	 0.3373186971380997
BsmtFinType2 	 0.0660689885240764
BsmtFinSF2 	 -0.10297500878125743
BsmtUnfSF 	 

In [6]:
corr_matrix = train.corr()

price_corr = corr_matrix['SalePrice'].abs()

low_corr_cols = price_corr[price_corr < 0.3]

print(low_corr_cols)

MSSubClass       0.055192
MSZoning         0.272228
LotShape         0.237842
LandContour      0.025048
LotConfig        0.048600
Neighborhood     0.213168
Condition1       0.192074
BldgType         0.066784
HouseStyle       0.194626
OverallCond      0.187783
RoofStyle        0.141334
Exterior1st      0.172859
Exterior2nd      0.186433
ExterCond        0.183209
BsmtCond         0.029431
BsmtExposure     0.272855
BsmtFinType1     0.041430
BsmtFinType2     0.066069
BsmtFinSF2       0.102975
BsmtUnfSF        0.241377
Heating          0.051617
CentralAir       0.249368
Electrical       0.248606
2ndFlrSF         0.294470
LowQualFinSF     0.022091
BsmtFullBath     0.216368
HalfBath         0.259246
BedroomAbvGr     0.253685
Functional       0.147921
GarageQual       0.062716
GarageCond       0.047484
PavedDrive       0.260673
EnclosedPorch    0.199862
ScreenPorch      0.008405
MiscVal          0.077440
MoSold           0.075103
YrSold           0.014383
SaleType         0.055111
SaleConditio

In [7]:
low_corr_cols = price_corr[price_corr < 0.3].index

train.drop(columns=low_corr_cols, inplace=True)
test.drop(columns=low_corr_cols, inplace=True)

In [8]:
train.shape

(935, 32)

In [9]:
train_y = train['SalePrice']
train_x = train.drop('SalePrice', axis=1)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.15, random_state=42)

In [11]:
import xgboost as xgb

xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1,
                          max_depth=5, alpha=10, n_estimators=10)

xg_reg.fit(train_x, train_y)

from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = xg_reg.predict(X_val)

print(np.sqrt(mean_squared_error(y_val, y_pred)))
print(r2_score(y_val, y_pred))

In [12]:
pred = xg_reg.predict(test)

df = pd.DataFrame({'Id': test_ids.values,
                   'SalePrice' : pred
                  })

df.to_csv('submission.csv', index=False)