In [32]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

In [33]:
data = pd.concat([train, test])
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


Now we are going to remove columns that have more than 750 null values since we are going to assume that they aren't strongly related to the SalePrice.

Then since we are going to use Linear Regression model which operates on numerical data, we have to convertthe string values into numerical.
For that purpose we are going to use pd.getdummies

Multiplying each column by 1 to map True/False Values into 1 and 0

In [34]:
data = data.dropna(axis = 1, thresh = 750)
data = pd.get_dummies(data)
data = data * 1.0
data = data.fillna(data.mean())
data

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1.0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.0,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915.0,160.0,21.0,1936.0,4.0,7.0,1970.0,1970.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1455,2916.0,160.0,21.0,1894.0,4.0,5.0,1970.0,1970.0,0.0,252.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,2917.0,20.0,160.0,20000.0,5.0,7.0,1960.0,1996.0,0.0,1224.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,2918.0,85.0,62.0,10441.0,5.0,5.0,1992.0,1992.0,0.0,337.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


Now we are going to drop features with strong corelation between them since that indicates that these features carry basically the same information and it would be a waste of time to consider them.

We are also going to drop features which are not corelated to the the output

In [35]:
# Dropping features with corelation between them
covarianceMatrix = data.corr()
list_of_features = [i for i in covarianceMatrix]
dropped_features = set()
for i in range(len(list_of_features)):
    for j in range(i+1, len(list_of_features)):
        feature1 = list_of_features[i]
        feature2 = list_of_features[j]
        if abs(covarianceMatrix[feature1][feature2]) > 0.77:
            dropped_features.add(feature1)
data = data.drop(dropped_features, axis = 1)


# Dropping features that are not corelated with output
not_correlated_output = [column for column in data if abs(data[column].corr(data['SalePrice'])) < 0.05]

data = data.drop(not_correlated_output, axis = 1)

Now it would be a good moment to check for outliers and remove them if there are any

In [38]:
# Separating sets since removing outliers would remove rows
train_processed = data.iloc[:1460]
test_processed = data.iloc[1460:]

# Defining a function that would return outliers that are not in given percentile
def outliers(x):
    quartile_1, quartile_3 = np.percentile(x, [25, 75])
    kept = quartile_3 - quartile_1
    lower_bound = quartile_1 - (kept * 1.5)
    upper_bound = quartile_3 - (kept * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

train_no_outliers = train_processed

for column in train_processed:
    outliers_list = np.ndarray.tolist(outliers(train_processed[column])[0])
    train_no_outliers = train_processed.drop(outliers_list)
    
train_no_outliers = train_processed

In [40]:
X = train_no_outliers.drop('SalePrice', axis = 1)
y = np.log1p(train_no_outliers['SalePrice'])
lin_reg = LinearRegression().fit(X, y)

In [41]:
test_processed = test_processed.drop('SalePrice', axis = 1)
prediction = np.expm1(lin_reg.predict(test_processed))

In [44]:
submission = pd.DataFrame()
submission['Id'] = test['Id']
submission['SalePrice'] = prediction
submission.to_csv('submission.csv', index = False)
submission

Unnamed: 0,Id,SalePrice
0,1461,123732.506835
1,1462,153526.093386
2,1463,172578.902584
3,1464,194047.237376
4,1465,211400.714877
...,...,...
1454,2915,78411.387411
1455,2916,82575.215199
1456,2917,163320.101991
1457,2918,119897.978569
