In [201]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer

In [4]:
# Load the training data
prices_data = pd.read_csv('train.csv')
prices_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Data Exploration

In [5]:
'''
  Display some imformation to know the features' datatype
  and to detect the features that have NULL values.
'''
prices_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [10]:
# Select only integer features and ignore categorical ones
int_prices_data = prices_data.select_dtypes(np.int)
int_prices_data.head()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  int_prices_data = prices_data.select_dtypes(np.int)


Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,8450,7,5,2003,2003,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,9600,6,8,1976,1976,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,11250,7,5,2001,2002,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,9550,7,5,1915,1970,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,14260,8,5,2000,2000,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000


In [11]:
int_prices_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             1460 non-null   int64
 1   MSSubClass     1460 non-null   int64
 2   LotArea        1460 non-null   int64
 3   OverallQual    1460 non-null   int64
 4   OverallCond    1460 non-null   int64
 5   YearBuilt      1460 non-null   int64
 6   YearRemodAdd   1460 non-null   int64
 7   BsmtFinSF1     1460 non-null   int64
 8   BsmtFinSF2     1460 non-null   int64
 9   BsmtUnfSF      1460 non-null   int64
 10  TotalBsmtSF    1460 non-null   int64
 11  1stFlrSF       1460 non-null   int64
 12  2ndFlrSF       1460 non-null   int64
 13  LowQualFinSF   1460 non-null   int64
 14  GrLivArea      1460 non-null   int64
 15  BsmtFullBath   1460 non-null   int64
 16  BsmtHalfBath   1460 non-null   int64
 17  FullBath       1460 non-null   int64
 18  HalfBath       1460 non-null   int64
 19  Bedroo

In [14]:
# Select the main features
main_features = ['LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
'1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea',  'BsmtFullBath','BsmtHalfBath','FullBath',
'HalfBath','BedroomAbvGr','TotRmsAbvGrd','GarageArea',  'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'SalePrice']

In [18]:
int_prices_data[main_features].describe()

Unnamed: 0,LotArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,...,BedroomAbvGr,TotRmsAbvGrd,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,10516.828082,443.639726,46.549315,567.240411,1057.429452,1162.626712,346.992466,5.844521,1515.463699,0.425342,...,2.866438,6.517808,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,180921.19589
std,9981.264932,456.098091,161.319273,441.866955,438.705324,386.587738,436.528436,48.623081,525.480383,0.518911,...,0.815778,1.625393,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,79442.502883
min,1300.0,0.0,0.0,0.0,0.0,334.0,0.0,0.0,334.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34900.0
25%,7553.5,0.0,0.0,223.0,795.75,882.0,0.0,0.0,1129.5,0.0,...,2.0,5.0,334.5,0.0,0.0,0.0,0.0,0.0,0.0,129975.0
50%,9478.5,383.5,0.0,477.5,991.5,1087.0,0.0,0.0,1464.0,0.0,...,3.0,6.0,480.0,0.0,25.0,0.0,0.0,0.0,0.0,163000.0
75%,11601.5,712.25,0.0,808.0,1298.25,1391.25,728.0,0.0,1776.75,1.0,...,3.0,7.0,576.0,168.0,68.0,0.0,0.0,0.0,0.0,214000.0
max,215245.0,5644.0,1474.0,2336.0,6110.0,4692.0,2065.0,572.0,5642.0,3.0,...,8.0,14.0,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,755000.0


In [161]:
X = int_prices_data[main_features[:-1]]
y = int_prices_data[main_features[-1]]

In [202]:
# Scale the data using StandardScaler
scaler = StandardScaler()
scaler_X = scaler.fit_transform(X)

In [212]:
# Divide the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(scaler_X, y, test_size=0.2)

## Building the model

In [213]:
# train the LinearRegression model
LR = LinearRegression()
LR.fit(x_train, y_train)
preds = LR.predict(x_test)

## Evaluation

In [225]:
print('mean_squared_error:', mean_squared_error(preds, y_test))
print('mean_absolute_error:', mean_absolute_error(preds, y_test))

mean_squared_error: 1688799049.857154
mean_absolute_error: 26680.441705767862


In [179]:
test_files = pd.read_csv('test.csv')

In [None]:
int_test_files = test_files.select_dtypes([np.int, np.float])

In [194]:
int_test_files.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   LotFrontage    1232 non-null   float64
 3   LotArea        1459 non-null   int64  
 4   OverallQual    1459 non-null   int64  
 5   OverallCond    1459 non-null   int64  
 6   YearBuilt      1459 non-null   int64  
 7   YearRemodAdd   1459 non-null   int64  
 8   MasVnrArea     1444 non-null   float64
 9   BsmtFinSF1     1458 non-null   float64
 10  BsmtFinSF2     1458 non-null   float64
 11  BsmtUnfSF      1458 non-null   float64
 12  TotalBsmtSF    1458 non-null   float64
 13  1stFlrSF       1459 non-null   int64  
 14  2ndFlrSF       1459 non-null   int64  
 15  LowQualFinSF   1459 non-null   int64  
 16  GrLivArea      1459 non-null   int64  
 17  BsmtFullBath   1457 non-null   float64
 18  BsmtHalf

In [195]:
imputer = SimpleImputer(strategy='mean')
imputed_X = imputer.fit_transform(int_test_files[main_features[:-1]])

In [None]:
scaler_X = scaler.transform(imputed_X)
test_preds = LR.predict(scaler_X)

In [222]:
submitted_data = {'Id':list(range(1461, 2920)), 'SalePrice':test_preds.flatten()}
pd.DataFrame(submitted_data).to_csv('sample_submission.csv', index=False)