In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("housing_prices.csv")

In [3]:
df.shape

(1460, 81)

In [4]:
df.isnull().sum()[df.isnull().sum() > 0]

LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [5]:
df[df.isnull().sum()[df.isnull().sum() < 100].keys()].isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 74, dtype: int64

In [6]:
df.dropna(subset=df.isnull().sum()[df.isnull().sum() < 100].keys(), inplace=True)

In [7]:
df.drop(columns=df.isnull().sum()[df.isnull().sum() > 0].keys(), inplace=True)

In [8]:
df.isnull().sum()[df.isnull().sum() > 0]

Series([], dtype: int64)

In [9]:
df.dtypes

Id                int64
MSSubClass        int64
MSZoning         object
LotArea           int64
Street           object
                  ...  
MoSold            int64
YrSold            int64
SaleType         object
SaleCondition    object
SalePrice         int64
Length: 74, dtype: object

In [10]:
num_df = df.select_dtypes(include=['int64', 'float64'])

In [11]:
num_df.dtypes

Id                 int64
MSSubClass         int64
LotArea            int64
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
MasVnrArea       float64
BsmtFinSF1         int64
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
1stFlrSF           int64
2ndFlrSF           int64
LowQualFinSF       int64
GrLivArea          int64
BsmtFullBath       int64
BsmtHalfBath       int64
FullBath           int64
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
TotRmsAbvGrd       int64
Fireplaces         int64
GarageYrBlt      float64
GarageCars         int64
GarageArea         int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
PoolArea           int64
MiscVal            int64
MoSold             int64
YrSold             int64
SalePrice          int64
dtype: object

In [12]:
num_df.head()

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,8450,7,5,2003,2003,196.0,706,0,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,9600,6,8,1976,1976,0.0,978,0,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,11250,7,5,2001,2002,162.0,486,0,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,9550,7,5,1915,1970,0.0,216,0,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,14260,8,5,2000,2000,350.0,655,0,...,192,84,0,0,0,0,0,12,2008,250000


In [13]:
num_df.drop(columns=['Id'], inplace=True)
num_df.head()

Unnamed: 0,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,60,8450,7,5,2003,2003,196.0,706,0,150,...,0,61,0,0,0,0,0,2,2008,208500
1,20,9600,6,8,1976,1976,0.0,978,0,284,...,298,0,0,0,0,0,0,5,2007,181500
2,60,11250,7,5,2001,2002,162.0,486,0,434,...,0,42,0,0,0,0,0,9,2008,223500
3,70,9550,7,5,1915,1970,0.0,216,0,540,...,0,35,272,0,0,0,0,2,2006,140000
4,60,14260,8,5,2000,2000,350.0,655,0,490,...,192,84,0,0,0,0,0,12,2008,250000


In [14]:
features = num_df.drop(columns = 'SalePrice', axis=1)
prices = num_df['SalePrice']

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
from sklearn import linear_model

In [17]:
total = 0
maxScore = 0
for i in range(200):
  X_train, X_test, y_train, y_test = train_test_split(features, prices, test_size = 0.3)

  linear_reg = linear_model.LinearRegression()

  linear_reg = linear_reg.fit(X_train, y_train)

  temp = linear_reg.score(X_test, y_test)
  total = total + temp
  if (temp > maxScore): 
    maxScore = temp
    bestCoef = linear_reg.coef_

print(total/200)
print(maxScore)

0.7413215125618581
0.8631594868345676


In [18]:
for i in range(X_train.keys().size):
    print(X_train.keys()[i], bestCoef[i])

MSSubClass -159.96850919709436
LotArea 0.33932531784203634
OverallQual 19768.583596392244
OverallCond 5095.017464995656
YearBuilt 287.0135107664449
YearRemodAdd 145.45151914776926
MasVnrArea 30.443711248540883
BsmtFinSF1 7.1967334104846055
BsmtFinSF2 -0.2897085854739359
BsmtUnfSF -0.5911808111240775
TotalBsmtSF 6.3158440116984815
1stFlrSF 21.486952396033576
2ndFlrSF 27.78395981909588
LowQualFinSF -27.129768685124404
GrLivArea 22.141143533466707
BsmtFullBath 10832.872824451817
BsmtHalfBath 4199.825198210636
FullBath 850.9218459380523
HalfBath -4405.33307871348
BedroomAbvGr -8995.537668318633
KitchenAbvGr -27103.580559986174
TotRmsAbvGrd 5036.578608208891
Fireplaces 2512.299506599916
GarageYrBlt -33.16733464798867
GarageCars 16423.661523980318
GarageArea 6.4146286872486264
WoodDeckSF 22.76835263249753
OpenPorchSF -6.4403524009408075
EnclosedPorch -3.8476897944441357
3SsnPorch 28.67376808906927
ScreenPorch 47.849880347997015
PoolArea -12.643247947541319
MiscVal -0.6012713679029287
MoSold 

In [19]:
max(bestCoef)

19768.583596392244

In [20]:
dummies_df = pd.get_dummies(df, columns=df.select_dtypes(exclude=['number']).columns, dtype=float)

In [21]:
dummies_df.shape

(1338, 262)

In [22]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [23]:
from sklearn.preprocessing import LabelEncoder

In [24]:
label_encoder = LabelEncoder()

In [25]:
encoded_df = df.copy()

In [26]:
for col in encoded_df.select_dtypes(exclude=['number']).columns:
    encoded_df[col] = label_encoder.fit_transform(encoded_df[col])

In [27]:
encoded_df.dtypes

Id               int64
MSSubClass       int64
MSZoning         int32
LotArea          int64
Street           int32
                 ...  
MoSold           int64
YrSold           int64
SaleType         int32
SaleCondition    int32
SalePrice        int64
Length: 74, dtype: object

In [28]:
encoded_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,8450,1,3,3,0,4,0,...,0,0,0,0,0,2,2008,8,4,208500
1,2,20,3,9600,1,3,3,0,2,0,...,0,0,0,0,0,5,2007,8,4,181500
2,3,60,3,11250,1,0,3,0,4,0,...,0,0,0,0,0,9,2008,8,4,223500
3,4,70,3,9550,1,0,3,0,0,0,...,272,0,0,0,0,2,2006,8,0,140000
4,5,60,3,14260,1,0,3,0,2,0,...,0,0,0,0,0,12,2008,8,4,250000


In [29]:
encoded_X = encoded_df.drop(columns = 'SalePrice', axis=1)
encoded_Y = encoded_df['SalePrice']

In [30]:
total = 0
maxScore = 0
bestCoef = 0
for i in range(200):
  eX_train, eX_test, ey_train, ey_test = train_test_split(encoded_X, encoded_Y, test_size = 0.3)

  linear_reg = linear_model.LinearRegression()

  linear_reg = linear_reg.fit(eX_train, ey_train)

  temp = linear_reg.score(eX_test, ey_test)
  total = total + temp
  if (temp > maxScore): 
    maxScore = temp
    bestCoef = linear_reg.coef_

print(total/200)
print(maxScore)

0.7610563980789419
0.8761154526253585


In [31]:
for i in range(encoded_X.keys().size):
    print(encoded_X.keys()[i], bestCoef[i])

Id -2.108585097979246
MSSubClass -149.33897138403566
MSZoning 373.9400915906502
LotArea 0.17946577509057704
Street 48453.745822483346
LotShape -1433.9328864141562
LandContour 7117.975478605223
Utilities -60094.81703752222
LotConfig 436.37207651316345
LandSlope 13073.642778996655
Neighborhood 526.4694786934698
Condition1 -1103.1852631892102
Condition2 -16874.844041664564
BldgType -162.97420925281892
HouseStyle -1786.038042201069
OverallQual 11555.030169713735
OverallCond 4567.187751994985
YearBuilt 268.05692566871903
YearRemodAdd 32.95715489138047
RoofStyle 1508.5035098143057
RoofMatl 4509.724080702807
Exterior1st -684.1661842553374
Exterior2nd 20.431394323363747
MasVnrArea 15.79976334719322
ExterQual -8293.677202480018
ExterCond 1087.515247212818
Foundation 1931.8202492569908
BsmtQual -9239.521833382207
BsmtCond 3417.4956902611398
BsmtExposure -3594.51516399776
BsmtFinType1 -1083.5642016475758
BsmtFinSF1 -2.036440438250793
BsmtFinType2 883.0694766962843
BsmtFinSF2 10.391412568809756
Bs

In [32]:
max(bestCoef)

48453.745822483346

In [33]:
sorted(bestCoef)

[-60094.81703752222,
 -24976.702191697404,
 -16874.844041664564,
 -11370.379787125094,
 -9239.521833382207,
 -8293.677202480018,
 -5308.383973698759,
 -3751.675999418014,
 -3594.51516399776,
 -2482.0646656466743,
 -1786.038042201069,
 -1433.9328864141562,
 -1103.1852631892102,
 -1083.5642016475758,
 -997.1313539346056,
 -710.4994021573232,
 -684.1661842553374,
 -653.4805561821493,
 -539.1140437908462,
 -387.3881992484685,
 -328.3557227810792,
 -254.7681530010559,
 -231.20210496714844,
 -202.90035103731577,
 -162.97420925281892,
 -149.33897138403566,
 -34.65229923832521,
 -26.71615700067923,
 -6.501229493315805,
 -2.108585097979246,
 -2.036440438250793,
 0.17946577509057704,
 0.34006118668912677,
 0.5156292844476411,
 1.853742618302931,
 5.137518868439656,
 9.985976795753231,
 10.391412568809756,
 14.582112881942521,
 15.516957227097919,
 15.79976334719322,
 18.45347654870784,
 20.431394323363747,
 32.95715489138047,
 33.37565060843917,
 53.43612721918248,
 73.93178288818672,
 268.05692