In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from sklearn import linear_model, tree, ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import scipy.stats as st
import seaborn as sns
import statsmodels.api as sm

In [45]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [46]:
qual_cols = [f for f in train.columns if train.dtypes[f] == 'object']
quant_cols = [f for f in train.columns if train.dtypes[f] != 'object']

In [47]:
#Save the 'Id' column
train_ID = train['Id']
test_ID = test['Id']

#Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop("Id", axis = 1, inplace = True)
test.drop("Id", axis = 1, inplace = True)

train.rename(columns={'3SsnPorch':'TSsnPorch'}, inplace=True)
test.rename(columns={'3SsnPorch':'TSsnPorch'}, inplace=True)

test['SalePrice'] = 0

In [48]:
def rstr(df, pred=None): 
    obs = df.shape[0]
    types = df.dtypes
    counts = df.apply(lambda x: x.count())
    uniques = df.apply(lambda x: x.unique())
    nulls = df.apply(lambda x: x.isnull().sum())
    distincts = df.apply(lambda x: x.unique().shape[0])
    missing_ration = (df.isnull().sum()/ obs) * 100
    skewness = df.skew()
    kurtosis = df.kurt() 
    print('Data shape:', df.shape)
    
    if pred is None:
        cols = ['types', 'counts', 'distincts', 'nulls', 'missing ration', 'uniques', 'skewness', 'kurtosis']
        mstr = pd.concat([types, counts, distincts, nulls, missing_ration, uniques, skewness, kurtosis], axis = 1)

    else:
        corr = df.corr()[pred]
        mstr = pd.concat([types, counts, distincts, nulls, missing_ration, uniques, skewness, kurtosis, corr], axis = 1, sort=False)
        corr_col = 'corr '  + pred
        cols = ['types', 'counts', 'distincts', 'nulls', 'missing_ration', 'uniques', 'skewness', 'kurtosis', corr_col ]
    
    print(mstr)
    
    mstr.columns = cols
    dtypes = mstr.types.value_counts()
    print('___________________________\nData types:\n',mstr.types.value_counts())
    print('___________________________')
    return mstr

In [49]:
details = rstr(train, 'SalePrice')
display(details.sort_values(by='corr SalePrice', ascending=False))

Data shape: (1460, 80)
                     0     1     2    3          4  \
MSSubClass       int64  1460    15    0   0.000000   
MSZoning        object  1460     5    0   0.000000   
LotFrontage    float64  1201   111  259  17.739726   
LotArea          int64  1460  1073    0   0.000000   
Street          object  1460     2    0   0.000000   
...                ...   ...   ...  ...        ...   
MoSold           int64  1460    12    0   0.000000   
YrSold           int64  1460     5    0   0.000000   
SaleType        object  1460     9    0   0.000000   
SaleCondition   object  1460     6    0   0.000000   
SalePrice        int64  1460   663    0   0.000000   

                                                               5          6  \
MSSubClass     [60, 20, 70, 50, 190, 45, 90, 120, 30, 85, 80,...   1.407657   
MSZoning                               [RL, RM, C (all), FV, RH]        NaN   
LotFrontage    [65.0, 80.0, 68.0, 60.0, 84.0, 85.0, 75.0, nan...   2.163569   
LotArea     

  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,types,counts,distincts,nulls,missing_ration,uniques,skewness,kurtosis,corr SalePrice
SalePrice,int64,1460,663,0,0.000000,"[208500, 181500, 223500, 140000, 250000, 14300...",1.882876,6.536282,1.000000
OverallQual,int64,1460,10,0,0.000000,"[7, 6, 8, 5, 9, 4, 10, 3, 1, 2]",0.216944,0.096293,0.790982
GrLivArea,int64,1460,861,0,0.000000,"[1710, 1262, 1786, 1717, 2198, 1362, 1694, 209...",1.366560,4.895121,0.708624
GarageCars,int64,1460,5,0,0.000000,"[2, 3, 1, 0, 4]",-0.342549,0.220998,0.640409
GarageArea,int64,1460,441,0,0.000000,"[548, 460, 608, 642, 836, 480, 636, 484, 468, ...",0.179981,0.917067,0.623431
...,...,...,...,...,...,...,...,...,...
PoolQC,object,7,4,1453,99.520548,"[nan, Ex, Fa, Gd]",,,
Fence,object,281,5,1179,80.753425,"[nan, MnPrv, GdWo, GdPrv, MnWw]",,,
MiscFeature,object,54,5,1406,96.301370,"[nan, Shed, Gar2, Othr, TenC]",,,
SaleType,object,1460,9,0,0.000000,"[WD, New, COD, ConLD, ConLI, CWD, ConLw, Con, ...",,,


In [50]:
train.corr()['SalePrice'].sort_values(ascending=False)

SalePrice        1.000000
OverallQual      0.790982
GrLivArea        0.708624
GarageCars       0.640409
GarageArea       0.623431
TotalBsmtSF      0.613581
1stFlrSF         0.605852
FullBath         0.560664
TotRmsAbvGrd     0.533723
YearBuilt        0.522897
YearRemodAdd     0.507101
GarageYrBlt      0.486362
MasVnrArea       0.477493
Fireplaces       0.466929
BsmtFinSF1       0.386420
LotFrontage      0.351799
WoodDeckSF       0.324413
2ndFlrSF         0.319334
OpenPorchSF      0.315856
HalfBath         0.284108
LotArea          0.263843
BsmtFullBath     0.227122
BsmtUnfSF        0.214479
BedroomAbvGr     0.168213
ScreenPorch      0.111447
PoolArea         0.092404
MoSold           0.046432
TSsnPorch        0.044584
BsmtFinSF2      -0.011378
BsmtHalfBath    -0.016844
MiscVal         -0.021190
LowQualFinSF    -0.025606
YrSold          -0.028923
OverallCond     -0.077856
MSSubClass      -0.084284
EnclosedPorch   -0.128578
KitchenAbvGr    -0.135907
Name: SalePrice, dtype: float64

In [61]:
important_num_cols = list(train.corr()["SalePrice"][(train.corr()["SalePrice"]>0.45) | (train.corr()["SalePrice"]<-0.45)].index)
cat_cols = ["Neighborhood", "HeatingQC", "SaleCondition", "KitchenQual", "ExternQual", "BsmtQual"]
important_cols = important_num_cols + cat_cols

In [None]:
train['TotalSF'] = train['TotalBsmtSF'] + train['1stFlrSF'] + train['2ndFlrSF']
train_df['Age2022'] = 2022 - train_df['YearBuilt']