In [1]:
import pandas as pd

# Read the housing dataset from the CSV file in the notebook's directory,
# then preview the first five rows.
housing_df = pd.read_csv('./housing.csv')
housing_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Show every column label of the loaded dataset.
housing_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## 探索型数据分析

In [3]:
import seaborn as sns

# Distribution of house sale prices: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# closest replacement and still returns the matplotlib Axes.
sns.histplot(housing_df['SalePrice'], kde=True, stat='density')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bb851a548>

1. 从图中可以看到，SalePrice大致呈右偏（正偏态）分布，即平均数大于中位数
2. 大部分房屋价格在100000到300000之间
3. 有少量的房屋价格大于500000

In [7]:
# Distribution of living area: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# closest replacement and still returns the matplotlib Axes.
sns.histplot(housing_df['GrLivArea'], kde=True, stat='density')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bb9010888>

In [17]:
# Distribution of basement area: density-normalised histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat='density') is the
# closest replacement and still returns the matplotlib Axes.
sns.histplot(housing_df['TotalBsmtSF'], kde=True, stat='density')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bba656648>

TotalBsmtSF中存在取值为0的数据，表示该房屋没有地下室，因此将这些0置为空值（NaN）

In [12]:
import numpy as np

# A basement area of 0 means the house has no basement; mark those entries as
# missing (NaN). np.nan is used because the np.NAN alias was removed in
# NumPy 2.0 and raises AttributeError there.
housing_df['TotalBsmtSF'] = housing_df['TotalBsmtSF'].replace(0, np.nan)

### 连续型变量之间的相关性

In [10]:
# Relationship between living area and sale price.
# Living area is concentrated roughly between 700 and 2800, and the two are
# positively correlated.
sns.regplot(data=housing_df, x='GrLivArea', y='SalePrice')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bb973f108>

In [14]:
# Relationship between basement area and sale price, with a fitted regression line.
sns.regplot(data=housing_df, x='TotalBsmtSF', y='SalePrice')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bb9757c48>

In [18]:
### 探索离散型变量和连续型变量之间的相关性

In [16]:
# Box plot of sale price grouped by the OverallQual rating.
sns.boxplot(data=housing_df, x='OverallQual', y='SalePrice')

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bba2bf688>

相关系数矩阵热力图

In [19]:
# Columns included in the correlation matrix.
info = [
    'SalePrice',
    'GrLivArea',
    'TotalBsmtSF',
    'GarageArea',
    'YearBuilt',
    'OverallQual',
]
# Annotated heatmap of the pairwise correlations, with the colour scale fixed to [0, 1].
sns.heatmap(housing_df.loc[:, info].corr(), vmin=0, vmax=1, annot=True)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x25bba7689c8>

从上面的热力图可以看出，在所选特征中，SalePrice跟OverallQual的相关性最高，相关系数为0.79；SalePrice跟YearBuilt的相关性最低，相关系数为0.52

## 预测型数据分析

1. 使用探索型数据分析方法，探索跟房价相关的特征
2. 对相关的特征进行数据处理使其更符合数据建模的要求
3. 特征挑选应该同时注重模型的准确率和可解释性

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Fill missing basement areas with 0 so TotalBsmtSF contains no NaNs when it
# is passed to the estimators. The result is assigned back to the frame:
# calling fillna(inplace=True) on a selected column is chained assignment,
# which warns in pandas 2.x and leaves housing_df unchanged under copy-on-write.
housing_df['TotalBsmtSF'] = housing_df['TotalBsmtSF'].fillna(0)

features = ['GrLivArea', 'TotalBsmtSF', 'GarageArea', 'YearBuilt', 'OverallQual']
target = 'SalePrice'

lr = LinearRegression()
# Fixed seed so the random-forest score is reproducible across runs.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
models = [lr, rf]

for model in models:
    # Mean over 5 folds of the negated mean absolute error; the
    # 'neg_mean_absolute_error' scorer reports errors negated, so values
    # closer to zero are better.
    score = cross_val_score(
        model,
        housing_df[features],
        housing_df[target],
        cv=5,
        scoring='neg_mean_absolute_error',
    ).mean()
    print(type(model).__name__, score)

LinearRegression -24462.478063162718
RandomForestRegressor -20512.32682215598
