In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm

## csvからdataを読み込む

In [2]:
data = pd.read_csv("train.csv")
data_cat = data.select_dtypes(include=[object])
data_num = data.select_dtypes(include=[np.number])

## dataを整形する
- 不要なcolumnの削除. 
- 欠陥したdataを埋める. 
- 値を数値化する. 

In [3]:
data_cat.isnull().sum()

MSZoning            0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           37
BsmtCond           37
BsmtExposure       38
BsmtFinType1       37
BsmtFinType2       38
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         0
Functional          0
FireplaceQu       690
GarageType         81
GarageFinish       81
GarageQual         81
GarageCond         81
PavedDrive          0
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
dtype: int64

In [4]:
data_num.isnull().sum()

Id                 0
MSSubClass         0
LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
SalePrice          0
dtype: int64

In [5]:
# columnの削除. 
data_cat = data_cat.drop(["Alley", "PoolQC", "Fence", "MiscFeature"], axis=1)

In [6]:
# nullの値を最頻値で埋めている. 
data_cat.BsmtCond.fillna(data_cat.BsmtCond.value_counts().idxmax(), inplace=True)
data_cat.BsmtQual.fillna(data_cat.BsmtQual.value_counts().idxmax(), inplace=True)
data_cat.BsmtExposure.fillna(data_cat.BsmtExposure.value_counts().idxmax(), inplace=True)
data_cat.BsmtFinType1.fillna(data_cat.BsmtFinType1.value_counts().idxmax(), inplace=True)
data_cat.BsmtFinType2.fillna(data_cat.BsmtFinType2.value_counts().idxmax(), inplace=True)
data_cat.FireplaceQu.fillna(data_cat.FireplaceQu.value_counts().idxmax(), inplace=True)
data_cat.GarageCond.fillna(data_cat.GarageCond.value_counts().idxmax(), inplace=True)
data_cat.GarageFinish.fillna(data_cat.GarageFinish.value_counts().idxmax(), inplace=True)
data_cat.GarageQual.fillna(data_cat.GarageQual.value_counts().idxmax(), inplace=True)
data_cat.GarageType.fillna(data_cat.GarageType.value_counts().idxmax(), inplace=True)
data_cat.Electrical.fillna(data_cat.Electrical.value_counts().idxmax(), inplace=True)
data_cat.MasVnrType.fillna(data_cat.MasVnrType.value_counts().idxmax(), inplace=True)

In [7]:
# nullの値を平均値で埋めている. 
data_num.LotFrontage.fillna(data_num.LotFrontage.mean(), inplace=True)
data_num.GarageYrBlt.fillna(data_num.GarageYrBlt.mean(), inplace=True)
data_num.MasVnrArea.fillna(data_num.MasVnrArea.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [8]:
# 値を数値化する. 
le = LabelEncoder()
data_cat = data_cat.apply(le.fit_transform)
data = pd.concat([data_cat, data_num], axis=1)

## 統計モデルを用いて予測する

In [9]:
X = data.drop(["SalePrice"], axis=1)
Y = pd.DataFrame(data["SalePrice"])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)

In [10]:
est = sm.OLS(Y_train, X_train)
est2 = est.fit()

## 予測の評価を表示する

In [11]:
est2.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared (uncentered):,0.981
Model:,OLS,Adj. R-squared (uncentered):,0.979
Method:,Least Squares,F-statistic:,747.8
Date:,"Sun, 04 Jul 2021",Prob (F-statistic):,0.0
Time:,05:08:35,Log-Likelihood:,-13598.0
No. Observations:,1168,AIC:,27340.0
Df Residuals:,1094,BIC:,27720.0
Df Model:,74,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
MSZoning,-507.0752,1513.562,-0.335,0.738,-3476.887,2462.737
Street,3.619e+04,1.38e+04,2.624,0.009,9123.462,6.33e+04
LotShape,-261.0602,671.878,-0.389,0.698,-1579.376,1057.256
LandContour,1824.4825,1338.879,1.363,0.173,-802.578,4451.543
Utilities,-3.027e+04,3.02e+04,-1.002,0.317,-8.96e+04,2.9e+04
LotConfig,-406.3139,555.129,-0.732,0.464,-1495.551,682.923
LandSlope,4255.0328,3917.411,1.086,0.278,-3431.455,1.19e+04
Neighborhood,299.0145,154.419,1.936,0.053,-3.977,602.006
Condition1,-897.7170,995.636,-0.902,0.367,-2851.288,1055.854

0,1,2,3
Omnibus:,498.589,Durbin-Watson:,1.934
Prob(Omnibus):,0.0,Jarque-Bera (JB):,57629.782
Skew:,-0.954,Prob(JB):,0.0
Kurtosis:,37.359,Cond. No.,1.38e+16
