# <span style="color:Green">もくもくスタディ</span> Kaggle チャレンジ

## <span style="color:Blue">2017年04月16日</span>

## Thanks to Takkii 先生

<center>
<span style="color:Green">浅川伸一</span>
</center>

## ライブラリのインポート

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## データの読み込み

In [None]:
# Load the Kaggle training and test tables from the competition input directory.
train, test = [pd.read_csv(csv_path)
               for csv_path in ('../input/train.csv', '../input/test.csv')]

## データの確認

In [None]:
train.head()

In [None]:
train.columns

In [None]:
# Hold on to the SalePrice column under its own name: the column is popped out
# of `train` further down, and this reference is used later for describe()
# and for plotting.
GrandTruth = train['SalePrice']

## 欠損値のチェック

In [None]:
# Per-column count of missing values in each table, lined up side by side.
NAs = pd.concat(
    [frame.isnull().sum() for frame in (train, test)],
    axis=1,
    keys=['Train', 'Test'],
)

In [None]:
NAs

In [None]:
NAs[NAs.sum(axis=1) > 0]

In [None]:
train.values.shape

## データから従属変数  'SalePrice' を削除

In [None]:
# Remove the target column from `train` in place and keep it as the label vector.
train_labels = train.pop('SalePrice')

In [None]:
train.columns

In [None]:
# Stack the train and test rows into one frame. The outer index level
# ('train' / 'test') lets the two parts be separated again with .loc later.
features = pd.concat([train, test], keys=['train', 'test'])

---

# 滝沢先生によれば目的変数の SalePrice は偏っているので

# 対数変換して正規分布に近づけてしまう

In [None]:
ax = sns.distplot(train_labels)

## 対数変換！

In [None]:
# Work with the natural log of the target from here on; predictions are mapped
# back with np.exp when the final labels are computed.
train_labels = np.log(train_labels)

## 変換後のプロット

In [None]:
ax = sns.distplot(train_labels)

## <span style="color:Green">マッチ ベター</span> な感じになりました

---

## 欠損値の多い変数は捨てる

In [None]:
# First batch of columns removed (in place) from the combined train/test frame.
features.drop(
    labels=[
        'Utilities', 'RoofMatl', 'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
        'Heating', 'LowQualFinSF',
        'BsmtFullBath', 'BsmtHalfBath',
        'Functional',
        'GarageYrBlt', 'GarageArea', 'GarageCond',
        'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
        'PoolArea', 'PoolQC',
        'Fence', 'MiscFeature', 'MiscVal',
    ],
    axis=1,
    inplace=True,
)

In [None]:
features.columns

In [None]:
# Second batch of columns removed (in place) from the combined frame.
# 'TotalBsmtSF' used to appear twice in this list; it is listed once now.
features.drop(['MSSubClass', 'MSZoning', 'LotFrontage', 'Alley', 'MasVnrType',
               'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
               'BsmtFinType2', 'TotalBsmtSF', 'Electrical', 'KitchenAbvGr',
               'KitchenQual', 'FireplaceQu', 'GarageType', 'GarageFinish',
               'GarageQual', 'GarageCars', 'SaleType', 'YrSold', 'MoSold',
               '1stFlrSF', '2ndFlrSF'],
              axis=1, inplace=True)

In [None]:
features.describe()

## データを標準化（平均 0・標準偏差 1 に変換。厳密な意味での白色化とは異なる）

## 平均を引いて標準偏差で割る

In [None]:
# Column means of the numeric features. `features` still contains string
# columns, so restrict the reduction to numeric ones explicitly instead of
# relying on pandas silently skipping them.
features.mean(numeric_only=True)

In [None]:
numeric_features_standardized = (features - features.mean())/features.std()

In [None]:
numeric_features_standardized.describe()

In [None]:
### Splitting features
train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values

---

# 関数定義 決定係数 <span style="color:Red">R2</span> と 二乗平均平方根誤差 <span style="color:Red">RMSE</span> の表示

In [None]:
def get_score(prediction, lables):
    """Print the R2 score and the RMSE of predictions against the true labels.

    Args:
        prediction: Predicted target values.
        lables: True target values (parameter name kept as-is for callers).
    """
    # scikit-learn metrics take (y_true, y_pred). R2 is not symmetric in its
    # arguments, so the true labels must come first.
    print('R2: {}'.format(r2_score(lables, prediction)))
    print('RMSE: {}'.format(np.sqrt(mean_squared_error(lables, prediction))))

## 訓練データセットと検証データセットの値を表示する関数の定義

In [None]:
def train_test(estimator, x_trainn, x_test, y_trainn, y_test):
    """Print an already fitted estimator and its scores on two data splits.

    Args:
        estimator: Fitted model exposing ``predict``.
        x_trainn: Feature matrix of the training split.
        x_test: Feature matrix of the held-out split.
        y_trainn: True targets of the training split.
        y_test: True targets of the held-out split.
    """
    print(estimator)
    # Use the argument, not the notebook-level global `x_train`, so the
    # reported training scores always match the data that was passed in.
    prediction_train = estimator.predict(x_trainn)
    get_score(prediction_train, y_trainn)
    print("テストデータ")
    prediction_test = estimator.predict(x_test)
    get_score(prediction_test, y_test)

---

### 訓練データのシャッフル

In [None]:
# Permute feature rows and labels together; the fixed seed keeps the order repeatable.
train_features, train_labels = shuffle(train_features, train_labels, random_state=5)

### データの分割

In [None]:
# Hold out 10% of the training rows for validation. The split is seeded (same
# seed as the shuffle above) so the reported scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(train_features,
                                                    train_labels,
                                                    test_size=0.1,
                                                    random_state=5)

---

## エラスティックネットを使う

In [None]:
# Elastic-net regression; alpha and the L1 ratio are picked from these grids
# by the estimator's built-in cross-validation.
ENSTest = linear_model.ElasticNetCV(
    alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
    l1_ratio=[.01, .1, .5, .9, .99],
    max_iter=5000,
)
ENSTest.fit(x_train, y_train)
# Print scores on the training split and on the held-out split.
train_test(ENSTest, x_train, x_test, y_train, y_test)

## 5 分割交差検証による決定係数の平均と標準偏差

In [None]:
# 5-fold cross-validation on the full training set. With no `scoring` argument
# the estimator's own score() is used, which for a regressor is R2 and not
# accuracy, so the output is labelled accordingly.
scores = cross_val_score(ENSTest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

---

## 勾配ブースト法によるモデルの訓練

In [None]:
# Gradient-boosted regression trees with Huber loss, fitted on the training split.
GBest = ensemble.GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=3,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
)
GBest.fit(x_train, y_train)
# Print scores on the training split and on the held-out split.
train_test(GBest, x_train, x_test, y_train, y_test)

## 5 分割交差検証による決定係数の平均と標準偏差

In [None]:
# 5-fold cross-validation on the full training set. With no `scoring` argument
# the estimator's own score() is used, which for a regressor is R2 and not
# accuracy, so the output is labelled accordingly.
scores = cross_val_score(GBest, train_features, train_labels, cv=5)
print("R2: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

---

## 再訓練

In [None]:
# Refit both estimators on the complete training set before predicting the
# test set. fit() hands back the estimator itself, so GB_model / ENST_model
# are simply additional names for GBest / ENSTest.
GBest.fit(train_features, train_labels)
GB_model = GBest
ENSTest.fit(train_features, train_labels)
ENST_model = ENSTest

## SalePrice の推定

In [None]:
# Undo the log transform for each model's predictions, then take the
# element-wise mean of the two price estimates.
Final_labels = np.mean(
    [np.exp(GB_model.predict(test_features)),
     np.exp(ENST_model.predict(test_features))],
    axis=0,
)

In [None]:
Final_labels

In [None]:
GrandTruth.describe()

In [None]:
Final_labels.shape

In [None]:
# GrandTruth holds training-set prices while Final_labels holds predictions
# for the test set, i.e. different houses, so pairing them row by row says
# nothing. Compare the two distributions percentile by percentile instead
# (a Q-Q style plot); this also removes the need to trim one array so the
# lengths match.
percentiles = np.linspace(0, 100, 101)
plt.scatter(np.percentile(GrandTruth.values, percentiles),
            np.percentile(Final_labels, percentiles))
plt.xlabel('Training SalePrice percentiles')
plt.ylabel('Predicted test SalePrice percentiles')
plt.show()

In [None]:
## Saving to CSV
# One row per test house: its Id and the averaged price prediction.
submission = pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels})
submission.to_csv('2017-04-16.csv', index=False)