# Zillow Prediction

* dtypes, fillna, datetime
* 相关性分析，去除不相关的特征
* 确定问题是回归问题
* 选择模型之间的相关度，进行融合

In [1]:
# Basic packages
import csv
import time
import math
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
# from scipy.stats import uniform, randint

# Model Persistence
# import cPickle

# Machine Learning
from sklearn import model_selection, preprocessing, metrics
from sklearn import ensemble, linear_model
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.semi_supervised import LabelSpreading
from sklearn.model_selection import cross_val_score,cross_val_predict, train_test_split, StratifiedKFold
from sklearn.neural_network import BernoulliRBM, MLPClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, scorer, make_scorer

# XGBoost
# import xgboost as xgb
# from xgboost import XGBClassifier

# Ensemble Learning
from mlens.ensemble import SuperLearner, SequentialEnsemble
from mlens.metrics import make_scorer
from mlens.model_selection import Evaluator
from mlens.preprocessing import EnsembleTransformer
from mlens.visualization import corr_X_y, corrmat

# Imbalanced Learning
# import imblearn
# from imblearn.combine import SMOTETomek

# Count
from collections import Counter

# Visualization
import seaborn as sns
color = sns.color_palette()

%matplotlib inline
import matplotlib.pyplot as plt

# Random Seed
SEED = 2017

"""
载入原始数据集，保留所有的数据特点，
用于特征工程分析、半监督学习
"""
def load_original():
    """Load the three raw Zillow CSV files without any preprocessing.

    All columns are kept untouched so the frames can be used for feature
    engineering analysis and for semi-supervised learning. The shape of each
    frame is printed as a quick sanity check.

    Returns:
        tuple: ``(properties16, train16, sample_submission)`` DataFrames read
        from ``Data/properties_2016.csv``, ``Data/train_2016_v2.csv`` (with
        ``transactiondate`` parsed as dates) and
        ``Data/sample_submission.csv``.
    """
    # Feature table: u'parcelid', ...... (58 columns in total).
    # Per the original notes there are no duplicated parcelid values.
    properties16 = pd.read_csv('Data/properties_2016.csv')
    # Training labels: u'parcelid', u'logerror', u'transactiondate'.
    # Per the original notes rows are unique but a parcelid can repeat.
    train16 = pd.read_csv('Data/train_2016_v2.csv', parse_dates=["transactiondate"])
    # Prediction template: u'ParcelId', u'201610', u'201611', u'201612',
    # u'201710', u'201711', u'201712'; no duplicated ParcelId.
    sample_submission = pd.read_csv('Data/sample_submission.csv')

    # Single-argument print calls produce the same output on Python 2 and 3.
    print('properties16.shape\t%s' % (properties16.shape,))
    print('train16.shape\t\t%s' % (train16.shape,))
    print('sample_submission.shape\t%s' % (sample_submission.shape,))

    return properties16, train16, sample_submission


"""
载入训练数据集，放弃特征数据集中的无标签/logerror数据，
用于有监督学习
"""
def load_supervision():
    """Load the labelled training set and the matching test features.

    This follows the minimally prepared dataset from the anokas XGBoost
    starter script:
    https://www.kaggle.com/anokas/simple-xgboost-starter-0-0655

    float64 property columns are down-cast to float32, three free-text
    columns are dropped, the remaining object columns become 0/1 flags and
    every remaining NaN is replaced by 0. Train and test go through the same
    preparation so their columns line up.

    Returns:
        tuple: ``(df, y, sample, columns)`` where ``df`` holds the training
        features, ``y`` the ``logerror`` target, ``sample`` the features for
        every ``ParcelId`` in the sample submission, and ``columns`` is
        ``df.columns``.
    """
    labelled = pd.read_csv('Data/train_2016_v2.csv')

    prop = pd.read_csv('Data/properties_2016.csv')
    float_cols = prop.columns[(prop.dtypes == 'float64').values]
    # astype() guarantees the down-cast; assigning through .loc may keep the
    # original float64 dtype on recent pandas versions.
    prop = prop.astype({col: np.float32 for col in float_cols})

    labelled = labelled.merge(prop, how='left', on='parcelid')
    y = labelled.logerror
    df = labelled.drop(['parcelid', 'logerror', 'transactiondate'] + _TEXT_COLUMNS, axis=1)
    df = _encode_flags_and_fill(df)

    # Test features: one row per ParcelId of the sample submission.
    sample = pd.read_csv('Data/sample_submission.csv')
    sample = sample[['ParcelId']].rename(columns={'ParcelId': 'parcelid'})
    sample = sample.merge(prop, how='left', on='parcelid')
    sample = sample.drop(['parcelid'] + _TEXT_COLUMNS, axis=1)
    sample = _encode_flags_and_fill(sample)

    return df, y, sample, df.columns


# Free-text property columns that are dropped instead of being encoded.
_TEXT_COLUMNS = ['propertyzoningdesc', 'taxdelinquencyflag', 'propertycountylandusecode']


def _encode_flags_and_fill(frame):
    """Turn object columns into 0/1 flags and replace remaining NaN with 0."""
    for col in frame.columns[(frame.dtypes == 'object').values]:
        # Whole-column assignment so the column really becomes integer.
        # The comparison is element-wise, so ``== True`` is intentional.
        frame[col] = 1 * (frame[col] == True)  # noqa: E712
    return frame.fillna(0)

# Feature Engineering

**分析特征交叉，产生新特征**

In [None]:
X, y, X_test, columns = load_supervision()

In [3]:
print 'X.shape:', X.shape
print 'y.shape:', y.shape
print 'X_test.shape:', X_test.shape

X.shape: (90275, 54)
y.shape: (90275L,)
X_test.shape: (2985217, 54)


In [4]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [14]:
# Synthetic sanity-check data for the cross-decomposition experiment.
# NOTE: this rebinds X, X_train, y_train, X_valid and y_valid, replacing the
# Zillow data loaded in the cells above.
n = 500
# 2 latent variables, each used twice, so columns (0, 1) and (2, 3) are
# pairwise correlated:
l1 = np.random.normal(size=n)
l2 = np.random.normal(size=n)
latents = np.array([l1, l1, l2, l2]).T
X = latents + np.random.normal(size=4*n).reshape((n, 4))
Y = latents + np.random.normal(size=4*n).reshape((n, 4))
# Floor division keeps the slice bounds integral on Python 2 and Python 3
# (``n / 2`` is a float on Python 3, which is not a valid index).
X_train = X[:n // 2]
y_train = Y[:n // 2]
X_valid = X[n // 2:]
y_valid = Y[n // 2:]
print("Corr(X)")
print(np.round(np.corrcoef(X.T), 2))
print("Corr(Y)")
print(np.round(np.corrcoef(Y.T), 2))

Corr(X)
[[ 1.    0.51 -0.1  -0.03]
 [ 0.51  1.    0.   -0.02]
 [-0.1   0.    1.    0.49]
 [-0.03 -0.02  0.49  1.  ]]
Corr(Y)
[[ 1.    0.51  0.06 -0.01]
 [ 0.51  1.   -0.01  0.02]
 [ 0.06 -0.01  1.    0.49]
 [-0.01  0.02  0.49  1.  ]]


In [16]:
from sklearn.cross_decomposition import PLSCanonical, PLSRegression, CCA

# Fit a two-component canonical PLS on the training half, then project both
# halves (inputs and targets) onto the learned components.
plsca = PLSCanonical(n_components=2).fit(X_train, y_train)
X_train_r, y_train_r = plsca.transform(X_train, y_train)
X_valid_r, y_valid_r = plsca.transform(X_valid, y_valid)

# print metrics.mean_absolute_error(y_train, y_train_r)
# print metrics.mean_absolute_error(y_valid, y_valid_r)

In [18]:
y_train_r

array([[-0.22122168, -0.68852125],
       [ 1.31097076,  0.41774589],
       [-1.31034155, -0.28116589],
       [ 2.53023239, -1.50858901],
       [-1.73196868, -3.8373544 ],
       [ 2.08869478, -0.88579433],
       [ 1.33257415, -0.44671041],
       [-0.45580325, -0.73192746],
       [ 2.93964553,  1.59389909],
       [-0.98382292, -0.23253507],
       [ 0.86426913, -1.39510186],
       [-0.19432777, -0.12186885],
       [ 1.20461515,  1.48448805],
       [ 2.04159301,  1.51152167],
       [ 0.77391181, -0.91013512],
       [ 0.09224536, -0.6229514 ],
       [-0.07123024,  0.87560973],
       [ 1.18279988, -0.42656831],
       [ 2.14299478, -1.36177214],
       [ 1.29836385, -1.22788604],
       [-0.45411379,  1.56351929],
       [-0.4502812 ,  0.05959701],
       [-0.97691117,  1.14399935],
       [-0.25188652,  0.45561419],
       [-2.94335955, -1.32607226],
       [-0.17755661, -1.88561599],
       [ 0.25805053, -0.82906481],
       [ 0.79731266,  0.41474386],
       [-0.95375292,

**探索性分析**

In [None]:
properties16, train16, sample_submission = load_original()

Let's take a look at the data.

In [None]:
properties16.head(3)

In [None]:
train16.head(3)

In [None]:
sample_submission.head(3)

Merge the training labels with the property features into a single training frame.

In [None]:
train_df = pd.merge(train16, properties16, on='parcelid', how='left')
train_df.head(3)

There are many NaN values in the features.  
We replace each missing value with the mean of its column.

In [None]:
# Column means of the numeric features. numeric_only=True skips the
# datetime/object columns, which recent pandas refuses to average.
mean_values = train_df.mean(axis=0, numeric_only=True)
# fillna(..., inplace=True) returns None, so fill first and bind the second
# name afterwards; both names refer to the same filled frame.
train_df.fillna(mean_values, inplace=True)
train_df_new = train_df
train_df_new.head(3)

In [None]:
train_df_new.info()

#### Univariate Analysis:
Since there are so many variables, let us first take the 'float' variables alone and then get the correlation with the target variable to see how they are related.

In [None]:
# Pearson correlation of every float64 feature with the logerror label,
# drawn as a horizontal bar chart sorted by correlation value.
x_cols = [
    col for col in train_df_new.columns
    if col != 'logerror' and train_df_new[col].dtype == 'float64'
]

label_values = train_df_new.logerror.values
labels = list(x_cols)
values = [np.corrcoef(train_df_new[col].values, label_values)[0, 1] for col in x_cols]
corr_df = pd.DataFrame({'col_labels':labels, 'corr_values':values}).sort_values(by='corr_values')

ind = np.arange(len(labels))
width = 0.9
fig, ax = plt.subplots(figsize=(10,20))
bar_lengths = np.array(corr_df.corr_values.values)
rects = ax.barh(ind, bar_lengths, color='y')
ax.set_yticks(ind)
ax.set_yticklabels(corr_df.col_labels.values, rotation='horizontal')
ax.set_xlabel("Correlation coefficient")
ax.set_title("Correlation coefficient of the variables")
#autolabel(rects)
plt.show()

Select the features with the strongest correlation coefficients:

In [None]:
# Keep features whose correlation with logerror is above 0.02 or below -0.01.
# .loc with a boolean mask replaces the deprecated (and later removed) .ix.
corr_df_sel = corr_df.loc[(corr_df['corr_values'] > 0.02) | (corr_df['corr_values'] < -0.01)]
corr_df_sel

Now let us look at the correlation coefficient of each of these variables:

In [None]:
# correlation coefficient: feature 1 VS. feature 2 #
cols_to_use = corr_df_sel.col_labels.tolist()

temp_df = train_df[cols_to_use]
# Named feature_corr rather than corrmat so that the corrmat() function
# imported from mlens.visualization in the first cell is not overwritten;
# a later cell calls it.
feature_corr = temp_df.corr(method='spearman')

# Draw the heatmap using seaborn
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(feature_corr, vmax=1., square=True)
plt.title("Important variables correlation map", fontsize=15)

# Machine Learning

* Base learners (regressors)
* Ensemble learners

Firstly, we need to load data:

In [None]:
X, y, X_test, columns = load_supervision()
print 'X.shape:', X.shape
print 'y.shape:', y.shape
print 'X_test.shape:', X_test.shape

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SEED)

In [None]:
# change dtypes of features #
# train_df_new['hashottuborspa'].fillna(0, inplace=True)
# train_df_new['hashottuborspa'][train_df_new['hashottuborspa'] == True] = 1

# en_pro = LabelEncoder()
# train_df_new['propertyzoningdesc'].fillna('Unknown', inplace=True)
# train_df_new['propertyzoningdesc'] = en_pro.fit_transform(train_df_new['propertyzoningdesc'])

# en_fire = LabelEncoder()
# train_df_new['fireplaceflag'].fillna(0, inplace=True)
# train_df_new['fireplaceflag'][train_df_new['fireplaceflag'] == True] = 1

# en_tax = LabelEncoder()
# train_df_new['taxdelinquencyflag'].fillna(0, inplace=True)
# train_df_new['taxdelinquencyflag'][train_df_new['taxdelinquencyflag'] == 'Y'] = 1

### Base Classifiers:

In [None]:
# We consider the following models (or base learners)
# NOTE(review): `xgb` is only bound once `import xgboost as xgb` has run.
# That import is commented out in the first cell, so the next line raises
# NameError as the notebook stands.
gb = xgb.XGBRegressor(nthread=1, seed=SEED)
ls = linear_model.Lasso(alpha=1e-6, normalize=True)
el = linear_model.ElasticNet(alpha=1e-6, normalize=True)
rf = ensemble.RandomForestRegressor(random_state=SEED)

# (short name, estimator) pairs; the short names are reused as the keys of
# the parameter dictionaries and as column names of the prediction frame.
base_learners = [('ls', ls), ('el', el), ('rf', rf), ('gb', gb)]

In [None]:
# One column of validation predictions per base learner; each learner is
# fitted on the training split and its validation MAE is printed.
learner_names = [name for name, _ in base_learners]
P = pd.DataFrame(np.zeros((X_valid.shape[0], len(base_learners))), columns=learner_names)

train_target = y_train.values.ravel()
valid_target = y_valid.values.ravel()
for est_name, est in base_learners:
    est.fit(X_train, train_target)
    y_pred = est.predict(X_valid)
    P.loc[:, est_name] = y_pred
    mae = metrics.mean_absolute_error(valid_target, y_pred)
    print("%3s : %.4f" % (est_name, mae))

Visualize the correlation between the base learners' predictions:

In [None]:
corrmat(P.corr())

### Compare the base learners and search for their best parameters:

In [None]:
# The top-of-notebook scipy.stats import is commented out, so bring the
# distributions into scope here.
from scipy.stats import randint, uniform

# Put their parameter dictionaries in a dictionary with the
# estimator names as keys.
# Reminder: scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dicts = {'ls':
                  {'alpha': uniform(1e-6, 1e-5)},
               'el':
                  {'alpha': uniform(1e-6, 1e-5),
                   'l1_ratio': uniform(0, 1)
                  },
               'gb':
                   {'learning_rate': uniform(0.02, 0.04),
                    # This key used to be listed twice. The first entry,
                    # uniform(0.55, 0.66), reached 1.21 (not a valid column
                    # fraction) and was silently overridden by this one.
                    'colsample_bytree': uniform(0.6, 0.4),
                    'min_child_weight': randint(30, 60),
                    'max_depth': randint(3, 7),
                    'subsample': uniform(0.4, 0.2),
                    'n_estimators': randint(150, 200),
                    'reg_lambda': uniform(1, 2),
                    'reg_alpha': uniform(1, 2),
                   },
               'rf':
                   {'max_depth': randint(2, 5),
                    'min_samples_split': randint(5, 20),
                    'min_samples_leaf': randint(10, 20),
                    'n_estimators': randint(50, 100),
                    'max_features': uniform(0.6, 0.3)}
              }

# Mean absolute error is a loss, hence greater_is_better=False.
scorer = make_scorer(metrics.mean_absolute_error, greater_is_better=False)

evl = Evaluator(scorer, cv=2, random_state=SEED, verbose=5)

# Every estimator is evaluated with and without standard scaling.
evl.fit(X_train.values, 
        y_train.values.ravel(), 
        estimators=base_learners, 
        param_dicts=param_dicts, 
        preprocessing={'sc': [preprocessing.StandardScaler()], 'none': []},
        n_iter=2)  # bump this up to do a larger grid search

pd.DataFrame(evl.summary)

### Model selection guide

In [None]:
# # set the best params
# for case_name, params in evl.summary["params"].items():
#     for est_name, est in base_learners:
#         if est_name == case_name[1]:
#             est.set_params(**params)

# We consider the following models (or base learners)
# The top-of-notebook scipy.stats import is commented out, so bring the
# distributions into scope here.
from scipy.stats import randint, uniform

# NOTE(review): `xgb` requires `import xgboost as xgb`, which is commented
# out in the first cell; the XGBoost line below raises NameError until that
# import is restored.
LinearRegression = linear_model.LinearRegression()
Lasso = linear_model.Lasso(alpha=1e-6, normalize=True)
ElasticNet = linear_model.ElasticNet(alpha=1e-6, normalize=True)
RandomForest = ensemble.RandomForestRegressor(random_state=SEED)
XGBoost = xgb.XGBRegressor(nthread=1, seed=SEED)
AdaBoost = ensemble.AdaBoostRegressor(random_state=SEED)
Bagging = ensemble.BaggingRegressor(random_state=SEED)

base_learners = [('Lasso', Lasso), 
                 ('ElasticNet', ElasticNet), 
                 ('RandomForest', RandomForest),
                 ('XGBoost', XGBoost),
                 ('AdaBoost', AdaBoost),
                 ('Bagging', Bagging),
                ]


# define meta learners
meta_learners = [('XGBoost', XGBoost), 
                 ('ElasticNet', ElasticNet),
                 ('LinearRegression', LinearRegression),
                ]

# Note that when we have a preprocessing pipeline,
# keys are in the (prep_name, est_name) format
# NOTE(review): the keys below are plain estimator names; confirm that the
# installed mlens version applies them to every preprocessing case.
param_dicts = {'ElasticNet':
                  {'alpha': uniform(1e-5, 1),
                   'l1_ratio': uniform(0, 1)},
               'XGBoost':
                   {'learning_rate': uniform(0.01, 0.2),
                    'subsample': uniform(0.5, 0.5),
                    'reg_lambda': uniform(0.1, 1),
                    'n_estimators': randint(10, 100)},
              }

# ensemble base learners
proba_transformer = EnsembleTransformer().add('stack', base_learners, proba=False)
class_transformer = EnsembleTransformer().add('blend', base_learners, proba=False)
# Named preprocessing_cases so the sklearn `preprocessing` module imported in
# the first cell (and used by the earlier search cell) is not overwritten.
preprocessing_cases = {'stack': [('layer-1', proba_transformer)],
                       'blend': [('layer-1', class_transformer)]}

# new Evaluator
scorer = make_scorer(metrics.mean_absolute_error, greater_is_better=False)
evl = Evaluator(scorer, cv=2, random_state=SEED, verbose=5)

# train meta_learners with wrapped base learners, .values
evl.fit(X_train,
        y_train,
        meta_learners,
        param_dicts,
        preprocessing=preprocessing_cases,
        n_iter=20)    # bump this up to do a larger grid search

meta_summary = pd.DataFrame(evl.summary)
meta_summary

### 集成学习（开源实现）

In [None]:
X_train, y_train, X_test, columns = load_supervision()
print 'X_train.shape:', X_train.shape
print 'y_train.shape:', y_train.shape
print 'X_test.shape:', X_test.shape

In [None]:
X, y, X_test = X_train.values, y_train.values, X_test.values

In [None]:
# (name, estimator) pairs used as the base layer of the stacked ensembles
# below. All estimators use their default hyper-parameters; the AdaBoost and
# XGBoost entries are currently disabled.
clf_first_layer = [('RandomForest', ensemble.RandomForestRegressor()), 
                   # ('AdaBoost', ensemble.AdaBoostRegressor()), 
                   # ('XGBoost', xgb.XGBRegressor()), 
                   ('GBDT', ensemble.GradientBoostingRegressor()), 
                   ('Bagging', ensemble.BaggingRegressor()), 
                   ('ExtraTrees', ensemble.ExtraTreesRegressor()),
                   ('ElasticNet', linear_model.ElasticNet()),
                   ('Huber', linear_model.HuberRegressor()),
                   ('BayesianRidge', linear_model.BayesianRidge()),
                   ('Lasso', linear_model.Lasso()),
                   ('LassoLars', linear_model.LassoLars()),
                   ('LinearRegression', linear_model.LinearRegression()),
                  ]

# Final (meta) estimator that combines the base-layer predictions.
clf_secon_layer = linear_model.LinearRegression()

In [None]:
# meta_learner = meta_learners[1][1]
# meta_learner.set_params(**evl.summary["params"][("meta", "el")])

# Stack the same list of base estimators three times and finish with the
# linear meta estimator. The trailing comments are the scores the author
# recorded for each configuration.
ens = SuperLearner(folds=5, verbose=5)
ens.add(clf_first_layer)  # SequentialEnsemble: 0.0633532638459
ens.add(clf_first_layer)  # SequentialEnsemble: 0.0633258630442, SuperLearner cv 5: 0.0611558872921
ens.add(clf_first_layer)  # SuperLearner cv 5: 0.0604582704967
ens.add(clf_secon_layer, meta=True)

ens.fit(X, y)

# NOTE: predictions are made on the same X the ensemble was fitted on, so the
# error printed below is an in-sample figure, not a validation score.
y_pred = ens.predict(X)
print "mean_absolute_error: ", metrics.mean_absolute_error(y, y_pred)

In [None]:
y_pred = ens.predict(X_test)

In [None]:
import zipfile

print "Writing ..."
sub = pd.read_csv('Data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = y_pred
sub.to_csv('ensemble.csv', index=False, float_format='%.4f')

print "Zipping ..."
with zipfile.ZipFile("submission.zip", "w") as fout:
    fout.write("ensemble.csv", compress_type=zipfile.ZIP_DEFLATED)

In [None]:
# summary = pd.DataFrame(evl.summary)
# writer = pd.ExcelWriter('output.xlsx')
# summary.to_excel(writer,'Sheet1')
# writer.save()

### 集成学习(自己的实现)

1. 双层集成学习  
2. 使用尽可能多的分类器

In [None]:
X_train, y_train, X_test, columns = load_supervision()
print 'X_train.shape:', X_train.shape
print 'y_train.shape:', y_train.shape
print 'X_test.shape:', X_test.shape

In [None]:
X, y, X_test = X_train.values, y_train.values, X_test.values

In [None]:
def base_layer(X, y, clf_base_layer, cv=10):
    """Build out-of-fold prediction features from a list of base estimators.

    Args:
        X: training features.
        y: training target.
        clf_base_layer: list of ``(name, estimator)`` pairs.
        cv: cross-validation setting forwarded to ``cross_val_predict``.

    Returns:
        numpy.ndarray of shape ``(len(y), len(clf_base_layer))`` whose column
        ``i`` holds the cross-validated predictions of estimator ``i``.
    """
    train_sec = np.zeros((len(y), len(clf_base_layer)))

    for i, (clf_name, clf) in enumerate(clf_base_layer):
        # Single-argument print calls behave the same on Python 2 and 3.
        print('%s first layer classifier: %s' % (time.ctime(), clf_name))
        # cross validation
        y_score = cross_val_predict(clf, X, y, cv=cv, verbose=5, method='predict', n_jobs=-1)
        train_sec[:, i] = y_score
        print('mean_absolute_error:  %s' % (metrics.mean_absolute_error(y, y_score),))
    return train_sec


def base_layer_pred(X, y, clf_base_layer, X_test, cv=10):
    """Build meta-features for the training set and for the test set.

    The training meta-features are out-of-fold predictions (as in
    ``base_layer``), so the next layer is not trained on predictions the base
    estimators made for rows they had already seen. The test meta-features
    come from each estimator refitted on the full training data.

    Args:
        X: training features.
        y: training target.
        clf_base_layer: list of ``(name, estimator)`` pairs. The estimators
            are fitted in place.
        X_test: test features.
        cv: cross-validation setting forwarded to ``cross_val_predict``
            (previously accepted but ignored).

    Returns:
        tuple: ``(train_sec, train_pre)`` arrays of shape
        ``(len(y), n_estimators)`` and ``(len(X_test), n_estimators)``.
    """
    train_sec = np.zeros((len(y), len(clf_base_layer)))
    train_pre = np.zeros((len(X_test), len(clf_base_layer)))

    for i, (clf_name, clf) in enumerate(clf_base_layer):
        # Single-argument print calls behave the same on Python 2 and 3.
        print('%s first layer classifier: %s' % (time.ctime(), clf_name))
        # Only request parallelism from estimators that expose the parameter.
        if 'n_jobs' in clf.get_params():
            clf.set_params(n_jobs=-1)
        # Out-of-fold predictions for the training rows.
        train_sec[:, i] = cross_val_predict(clf, X, y, cv=cv, method='predict')
        # Refit on all training rows to predict the test rows.
        clf.fit(X, y)
        train_pre[:, i] = clf.predict(X_test)
    return train_sec, train_pre

In [None]:
# train
train_fea = base_layer(X, y, clf_first_layer)
train_fea = base_layer(train_fea, y, clf_first_layer)

print time.ctime(), 'second layer classifier:', 'LinearRegression'
y_score = cross_val_predict(clf_secon_layer, train_fea, y, cv=10, verbose=5, method='predict', n_jobs=-1)
print 'mean_absolute_error: ', metrics.mean_absolute_error(y, y_score)

pd.DataFrame(train_fea).to_csv('ensemble_diy.csv', index=False, header=False)

In [None]:
# test
test_fea, test_pre = base_layer_pred(X, y, clf_first_layer, X_test)
test_fea, test_pre = base_layer_pred(test_fea, y, clf_first_layer, test_pre)

# second layer
print time.ctime(), 'second layer classifier:', 'LinearRegression'
clf_secon_layer.n_jobs = -1
y_pred = clf_secon_layer.fit(test_fea, y).predict(test_pre)

In [None]:
import zipfile

print "Writing ..."
sub = pd.read_csv('Data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = y_pred
sub.to_csv('ensemble_diy.csv', index=False, float_format='%.4f')

print "Zipping ..."
with zipfile.ZipFile("ensemble_diy.zip", "w") as fout:
    fout.write("ensemble_diy.csv", compress_type=zipfile.ZIP_DEFLATED)

### Genetic Programming （遗传算法/进化规划）:

In [None]:
from sklearn.preprocessing import StandardScaler

# X and y may already be plain ndarrays (an earlier cell rebinds them with
# ``.values``), in which case ``X.values`` raises AttributeError.
# np.asarray accepts both DataFrames/Series and arrays.
X_train, y_train = np.asarray(X), np.asarray(y).ravel()

# Fit the scaler on the training features only and reuse it for the test set.
standardScaler = StandardScaler()
X_train = standardScaler.fit_transform(X_train)
X_test = standardScaler.transform(X_test)

In [None]:
# Genetic Programming
from gplearn.skutils import check_random_state
from gplearn.genetic import SymbolicRegressor
from sklearn.metrics import mean_absolute_error

# train GP
function_set = ['add','sub','mul','div','sqrt','log','abs','neg','inv','max','min','sin','cos','tan']

titan_gp = SymbolicRegressor(population_size=2000,
                             generations=20, 
                             stopping_criteria=0.06774,
                             p_crossover=0.7, 
                             p_subtree_mutation=0.1,
                             p_hoist_mutation=0.05, 
                             p_point_mutation=0.1,
                             max_samples=0.9, 
                             verbose=1,
                             parsimony_coefficient=0.01,
                             function_set=function_set,
                             n_jobs=1,
                             random_state=2017)
titan_gp.fit(X_train, y_train) # np.array([np.random.randint(2) for i in range(len(y_train))])

# print relationship
print 'Relationship:', titan_gp._program

In [None]:
y_pred = titan_gp.predict(X_test)

sub = pd.read_csv('Data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = y_pred
sub.to_csv('gplearn.csv', index=False, float_format='%.4f')

In [None]:
sub.shape

### 标签传播算法

1. 标签传播算法只能处理分类问题。这里先将标签去头去尾，再取其小数值构成少量标签。目的是将未标签的值标上标签，增加训练集样本。  
2. 不考虑交易日期属性：删除原训练集中的parcelid重复项，与属性集融合，构成一个半监督数据集。

In [None]:
properties16, train16, sample_submission = load_original()

In [None]:
train_test = train16.copy()

# Clip logerror to its 1st-99th percentile range. clip() replaces the chained
# ``.ix`` assignments (``.ix`` is deprecated and later removed) and gives the
# same result.
ulimit = np.percentile(train_test.logerror.values, 99)
llimit = np.percentile(train_test.logerror.values, 1)
train_test['logerror'] = train_test['logerror'].clip(lower=llimit, upper=ulimit)

plt.figure(figsize=(6,4))
sns.distplot(train_test.logerror.values, kde=True)
plt.xlabel('logerror', fontsize=12)
plt.show()

In [None]:
from decimal import Decimal
from sklearn.preprocessing import LabelEncoder

# Round every logerror to one decimal place, keeping the original row order.
# The values must not be sorted here: the encoded labels are written back
# into train_test, and sorting would attach them to the wrong parcels.
# LabelEncoder orders its classes itself.
logerror_short = np.array(
    [Decimal(str(i)).quantize(Decimal('0.0')) for i in train_test.logerror.values],
    dtype=float)  # builtin float; the np.float alias is gone from recent NumPy

le = LabelEncoder()
logerror_short_label = le.fit_transform(logerror_short)

train_test['logerror'] = logerror_short_label

plt.figure(figsize=(6,4))
sns.distplot(train_test.logerror.values, kde=True)
plt.show()

对属性值做PCA

In [None]:
def build_proper():
    """Reduce the property features to 5 PCA components per parcel.

    Reads the module-level ``properties16`` frame. The parcel identifier and
    three free-text columns are removed, NaN values are replaced by 0 and
    object columns are turned into 0/1 flags before the PCA is fitted.

    Returns:
        pandas.DataFrame with the 5 component columns (named 0..4) plus a
        ``parcelid`` column, one row per row of ``properties16``.
    """
    # parcelid is an identifier, not a feature: keep it out of the PCA input
    # (load_supervision drops it from the features as well).
    properties = properties16.drop(
        ['parcelid', 'propertyzoningdesc', 'taxdelinquencyflag', 'propertycountylandusecode'],
        axis=1)

    properties = properties.fillna(0)

    for col in properties.columns[(properties.dtypes == 'object').values]:
        # Whole-column assignment so the column really becomes integer.
        # The comparison is element-wise, so ``== True`` is intentional.
        properties[col] = 1 * (properties[col] == True)  # noqa: E712

    # Seeded so repeated runs give the same components.
    pca = PCA(n_components=5, random_state=SEED)
    properties_pca = pd.DataFrame(pca.fit_transform(properties))
    # Attach the ids positionally; the rows are in properties16 order.
    properties_pca['parcelid'] = properties16['parcelid'].values

    return properties_pca


# One row per parcel: labelled parcels carry their encoded logerror, all
# other parcels come in through the outer join without a label.
train = pd.merge(train_test.drop_duplicates('parcelid'), build_proper(), on="parcelid", how="outer")

# Replace missing logerror values with -1, the marker scikit-learn's
# semi-supervised estimators use for unlabeled samples. Assigning the result
# back is reliable, unlike a chained ``train['logerror'].fillna(...,
# inplace=True)``, which may operate on a temporary copy.
train['logerror'] = train['logerror'].fillna(-1)

y = train.logerror
train = train.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)

# Shrink the memory footprint of the feature matrix.
# NOTE(review): the int8 cast would overflow for values outside [-128, 127];
# confirm that no wide integer column reaches this point.
for c, dtype in zip(train.columns, train.dtypes):
    if dtype == np.float64:
        train[c] = train[c].astype(np.float32)
    if dtype == np.int64:
        train[c] = train[c].astype(np.int8)

train = train.values
y = y.values

print(train.shape)
print(y.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
train_ss = ss.fit_transform(train)

In [None]:
# Keep only the 0.5% hold-out part of a first split as a small working
# subset, then split that subset 80/20 into train and test.
X_subset, y_subset = train_test_split(train_ss, y, test_size=0.005, random_state=SEED)[1::2]
X_train, X_test, y_train, y_test = train_test_split(X_subset, y_subset, test_size=0.2, random_state=SEED)

print(X_train.shape)
print(X_test.shape)

# skf = StratifiedKFold(y, n_folds=10)
# for train_index, test_index in skf:
#     X_train, X_test = train_ss[train_index], train_ss[test_index]
#     y_train, y_test = y[train_index], y[test_index]
#     print("=== TRAIN:", X_train.shape, "TEST:", X_test.shape)
    
#     skf2 = StratifiedKFold(y_test, n_folds=5)
#     for train_index2, test_index2 in skf2:
#         X_train2, X_test2 = X_test[train_index2], X_test[test_index2]
#         y_train2, y_test2 = y_test[train_index2], y_test[test_index2]
#         print("TRAIN2:", X_train2.shape, "TEST2:", X_test2.shape)
#         ls.fit(X_train2, y_train2)
#         y_pred2 = ls.predict(X_test2)
#         print metrics.classification_report(y_test2, y_pred2)


In [None]:
ls = LabelSpreading()

ls.fit(X_train, y_train)
y_pred = ls.predict(X_test)
# print metrics.classification_report(y_test, y_pred)
pd.DataFrame({"True": y_test, "Pred": y_pred})

In [None]:
# Predicted vs. true labels. The data vectors are passed by keyword because
# recent seaborn releases no longer accept them positionally.
sns.jointplot(x=y_pred, y=y_test)