In [None]:
import pandas as pd
import time
import datetime
import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.metrics import log_loss, make_scorer
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Names of the columns selected from the training table as model inputs.
feature_names = [
    'Year', 'Month', 'Hour',
    'DayOfWeekID', 'PdDistrictID',
    'HasBlock', 'PositionTypeID',
    'X', 'Y',
]

In [None]:
# Read ../datasets/train_preprocess.csv into a DataFrame.
origin_data = pd.read_csv(filepath_or_buffer="../datasets/train_preprocess.csv")

In [None]:
# Feature matrix: every row, restricted to the columns listed in feature_names.
X = origin_data.loc[:, feature_names]

In [None]:
# Raw category labels, and the encoder that maps them to integer codes.
target_label = origin_data['Category']
target_enc = LabelEncoder().fit(target_label)
# Integer-encoded version of target_label.
y_true = target_enc.transform(target_label)

In [None]:
# Show the first ten encoded labels as a quick sanity check.
y_true[0:10]

In [None]:
# Multi-class XGBoost classifier emitting per-class probabilities
# (objective "multi:softprob").
xgbclf = XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    # A single boosting round — presumably kept tiny for a quick trial run; TODO confirm.
    n_estimators=1,
    objective="multi:softprob",
    # The thread-count parameter is spelled `n_jobs`; the earlier `n_job`
    # is not a parameter documented for XGBClassifier.
    n_jobs=-1,
    gamma=2,
    min_child_weight=10,
    max_delta_step=2,
    # Row and column subsampling per tree.
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=10,
)

In [None]:
# Learning curve of the classifier scored by multi-class log-loss.
#
# - y is the integer-encoded target (y_true) rather than the raw string labels.
# - Log-loss must be computed from predicted probabilities. The built-in
#   "neg_log_loss" scorer does that, whereas make_scorer(log_loss) with its
#   defaults would score the hard labels returned by predict().
# NOTE(review): with train_sizes=[0.001] the training subset may not contain
# every class, which can make individual fits/scores fail — confirm the size.
train_size, train_score, test_score = learning_curve(
    estimator=xgbclf,
    X=X,
    y=y_true,
    train_sizes=[0.001],
    cv=3,
    scoring="neg_log_loss",
    n_jobs=-1,
)
# "neg_log_loss" reports -log_loss; flip the sign so both arrays hold the
# plain (positive, lower-is-better) log-loss values.
train_score, test_score = -train_score, -test_score

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Count how many distinct categories end up in each stratified test fold
# of a 1% random sample of the data.
# Both the row sample and the fold shuffling are seeded so the printed
# numbers are identical on every fresh run of the notebook.
split = StratifiedKFold(n_splits=3, shuffle=True, random_state=10)
sample = origin_data.sample(frac=0.01, random_state=10)
print(sample.shape)
for train_index, test_index in split.split(sample[feature_names], sample['Category']):
    # Number of unique categories present in this test fold.
    print(len(np.unique(sample['Category'].values[test_index])))