In [20]:
import xgboost
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import ParameterGrid
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import accuracy_score
import datetime


def date_parser(df):
    date_recorder = list(map(lambda x: datetime.datetime.strptime(str(x), '%Y-%m-%d'),
                             df['date_recorded'].values))
    df['year_recorder'] = list(map(lambda x: int(x.strftime('%Y')), date_recorder))
    df['weekday_recorder'] = list(map(lambda x: int(x.strftime('%w')), date_recorder))
    df['yearly_week_recorder'] = list(map(lambda x: int(x.strftime('%W')), date_recorder))
    df['month_recorder'] = list(map(lambda x: int(x.strftime('%m')), date_recorder))
    df['age'] = df['year_recorder'].values - df['construction_year'].values
    del df['date_recorded']
    return df


def col_to_freq(df, col_names):
    for col in col_names:
        print('Changing to frequency %s' %col)
        val_counts = df[col].value_counts()
        df[col + '_freq'] = np.zeros((df.shape[0],))
        for i, val in enumerate(df[col].values):
            df[col + '_freq'].iat[i] = int(val_counts.at[val])
    return df


def evalerror(preds, dtrain):
    """
    accuracy calculation function for xgboost
    :param preds: predictions
    :param dtrain: labels
    :return: -1 * accuracy (for minimization)
    """
    labels = dtrain.get_label()
    # return a pair metric_name, result
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'Accuracy', -1 * float(sum(labels == preds)) / len(labels)

"""
Import data
"""
train = pd.DataFrame.from_csv('C:/Users/Sam/Desktop/AEGIS Classes/ML_whole/DataDriven/train.csv')
train_index = train.index.values
test = pd.DataFrame.from_csv('C:/Users/Sam/Desktop/AEGIS Classes/ML_whole/DataDriven/test.csv')
test_index = test.index.values

# combing tran and test data
# helps working on all the data and removes factorization problems between train and test
dataframe = pd.concat([train, test], axis=0)

train_labels = pd.DataFrame.from_csv('C:/Users/Sam/Desktop/AEGIS Classes/ML_whole/DataDriven/trainlabels.csv')

submission_file = pd.DataFrame.from_csv("C:/Users/Sam/Desktop/AEGIS Classes/ML_whole/DataDriven/SubmissionFormat.csv")

"""
Preprocess: date parsing already done
"""
# Change labels to ints in order to use as y vector
label_encoder = LabelEncoder()
train_labels.iloc[:, 0] = label_encoder.fit_transform(train_labels.values.flatten())


"""
Split into train and test
"""

train = dataframe.loc[train_index]
test = dataframe.loc[test_index]

"""
CV
"""
best_score = 0
best_params = 0
best_train_prediction = 0
best_prediction = 0
meta_solvers_train = []
meta_solvers_test = []
best_train = 0
best_test = 0

# Optimization parameters
early_stopping = 150
param_grid = [
              # For optimization
              # {
              #  'silent': [1],
              #  'nthread': [3],
              #  # 'eval_metric': ['evalerror'],
              #  'eta': [0.1],
              #  'objective': ['multi:softmax'],
              #  'max_depth': [6],
              #  'num_round': [10000],
              #  'gamma': [0],
              #  'subsample': [0.8],
              #  'colsample_bytree': [0.3],
              #  'n_monte_carlo': [1],
              #  'cv_n': [4],
              #  'test_rounds_fac': [1],
              #  'count_n': [0],
              #  'mc_test': [True],
              #  'num_class': [3]
              #  },
              # For final calculation
              {
               'silent': [1],
               'nthread': [3],
               # 'eval_metric': ['evalerror'],
               'eta': [0.03],
               'objective': ['multi:softmax'],
               'max_depth': [13],
               'num_round': [10000],
               'gamma': [0, 1, 2, 4, 8, 16],
               'subsample': [0.8],
               'colsample_bytree': [0.2],
               'n_monte_carlo': [1],
               'cv_n': [4],
               'test_rounds_fac': [1],
               'count_n': [0],
               'mc_test': [True],
               'num_class': [3]
               }
              ]

print('start CV optimization')
mc_round_list = []
mc_acc_mean = []
mc_acc_sd = []
params_list = []
print_results = []
for params in ParameterGrid(param_grid):
    print(params)
    params_list.append(params)
    train_predictions = np.ones((train.shape[0],))
    print('There are %d columns' % train.shape[1])

    # CV
    mc_auc = []
    mc_round = []
    mc_train_pred = []
    # Use monte carlo simulation if needed to find small improvements
    for i_mc in range(params['n_monte_carlo']):
        cv_n = params['cv_n']
        kf = StratifiedKFold(train_labels.values.flatten(), n_folds=cv_n, shuffle=True, random_state=i_mc ** 3)

        xgboost_rounds = []
        # Finding optimized number of rounds
        for cv_train_index, cv_test_index in kf:
            X_train, X_test = train.values[cv_train_index, :], train.values[cv_test_index, :]
            y_train = train_labels.iloc[cv_train_index].values.flatten()
            y_test = train_labels.iloc[cv_test_index].values.flatten()

            # train machine learning
            xg_train = xgboost.DMatrix(X_train, label=y_train)
            xg_test = xgboost.DMatrix(X_test, label=y_test)

            watchlist = [(xg_train, 'train'), (xg_test, 'test')]

            num_round = params['num_round']
            xgclassifier = xgboost.train(params, xg_train, num_round, watchlist, early_stopping_rounds=early_stopping,
                                         feval=evalerror);
            xgboost_rounds.append(xgclassifier.best_iteration)

        num_round = int(np.mean(xgboost_rounds))
        print('The best n_rounds is %d' % num_round)

        # Calculate train predictions over optimized number of rounds
        for cv_train_index, cv_test_index in kf:
            X_train, X_test = train.values[cv_train_index, :], train.values[cv_test_index, :]
            y_train = train_labels.iloc[cv_train_index].values.flatten()
            y_test = train_labels.iloc[cv_test_index].values.flatten()

            # train machine learning
            xg_train = xgboost.DMatrix(X_train, label=y_train)
            xg_test = xgboost.DMatrix(X_test, label=y_test)

            watchlist = [(xg_train, 'train'), (xg_test, 'test')]

            xgclassifier = xgboost.train(params, xg_train, num_round, watchlist, feval=evalerror);

            # predict
            predicted_results = xgclassifier.predict(xg_test)
            train_predictions[cv_test_index] = predicted_results

        print('Accuracy score ', accuracy_score(train_labels.values, train_predictions))
        mc_auc.append(accuracy_score(train_labels.values, train_predictions))
        mc_train_pred.append(train_predictions)
        mc_round.append(num_round)

    # Getting the mean integer
    mc_train_pred = (np.mean(np.array(mc_train_pred), axis=0) + 0.5).astype(int)

    mc_round_list.append(int(np.mean(mc_round)))
    mc_acc_mean.append(np.mean(mc_auc))
    mc_acc_sd.append(np.std(mc_auc))
    print('The accuracy range is: %.5f to %.5f and best n_round: %d' %
          (mc_acc_mean[-1] - mc_acc_sd[-1], mc_acc_mean[-1] + mc_acc_sd[-1], mc_round_list[-1]))
    print_results.append('The accuracy range is: %.5f to %.5f and best n_round: %d' %
                         (mc_acc_mean[-1] - mc_acc_sd[-1], mc_acc_mean[-1] + mc_acc_sd[-1], mc_round_list[-1]))
    print('For ', mc_auc)
    print('The accuracy of the average prediction is: %.5f' % accuracy_score(train_labels.values, mc_train_pred))
    meta_solvers_train.append(mc_train_pred)

    # train machine learning
    xg_train = xgboost.DMatrix(train.values, label=train_labels.values)
    xg_test = xgboost.DMatrix(test.values)

    # predicting the test set
    if params['mc_test']:
        watchlist = [(xg_train, 'train')]

        num_round = int(mc_round_list[-1] * params['test_rounds_fac'])
        mc_pred = []
        for i_mc in range(params['n_monte_carlo']):
            params['seed'] = i_mc
            xg_train = xgboost.DMatrix(train, label=train_labels.values.flatten())
            xg_test = xgboost.DMatrix(test)

            watchlist = [(xg_train, 'train')]

            xgclassifier = xgboost.train(params, xg_train, num_round, watchlist, feval=evalerror);
            predicted_results = xgclassifier.predict(xg_test)
            mc_pred.append(predicted_results)

        meta_solvers_test.append((np.mean(np.array(mc_pred), axis=0) + 0.5).astype(int))
        """ Write opt solution """
        print('writing to file')
        mc_train_pred = label_encoder.inverse_transform(mc_train_pred.astype(int))
        print(meta_solvers_test[-1])
        meta_solvers_test[-1] = label_encoder.inverse_transform(meta_solvers_test[-1])
        pd.DataFrame(mc_train_pred).to_csv('results/train_xgboost_d13.csv')
        submission_file['status_group'] = meta_solvers_test[-1]
        submission_file.to_csv("results/test_xgboost_d13.csv")

    # saving best score for printing
    if mc_acc_mean[-1] < best_score:
        print('new best log loss')
        best_score = mc_acc_mean[-1]
        best_params = params
        best_train_prediction = mc_train_pred
        if params['mc_test']:
            best_prediction = meta_solvers_test[-1]

print(best_score)
print(best_params)

print(params_list)
print(print_results)
print(mc_acc_mean)
print(mc_acc_sd)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x00000262C9A350B8>>
Traceback (most recent call last):
  File "C:\Users\Sam\AppData\Local\Continuum\anaconda3\lib\site-packages\xgboost\core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


start CV optimization
{'colsample_bytree': 0.2, 'count_n': 0, 'cv_n': 4, 'eta': 0.03, 'gamma': 0, 'max_depth': 13, 'mc_test': True, 'n_monte_carlo': 1, 'nthread': 3, 'num_class': 3, 'num_round': 10000, 'objective': 'multi:softmax', 'silent': 1, 'subsample': 0.8, 'test_rounds_fac': 1}
There are 39 columns


ValueError: could not convert string to float: 'communal standpipe'