In [None]:
import datetime


def parse_dt(x):
    if not isinstance(x, str):
        return None
    elif len(x) == len('2010-01-01'):
        return datetime.datetime.strptime(x, '%Y-%m-%d')
    elif len(x) == len('2010-01-01 10:10:10'):
        return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    else:
        return None


def transform_datetime_features(df):
    datetime_columns = [
        col_name
        for col_name in df.columns
        if col_name.startswith('datetime')
    ]
    for col_name in datetime_columns:
        df[col_name] = df[col_name].apply(lambda x: parse_dt(x))
        df['number_weekday_{}'.format(col_name)] = df[col_name].apply(lambda x: x.weekday())
        df['number_month_{}'.format(col_name)] = df[col_name].apply(lambda x: x.month)
        df['number_day_{}'.format(col_name)] = df[col_name].apply(lambda x: x.day)
        df['number_hour_{}'.format(col_name)] = df[col_name].apply(lambda x: x.hour)
        df['number_hour_of_week_{}'.format(col_name)] = df[col_name].apply(lambda x: x.hour + x.weekday() * 24)
        df['number_minute_of_day_{}'.format(col_name)] = df[col_name].apply(lambda x: x.minute + x.hour * 60)
    return df


In [None]:
import argparse
import os
import pandas as pd
import pickle
import time

from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.preprocessing import StandardScaler

# use this to stop the algorithm before time limit exceeds
TIME_LIMIT = int(os.environ.get('TIME_LIMIT', 5*60))

ONEHOT_MAX_UNIQUE_VALUES = 20

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-csv', type=argparse.FileType('r'), required=True)
    parser.add_argument('--model-dir', required=True)
    parser.add_argument('--mode', choices=['classification', 'regression'], required=True)
    
    argv = ['--train-csv', r'..\check_1_r\train.csv',
           '--model-dir', r'.',
           '--mode', 'regression']
    args = parser.parse_args(argv)

    start_time = time.time()

    df = pd.read_csv(args.train_csv)
    df_y = df.target
    df_X = df.drop('target', axis=1)

    print('Dataset read, shape {}'.format(df_X.shape))

    # dict with data necessary to make predictions
    model_config = {}

    # features from datetime
    df_X = transform_datetime_features(df_X)

    # missing values
    if any(df_X.isnull()):
        model_config['missing'] = True
        df_X.fillna(-1, inplace=True)

    # categorical encoding
    categorical_values = {}
    for col_name in list(df_X.columns):
        col_unique_values = df_X[col_name].unique()
        if 2 < len(col_unique_values) <= ONEHOT_MAX_UNIQUE_VALUES:
            categorical_values[col_name] = col_unique_values
            for unique_value in col_unique_values:
                df_X['onehot_{}={}'.format(col_name, unique_value)] = (df_X[col_name] == unique_value).astype(int)
    model_config['categorical_values'] = categorical_values

    # drop constant features
    constant_columns = [
        col_name
        for col_name in df_X.columns
        if df_X[col_name].nunique() == 1
        ]
    df_X.drop(constant_columns, axis=1, inplace=True)

    # use only numeric columns
    used_columns = [
        col_name
        for col_name in df_X.columns
        if col_name.startswith('number') or col_name.startswith('onehot')
        ]
    df_X = df_X[used_columns]
    model_config['used_columns'] = used_columns

    # scaling
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_X)
    model_config['scaler'] = scaler

    # fitting
    model_config['mode'] = args.mode
    if args.mode == 'regression':
        model = Ridge()
    else:
        model = LogisticRegression()

    model.fit(df_scaled, df_y)
    model_config['model'] = model

    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'wb') as fout:
        pickle.dump(model_config, fout, protocol=pickle.HIGHEST_PROTOCOL)

    print('Train time: {}'.format(time.time() - start_time))


In [81]:
# use this to stop the algorithm before time limit exceeds
TIME_LIMIT = int(os.environ.get('TIME_LIMIT', 5*60))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test-csv', type=argparse.FileType('r'), required=True)
    parser.add_argument('--prediction-csv', type=argparse.FileType('w'), required=True)
    parser.add_argument('--model-dir', required=True)

    argv = ['--test-csv', r'..\check_1_r\test.csv',
            '--prediction-csv', r'..\check_1_r\prediction.csv', 
           '--model-dir', r'.']
    args = parser.parse_args(argv)

    start_time = time.time()

    # load model
    model_config_filename = os.path.join(args.model_dir, 'model_config.pkl')
    with open(model_config_filename, 'rb') as fin:
        model_config = pickle.load(fin)

    # read dataset
    df = pd.read_csv(args.test_csv)
    print('Dataset read, shape {}'.format(df.shape))

    # features from datetime
    df = transform_datetime_features(df)

    # missing values
    if model_config['missing']:
        df.fillna(-1, inplace=True)
    elif any(df.isnull()):
        df.fillna(value=df.mean(axis=0), inplace=True)

    # categorical encoding
    for col_name, unique_values in model_config['categorical_values'].items():
        for unique_value in unique_values:
            df['onehot_{}={}'.format(col_name, unique_value)] = (df[col_name] == unique_value).astype(int)

    # filter columns
    used_columns = model_config['used_columns']
    df_used = df[used_columns]
    # scale
    X_scaled = model_config['scaler'].transform(df_used)

    model = model_config['model']
    if model_config['mode'] == 'regression':
        df['prediction'] = model.predict(X_scaled)
    elif model_config['mode'] == 'classification':
        df['prediction'] = model.predict_proba(X_scaled)[:, 1]

    prediction = df[['line_id', 'prediction']]
    prediction.to_csv(args.prediction_csv, index=False)

    print('Prediction time: {}'.format(time.time() - start_time))


Dataset read, shape (172, 41)
Prediction time: 0.06550836563110352


In [85]:
#scorer
from sklearn.metrics import mean_squared_error

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test-target-csv', type=argparse.FileType('r'), required=True)
    parser.add_argument('--prediction-csv', type=argparse.FileType('r'), required=True)

    argv = ['--test-target-csv', r'..\check_1_r\test-target.csv',
           '--prediction-csv', r'..\check_1_r\h2o_prediction.csv']
    args = parser.parse_args(argv)

    start_time = time.time()

    test_target = pd.read_csv(args.test_target_csv)
    prediction =  pd.read_csv(args.prediction_csv)
    
    rmse = mean_squared_error(test_target, prediction) ** 0.5
    print( 'RMSE:',rmse )
    print('Scoring time: {}'.format(time.time() - start_time))    
# baseline RMSE: RMSE: 8.095668993508879
# h2o AutoML RMSE: RMSE: 10.339605784611129

RMSE: 22.187721041236983
Scoring time: 0.0175020694732666


In [31]:
import h2o
from h2o.automl import H2OAutoML

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,18 hours 22 mins
H2O cluster timezone:,Europe/Moscow
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.8
H2O cluster version age:,4 days
H2O cluster name:,H2O_from_python_SBT_Shekhovtsov_RV_txmoqq
H2O cluster total nodes:,1
H2O cluster free memory:,3.295 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [36]:
df_h2o = df_X
df_h2o['target'] = df_y
#df_h2o_scaled = scaler.fit_transform(df_h2o)

In [48]:
train = h2o.H2OFrame(df_h2o)
test = h2o.H2OFrame(df_used)
x = train.columns
y = "target"
x.remove(y)

  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [49]:
train = train.scale()
test = test.scale()
aml = H2OAutoML(max_runtime_secs = 30)
aml.train(x = x, y = y,
          training_frame = train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [50]:
# View the AutoML Leaderboard
lb = aml.leaderboard
lb

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GBM_grid_0_AutoML_20180926_121705_model_0,0.128208,0.358061,0.128208,0.201276,
StackedEnsemble_BestOfFamily_0_AutoML_20180926_121705,0.13061,0.361399,0.13061,0.207824,
StackedEnsemble_AllModels_0_AutoML_20180926_121705,0.133861,0.365871,0.133861,0.206093,
XRT_0_AutoML_20180926_121705,0.139417,0.373386,0.139417,0.199515,
GBM_grid_0_AutoML_20180926_121705_model_8,0.141649,0.376363,0.141649,0.212985,
DRF_0_AutoML_20180926_121705,0.143858,0.379286,0.143858,0.208757,
GBM_grid_0_AutoML_20180926_121705_model_7,0.146123,0.38226,0.146123,0.250417,
GLM_grid_0_AutoML_20180926_121705_model_0,0.147317,0.383819,0.147317,0.234308,
GBM_grid_0_AutoML_20180926_121705_model_16,0.152451,0.390449,0.152451,0.205333,
GBM_grid_0_AutoML_20180926_121705_model_17,0.154075,0.392524,0.154075,0.213365,




In [None]:
import pandas as pd

preds = aml.leader.predict(test)

In [84]:
output_list = pd.DataFrame()
output_list['line_id'] = df['line_id']
output_list['prediction'] = preds.as_data_frame()['predict']
#df.index.delete
output_list.to_csv(r"..\check_1_r\h2o_prediction.csv", index=False )

In [83]:
#output_list['prediction'] = preds.as_data_frame()
#output_list.head()
output_list

Unnamed: 0,line_id
0,0
1,2
2,6
3,10
4,13
5,14
6,15
7,21
8,33
9,38
