import argparse
import pandas as pd
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.externals import joblib


if __name__ == '__main__':
    # Training entry point: parse the SageMaker-style arguments, load the
    # training channel, fit a random forest regressor and persist it.
    parser = argparse.ArgumentParser()

    # SageMaker-specific arguments. Defaults come from environment variables;
    # .get() is used so the script can also run outside SageMaker when the
    # flags are passed explicitly (a plain os.environ[...] lookup would raise
    # KeyError before the command line is even parsed).
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))

    args = parser.parse_args()

    # Only --train and --model-dir are consumed below (--output-data-dir is
    # parsed but unused), so those are the two that must be resolvable.
    missing = [flag for flag, value in (('--train', args.train), ('--model-dir', args.model_dir))
               if not value]
    if missing:
        parser.error('missing required setting(s): {} (pass the flag or set the '
                     'corresponding SM_* environment variable)'.format(', '.join(missing)))

    # Collect every regular file in the training channel. Sorting makes the
    # row order of the combined frame independent of os.listdir() ordering.
    input_files = sorted(
        path
        for path in (os.path.join(args.train, name) for name in os.listdir(args.train))
        if os.path.isfile(path)
    )
    if not input_files:
        raise ValueError(('There are no files in {}.\n'
                          'This usually indicates that the channel ({}) was incorrectly specified,\n'
                          'the data specification in S3 was incorrectly specified or the role specified\n'
                          'does not have permission to access the data.').format(args.train, "train"))

    # Read all files into a single dataframe. Previously only the first
    # listed file was loaded and the rest of the channel was silently ignored.
    # Assumes every file is a CSV with a header row and the same columns.
    train_data = pd.concat(
        [pd.read_csv(path) for path in input_files],
        ignore_index=True,
    )

    # Labels are in the first column; every remaining column is a feature.
    train_y = train_data.iloc[:, 0]
    train_X = train_data.iloc[:, 1:]

    # Train a random forest regressor on the loaded data.
    rf = RandomForestRegressor(n_estimators=200, max_depth=4, min_samples_leaf=20)
    rf.fit(train_X, train_y)

    # Persist the fitted model; model_fn() loads it back under the same file name.
    joblib.dump(rf, os.path.join(args.model_dir, "model.joblib"))


def model_fn(model_dir):
    """Deserialize and return the fitted model stored in ``model_dir``.

    The file name used here must match the one the main training block
    uses when it serializes the model.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Load the training CSV into a DataFrame.
ion_data = pd.read_csv('data/train_data_with_features.csv')

In [4]:
# Load the test CSV into a DataFrame; later cells select its feature
# columns by the same names used for the training data.
test_Data = pd.read_csv('data/test_data_with_features.csv')

In [8]:
ion_data.head()

Unnamed: 0,time,signal,open_channels,batch,simple_moving_avg_50,rolling_std_50,simple_moving_avg_50_batch,moving_std_50_batch
0,0.0001,-2.76,0,0,0.0,0.0,0.0,0.0
1,0.0002,-2.8557,0,0,0.0,0.0,0.0,0.0
2,0.0003,-2.4074,0,0,0.0,0.0,0.0,0.0
3,0.0004,-3.1404,0,0,0.0,0.0,0.0,0.0
4,0.0005,-3.1525,0,0,0.0,0.0,0.0,0.0


In [9]:
# Feature matrix: the raw signal plus its moving-average / moving-std
# columns, extracted as a plain array.
X = ion_data[[
    'signal',
    'simple_moving_avg_50',
    'rolling_std_50',
    'simple_moving_avg_50_batch',
    'moving_std_50_batch',
]].values

# Regression target: the open_channels column.
y = ion_data['open_channels'].values
 

In [10]:
# Test-set feature matrix, using the same columns (in the same order) as
# the training features.
test_values = test_Data[[
    'signal',
    'simple_moving_avg_50',
    'rolling_std_50',
    'simple_moving_avg_50_batch',
    'moving_std_50_batch',
]].values
 

In [7]:
# Random forest regressor: 200 trees, depth capped at 4, and at least
# 20 samples required in every leaf.
rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=4,
    min_samples_leaf=20,
)

In [11]:
# Fit the forest on the training features X and targets y; the repr that
# follows is the notebook echoing the value returned by fit().
rf.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=20, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=200,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [12]:
# Predict one continuous (float) value per row of the test feature matrix.
y_pred=rf.predict(test_values)

In [13]:
y_pred

array([0.00198237, 0.00198237, 0.00198237, ..., 0.00198237, 0.00198237,
       0.00198237])

In [14]:
# Build the submission table: one row per test sample.
submission5 = pd.DataFrame()

# Write time with exactly four decimals so it is emitted as fixed-width text
# rather than whatever float repr to_csv would choose.
submission5['time'] = test_Data.time.apply('{:.4f}'.format)

# The regressor returns continuous values. Round to the nearest whole channel
# count before casting: a bare astype(int) truncates toward zero, so a
# prediction of 2.9 would have been written as 2.
submission5['open_channels'] = y_pred.round().astype(int)

submission5.to_csv('data/RandomForest_submission5.csv', index=False)

In [15]:
submission5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 2 columns):
time             object
open_channels    int64
dtypes: int64(1), object(1)
memory usage: 30.5+ MB
