In [1]:
import azureml.core
import pandas as pd
from azureml.core.workspace import Workspace
import logging
import os



Grab our workspace and set up our compute.

In [2]:
# Load the workspace from the local Azure ML config file and show a short
# summary table of where this notebook is running.
ws = Workspace.from_config()
experiment_name = 'automl_malware3-sami'
# project folder
project_folder = './automated-ml-regression-sami'

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder

# Show full cell contents instead of truncating long strings.
# Newer pandas releases expect None for "no limit" and reject negative values,
# while old releases only accept an int, so try None first and fall back to -1.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Last expression of the cell: rendered as a two-column key/value table.
pd.DataFrame(data=output, index=['']).T

Unnamed: 0,Unnamed: 1
SDK version,1.0.39
Subscription ID,3a859018-0a49-43f0-91f1-d33864d28a24
Workspace,teamsixamls
Resource Group,rgteamsixwesteurope
Location,westeurope
Project Directory,./automated-ml-regression-sami


In [3]:
# Attach to the remote AmlCompute cluster, provisioning it on first use.
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name under which the cluster is looked up / created in the workspace
cluster_name = "amllowpriority"

try:
    # Looking the target up raises ComputeTargetException when it is missing
    aml_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No such cluster yet: provision a low-priority one with up to 4 nodes
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        vm_priority='lowpriority',
        max_nodes=4,
    )
    aml_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports completion, streaming progress to the cell
aml_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [5]:
from azureml.core import Dataset, Datastore, Workspace

datastore_name = 'dsssami'

# SECURITY: never commit the storage account key in the notebook. It is read
# from the AZURE_STORAGE_KEY environment variable instead; a missing variable
# raises KeyError here rather than failing later inside the SDK call.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be rotated in the storage account.
storage_account_key = os.environ['AZURE_STORAGE_KEY']

# Register the blob container holding the prepared data as a datastore
ds = Datastore.register_azure_blob_container(workspace=ws,
                                             datastore_name=datastore_name,
                                             container_name='preppeddata',
                                             account_name='rgteamsixblob',
                                             account_key=storage_account_key,
                                             create_if_not_exists=True)

# get Datastore from the workspace
dstore = Datastore.get(ws, datastore_name)

# Build a dataset from the prepared training CSV stored in that datastore
dataset = Dataset.auto_read_files(dstore.path("prepped_train.csv"))

In [6]:
# Preview the first three rows of the dataset read from the datastore.
dataset.head(3)

Unnamed: 0,EngineVersionEncoded,AppVersionEncoded,RtpStateBitfield,IsSxsPassiveMode,AVProductsInstalled,CityIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,IsProtected,IeVerIdentifier,...,Census_FlightRingEncoded,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,1,17,7,0,0.4,129210,114,225,1,41,...,1,355,20246,0,0,0,0,0,10,0
1,1,0,7,0,0.2,58607,277,75,1,137,...,2,142,3104,0,0,0,0,0,1,1
2,5,10,7,0,0.4,16658,120,118,1,117,...,0,142,68897,0,0,0,0,0,1,0


In [7]:
# Register the dataset in the workspace under a fixed name so it can be looked
# up by that name through ws.datasets.
# NOTE(review): exist_ok=True presumably tolerates an already-registered name -- confirm.
dataset.register(ws,name="prepared_data_sami_challenge3",exist_ok=True)
# Cell output: the mapping of dataset names registered in the workspace.
ws.datasets

{'Raw Training Data': Dataset(Name: Raw Training Data,
 Workspace: teamsixamls), 'Training Data': Dataset(Name: Training Data,
 Workspace: teamsixamls), 'hacktrainingdataset': Dataset(Name: hacktrainingdataset,
 Workspace: teamsixamls), 'train': Dataset(Name: train,
 Workspace: teamsixamls), 'training': Dataset(Name: training,
 Workspace: teamsixamls), 'Raw Training Data 2': Dataset(Name: Raw Training Data 2,
 Workspace: teamsixamls), 'train01': Dataset(Name: train01,
 Workspace: teamsixamls), 'train-thet': Dataset(Name: train-thet,
 Workspace: teamsixamls), 'Training Data v2': Dataset(Name: Training Data v2,
 Workspace: teamsixamls), '20 percent training set': Dataset(Name: 20 percent training set,
 Workspace: teamsixamls), 'prepared_data': Dataset(Name: prepared_data,
 Workspace: teamsixamls), 'prepared_data_sami': Dataset(Name: prepared_data_sami,
 Workspace: teamsixamls), 'prepared_data_sami_challenge3': Dataset(Name: prepared_data_sami_challenge3,
 Workspace: teamsixamls)}

The environment is loaded. Pull in the dataset created in Challenge 1, replacing the data source with the whole training dataset. If you would like (for time purposes), sample down to about 500k rows instead of using the full set.

In [9]:
# Fetch the dataset registered under this name from the workspace; this rebinds
# `dataset` to the registered copy.
dataset = ws.datasets['prepared_data_sami_challenge3']

In [10]:
# Preview the first five rows of the registered dataset.
dataset.head(5)

Unnamed: 0,EngineVersionEncoded,AppVersionEncoded,RtpStateBitfield,IsSxsPassiveMode,AVProductsInstalled,CityIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,IsProtected,IeVerIdentifier,...,Census_FlightRingEncoded,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,1,17,7,0,0.4,129210,114,225,1,41,...,1,355,20246,0,0,0,0,0,10,0
1,1,0,7,0,0.2,58607,277,75,1,137,...,2,142,3104,0,0,0,0,0,1,1
2,5,10,7,0,0.4,16658,120,118,1,117,...,0,142,68897,0,0,0,0,0,1,0
3,0,0,7,0,0.2,116424,10,214,1,137,...,2,554,33115,0,0,0,0,0,10,1
4,1,4,7,0,0.2,13354,277,75,0,137,...,0,168,35257,0,0,0,0,1,11,1


In [11]:
from azureml.core.runconfig import DataReferenceConfiguration

# Describe how the datastore contents are made available on the compute target:
# the datastore root ('') is placed under /tmp/azureml_runs.
dr = DataReferenceConfiguration(
    datastore_name=dstore.name,
    path_on_datastore='',
    path_on_compute='/tmp/azureml_runs',
    mode='download',  # download files from datastore to compute target
    overwrite=False,
)

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
import pkg_resources

# create a new RunConfig object
conda_run_config = RunConfiguration(framework="python")

# Run on the AmlCompute cluster attached earlier in the notebook
conda_run_config.target = aml_cluster
# Set the data reference of the run configuration.
# Keyed by the datastore name; the datastore object in this notebook is `dstore`
# (the previous `store` was never defined and raised NameError).
conda_run_config.data_references = {dstore.name: dr}

# Packages needed by train.py: it imports pandas, lightgbm and sklearn, so
# scikit-learn must be part of the environment as well.
cd = CondaDependencies.create(pip_packages=['azureml-sdk','lightgbm'],
                              conda_packages=['numpy','pandas','scikit-learn'])

conda_run_config.environment.python.conda_dependencies = cd

In [None]:
# Local folder that receives the training script submitted to the cluster.
# NOTE(review): this rebinds `project_folder`, replacing the value set in the
# workspace cell near the top of the notebook.
project_folder = 'teamsix'
# exist_ok=True makes the cell safely re-runnable without a separate existence
# check (no check-then-create race); a regular file at this path is now reported
# as an error instead of being silently accepted.
os.makedirs(project_folder, exist_ok=True)

In [15]:
# Preview the first five rows of the dataset.
# NOTE(review): repeats an earlier dataset.head(5) cell; appears redundant.
dataset.head(5)

Unnamed: 0,EngineVersionEncoded,AppVersionEncoded,RtpStateBitfield,IsSxsPassiveMode,AVProductsInstalled,CityIdentifier,GeoNameIdentifier,LocaleEnglishNameIdentifier,IsProtected,IeVerIdentifier,...,Census_FlightRingEncoded,Census_FirmwareManufacturerIdentifier,Census_FirmwareVersionIdentifier,Census_IsVirtualDevice,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections
0,1,17,7,0,0.4,129210,114,225,1,41,...,1,355,20246,0,0,0,0,0,10,0
1,1,0,7,0,0.2,58607,277,75,1,137,...,2,142,3104,0,0,0,0,0,1,1
2,5,10,7,0,0.4,16658,120,118,1,117,...,0,142,68897,0,0,0,0,0,1,0
3,0,0,7,0,0.2,116424,10,214,1,137,...,2,554,33115,0,0,0,0,0,10,1
4,1,4,7,0,0.2,13354,277,75,0,137,...,0,168,35257,0,0,0,0,1,11,1


In [21]:
%%writefile pipelines_heather/train.py

"""Train a LightGBM binary classifier on the prepared data with fixed settings.

Reads the prepared CSV, holds out 30% of the rows for validation, trains a
gradient-boosted model, saves it to 'model.text' and prints the validation AUC.
"""
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb

# Location of the prepared CSV on the compute target.
# NOTE(review): assumes the notebook's data reference downloads the file
# directly under /tmp/azureml_runs -- confirm against the run configuration.
DATA_PATH = "/tmp/azureml_runs/prepped_train.csv"

# This file runs as a standalone script, so notebook variables such as
# `dataset` do not exist here; the data is read from disk instead.
# The first row is used as the header because the columns are addressed by
# name below ('HasDetections').
df_xy = pd.read_csv(DATA_PATH, delimiter=",", quotechar='"')

x = df_xy.drop('HasDetections', axis=1) #features
y = df_xy['HasDetections'] #target (our label)

# Fixed random_state keeps the 70/30 split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

# create dataset for lightgbm
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 9,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.42,
    'bagging_freq': 5,
    'verbose': 0,
    # NOTE(review): n_estimators is a documented alias of num_iterations, the
    # same setting as num_boost_round below -- confirm which value is intended.
    'n_estimators': 5,
    # Early stopping is given through params (documented parameter) instead of
    # the train() keyword, which newer LightGBM releases no longer accept.
    'early_stopping_round': 5,
}
print('Starting Training')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval])

# save model to file
print('Training Completed')
gbm.save_model('model.text')

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval: AUC on the held-out rows, matching the 'auc' training metric
print('The AUC of prediction is:', roc_auc_score(y_test, y_pred))



Overwriting pipelines_heather/train.py


In [None]:
%%writefile $project_folder/train.py

"""Train a LightGBM binary classifier with hyperparameters from the command line.

Intended to be launched once per hyperparameter combination. Reads the prepared
CSV, holds out 30% of the rows for validation, trains the model, saves it to
'model.text' and logs the validation AUC to the Azure ML run under the metric
name 'auc'.
"""
import argparse

import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from azureml.core import Run

# Retrieve command line arguments
parser = argparse.ArgumentParser(description='Train a LightGBM binary classifier on the prepared data')

# Input data.
# NOTE(review): the default assumes the data reference downloads the file
# directly under /tmp/azureml_runs -- confirm against the run configuration.
parser.add_argument('--data_path', type=str, default='/tmp/azureml_runs/prepped_train.csv',
                    help='path of the prepared training CSV on the compute target')

# Tunable hyperparameters
parser.add_argument('--num_leaves', type=int, default=9, help='max number of leaves in one tree')
parser.add_argument('--learning_rate', type=float, default=0.1, help='shrinkage rate')
parser.add_argument('--feature_fraction', type=float, default=0.9, help='fraction of features sampled per tree')
parser.add_argument('--bagging_fraction', type=float, default=0.42, help='fraction of rows sampled when bagging')
parser.add_argument('--bagging_freq', type=int, default=5, help='perform bagging every k iterations (0 disables)')
parser.add_argument('--n_estimators', type=int, default=None,
                    help='number of boosting iterations; overrides --num_boost_round when given')

# Fixed training settings
parser.add_argument('--boosting_type', type=str, default='gbdt', help='LightGBM boosting type')
parser.add_argument('--objective', type=str, default='binary', help='LightGBM objective')
parser.add_argument('--metric', type=str, default='auc', help='LightGBM evaluation metric')
parser.add_argument('--num_boost_round', type=int, default=20, help='number of boosting iterations')
parser.add_argument('--valid_sets', type=str, default=None,
                    help='ignored; accepted only for backward compatibility (the hold-out split is always used)')
parser.add_argument('--early_stopping_rounds', type=int, default=5,
                    help='stop when the validation metric has not improved for this many rounds')
parser.add_argument('--verbose', type=int, default=0, help='LightGBM verbosity')

args = parser.parse_args()

# Handle to the Azure ML run this script executes in, used for metric logging
run = Run.get_context()

# The first row is used as the header because the columns are addressed by
# name below ('HasDetections'); header=None would replace the names by integers.
df_xy = pd.read_csv(args.data_path, delimiter=",", quotechar='"')

x = df_xy.drop('HasDetections', axis=1) #features
y = df_xy['HasDetections'] #target (our label)

# Fixed random_state keeps the 70/30 split identical across hyperparameter runs
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=0)

# create dataset for lightgbm
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': args.boosting_type,
    'objective': args.objective,
    'metric': args.metric,
    'num_leaves': args.num_leaves,
    'learning_rate': args.learning_rate,
    'feature_fraction': args.feature_fraction,
    'bagging_fraction': args.bagging_fraction,
    'bagging_freq': args.bagging_freq,
    'verbose': args.verbose,
    # Given through params (documented parameter) rather than as a train()
    # keyword, which newer LightGBM releases no longer accept.
    'early_stopping_round': args.early_stopping_rounds,
}

# n_estimators and num_boost_round are aliases of the same LightGBM setting;
# resolve them to a single value instead of passing both.
num_rounds = args.n_estimators if args.n_estimators is not None else args.num_boost_round

print('Starting Training')
# train; the validation set must be a Dataset object (not its name as a
# string) and is required for early stopping to have something to monitor
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=num_rounds,
                valid_sets=[lgb_eval])

# save model to file
print('Training Completed')
gbm.save_model('model.text')

# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# eval: log the validation AUC under the name the hyperparameter search
# is configured to optimise (primary_metric_name="auc")
auc = roc_auc_score(y_test, y_pred)
print('The AUC of prediction is:', auc)
run.log('auc', auc)



In [None]:

from azureml.core.experiment import Experiment
from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import (
    BanditPolicy,
    HyperDriveConfig,
    PrimaryMetricGoal,
    RandomParameterSampling,
    choice,
    uniform,
)
from azureml.widgets import RunDetails

# Settings that are the same for every run are passed as fixed script
# arguments rather than "sampled" from single-value choices.
# NOTE(review): n_estimators and num_boost_round are aliases of the same
# LightGBM setting -- confirm which of the two values is intended.
script_params = {
    '--boosting_type': 'gbdt',
    '--objective': 'binary',
    '--metric': 'auc',
    '--n_estimators': 5,
    '--num_boost_round': 20,
    '--early_stopping_rounds': 5,
}

# The entry script is written to `project_folder` by the %%writefile cell, so
# that folder (not an unrelated directory) is the source directory.
sk_est = Estimator(source_directory=project_folder,
                   script_params=script_params,
                   compute_target=aml_cluster,
                   entry_script='train.py',
                   conda_packages=['numpy','pandas','scikit-learn', 'lightgbm'],
                   pip_packages=['azureml-sdk'])

# Attach the data reference to the estimator's own run configuration; this is
# how the training script gets the data on the compute target.
# NOTE(review): relies on Estimator.run_config exposing a mutable
# RunConfiguration -- confirm for the installed SDK version.
sk_est.run_config.data_references = {dstore.name: dr}

# Search space: only the hyperparameters that actually vary between runs
param_sampling = RandomParameterSampling( {
    'num_leaves': choice(range(6,13)),
    'learning_rate': uniform(0.05, 1.5),
    'feature_fraction': uniform(0.5, 1),
    'bagging_fraction': uniform(0.2, .8),
    'bagging_freq': choice(1,2,3,4,5),
    }
)

early_termination_policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

# Only the estimator is passed: HyperDriveConfig takes either an estimator or
# a run_config, and conda_run_config is a plain RunConfiguration anyway.
# primary_metric_name must match the name logged by train.py.
hyperdrive_run_config = HyperDriveConfig(estimator=sk_est,
                          hyperparameter_sampling=param_sampling,
                          policy=early_termination_policy,
                          primary_metric_name="auc",
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                          max_total_runs=20,
                          max_concurrent_runs=1)

# `ws` is the workspace object created at the top of the notebook
experiment = Experiment(ws, experiment_name)
hyperdrive_run = experiment.submit(hyperdrive_run_config)

RunDetails(hyperdrive_run).show()

# The widget above only monitors the run; block here so that the best run is
# queried after the child runs have finished.
hyperdrive_run.wait_for_completion(show_output=True)

best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['Arguments']

print('Best Run Id: ', best_run.id)
# The run logs 'auc' (the primary metric), not 'accuracy'
print('\n AUC:', best_run_metrics['auc'])
# Print the full argument list instead of fixed positions with unrelated labels
print('\n Arguments:', parameter_values)
