## Scikit-Learn Preprocessing and Training Pipeline
##### from sklearn.feature_extraction.text import TfidfVectorizer
##### from sklearn.naive_bayes import MultinomialNB
### Using data from Azure datastore and SAP Datasphere

## Install fedml_azure package

In [None]:
pip install fedml_azure --force-reinstall

## Import the libraries needed in this notebook

In [None]:
from fedml_azure import DwcAzureTrain

## Set up
### Creating a Training object and setting the workspace, compute target, and environment.

Before running the cell below, ensure that you have a workspace, and replace the subscription_id, resource_group, and workspace_name placeholders with your own values.

The fedml_azure library must be passed to the pip_packages key in environment_args, and to use scikit-learn you must also pass its name to conda_packages.


In [None]:
# Build the training object. The constructor receives the workspace,
# experiment, environment, and compute settings in a single call.
training = DwcAzureTrain(
    workspace_args={
        "subscription_id": '<subscription_id>',
        "resource_group": '<resource_group>',
        "workspace_name": '<workspace_name>',
    },
    experiment_args={'name': 'test-2'},
    environment_type='CondaPackageEnvironment',
    environment_args={
        'name': 'test-env-prep',
        'conda_packages': ['scikit-learn'],
        'pip_packages': ['fedml_azure'],
    },
    compute_type='AmlComputeCluster',
    compute_args={
        'vm_size': 'Standard_D12_v2',
        'vm_priority': 'lowpriority',
        'compute_name': 'cpu-clu-prep',
        'min_nodes': 0,
        'max_nodes': 1,
        'idle_seconds_before_scaledown': 1700,
    },
)


### Since this model is using data stored in Azure and SAP Datasphere, we need to get the data that was uploaded to Azure so we can pass it to the training script.
For information on how this specific data was uploaded to Azure, please refer to `upload_data_to_datastore.ipynb`.


In [None]:
from azureml.core import Dataset, Datastore
# Look up the 'workspaceblobstore' datastore in the training object's workspace.
datastore = Datastore.get(
    workspace=training.workspace,
    datastore_name='workspaceblobstore',
)
# Bare expression so the notebook displays the datastore.
datastore

In [None]:
# Build a tabular dataset from the training CSV in the datastore, then load
# it into pandas to preview the first rows.
train_dataset = Dataset.Tabular.from_delimited_files(
    path=[(datastore, 'dataset/imdb_train.csv')],
)
df = train_dataset.to_pandas_dataframe()
df.head()

### Then, we need to generate the run config. This is needed to package the configuration specified so we can submit a job for training. 

Before running the following cell, you should have a config.json file with the specified values that give you access to SAP Datasphere. Provide the path to this file as config_file_path in the cell below.

You should also have the view IMDB_TEST_VIEW created in your SAP Datasphere. To get the data for this view, please refer to https://www.kaggle.com/mantri7/imdb-movie-reviews-dataset?select=train_data+%281%29.csv and download the test dataset.

https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.scriptrunconfig?view=azure-ml-py

In [None]:
# Generate the run config from the SAP Datasphere config file and the
# training script settings.
src = training.generate_run_config(
    config_file_path='dwc_configs/config.json',
    config_args={
        'source_directory': 'Scikit-Learn-Preprocessor-Training-Pipeline',
        'script': 'train_script.py',
        # Arguments supplied alongside the script, as flag/value pairs.
        'arguments': [
            '--model_file_name', 'pipeline.pkl',
            '--table_name', 'IMDB_TEST_VIEW',
            '--table_size', 1,
            '--data', train_dataset.as_named_input('train_data'),
        ],
    },
)

### Submitting the job for training

In [None]:
# Submit the generated run config as a training run.
run = training.submit_run(src)

## Register the model for deployment

In [None]:
# Register the pipeline saved by the training run so it can be deployed.
model = training.register_model(
    run=run,
    model_args={
        'model_name': 'sklearn_pipeline_model',
        'model_path': 'outputs/pipeline.pkl',
    },
    resource_config_args={'cpu': 1, 'memory_in_gb': 0.5},
    is_sklearn_model=True,
)
# Show the name and version of the registered model.
print('Name:', model.name)
print('Version:', model.version)