<img src="https://cybersecurity-excellence-awards.com/wp-content/uploads/2017/06/366812.png">


<h1><center>Darwin Unsupervised Model Building (Draft Version) </center></h1>


Note:
1. Set the dataset path and server URL before creating the model.
2. Set the dataset path for feature importance; the dataset should contain no more than 500 rows.
3. For every run, look up the job status (i.e. requested, failed, running, completed) and wait for the job to complete before proceeding.
4. Resume training provides a way to perform additional training.
5. Comparing labels only applies if labels are available for the unsupervised data.

## Import libraries

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image
from time import sleep
import os

from amb_sdk.sdk import DarwinSdk

## Setup

In [None]:
# Path of the CSV dataset used to build the model
path = '/Set/the/path/to/your/dataset/here'

# Darwin API endpoint
server_url = 'https://darwin-api.sparkcognition.com/v1/'

# Register user - pick username & password (only for new users).
# Replace the placeholder credentials below; do not commit real ones.
api_key = 'KEY'
password = 'password'
ds = DarwinSdk()
# Point the client at the server BEFORE any auth call; previously the URL
# was only set after login/registration had already been attempted.
ds.set_url(server_url)
status, msg = ds.auth_login(password, api_key)
if not status:
    print(msg)
status, msg = ds.auth_register_user('user', 'password')
if not status:
    print(msg)

# Login - put in the registered username & password
ds = DarwinSdk()
# A fresh client does not carry over the URL set on the previous instance,
# so it has to be configured again before logging in.
ds.set_url(server_url)
status, msg = ds.auth_login_user('user','password')
if not status:
    print(msg)

In [None]:
# Load the dataset at `path` into a DataFrame and preview its first rows
df = pd.read_csv(path)
df.head(n=5)

## Upload Data

In [None]:
# Upload dataset
# NOTE: as the method names indicate, these two calls wipe every model and
# dataset already stored for this account - remove them if that is not wanted.
# (They previously referenced an undefined name `s` instead of the `ds` client.)
ds.delete_all_models()
ds.delete_all_datasets()
dataset_name = 'your_dataset_name'

status, dataset = ds.upload_dataset(path, dataset_name)
if not status:
    # On failure the second return value is shown as the error details
    print(dataset)

## Create and Train Model 

In [None]:
# Build model
model = "your_model_name"
# `dataset_name` is the name assigned in the upload cell (the original passed
# the undefined variable `dataset_names`, which raised NameError).
status, job_id = ds.create_model(dataset_names=dataset_name,
                                 model_name=model,
                                 max_epochs=20,
                                 n_clusters=2)
if status:
    # Wait for the training job before moving on to the next cells
    ds.wait_for_job(job_id['job_name'])
else:
    # On failure the second return value is shown as the error details
    print(job_id)

In [None]:
# Look up the status of the job started by the most recent train call
# (uses the `ds` client; the original referenced an undefined name `a`)
ds.lookup_job_status_name(job_id['job_name'])

In [None]:
# Look up the model produced by the most recent train call
# (uses the `ds` client; the original referenced an undefined name `a`)
ds.lookup_model_name(job_id['model_name'])

## Extra Training (Optional)

In [None]:
# Optional: continue training the existing model on the same dataset
status, job_id = ds.resume_training_model(
    dataset_names=dataset_name,
    model_name=model,
    n_clusters=2,
    supervised=False,
)
if not status:
    # The second return value is printed as the failure details
    print(job_id)
else:
    # Wait for the additional training job before continuing
    ds.wait_for_job(job_id['job_name'])

## Predict 

In [None]:
# Test model: run the trained model against the uploaded dataset
status, artifact = ds.run_model(dataset_name, 
                                model, 
                                supervised=False)
if status:
    # Short pause before polling, kept from the original cell
    sleep(1)
    ds.wait_for_job(artifact['job_name'])
else:
    # Same failure handling as the training cells: show the returned details
    # instead of indexing into them.
    print(artifact)

In [None]:
# Get predictions: download the artifact produced by run_model and load it
status, pred_file = ds.download_artifact(artifact['artifact_name'])
if status:
    prediction = pd.read_csv(pred_file['filename'])
else:
    # Show the returned details rather than failing on pred_file['filename']
    print(pred_file)

## View Prediction

In [None]:
# Preview the first rows of the downloaded predictions
prediction.head(n=5)

In [None]:
# Figure 1: cluster assignment per sample, drawn as unconnected dots
cluster_plot_kwargs = dict(title='Clusters', marker='.', linewidth=0)
prediction['prediction'].plot(**cluster_plot_kwargs)
plt.xlabel('sample')

# Figure 2: anomaly score per sample, on a separate figure
plt.figure()
prediction['anomaly'].plot(title='Anomaly Score')

## Comparing labels (if they exist) to clusters (Optional)

In [None]:
# Compare the dataset's 'class' column against the predicted clusters
from sklearn.metrics import classification_report, confusion_matrix

y_true = df['class']
y_pred = prediction['prediction']

cm = confusion_matrix(y_true, y_pred)
print(cm)
print(classification_report(y_true, y_pred))

## Analyze Model

In [None]:
# Start a model-analysis job under fixed job/artifact names so the following
# cells can refer to them by name.
status, analyze_id = ds.analyze_model(job_id['model_name'], 
                                      job_name='Darwin_analyze_model_job', 
                                      artifact_name='Darwin_analyze_model_artifact')
if not status:
    # Same failure handling as the upload/train cells: show the details
    print(analyze_id)

In [None]:
# Wait on the job named in the analyze_model call before proceeding
ds.wait_for_job('Darwin_analyze_model_job')

In [None]:
# Show the status of the analyze-model job (displayed as the cell output)
ds.lookup_job_status_name('Darwin_analyze_model_job')

In [None]:
# Download the artifact named in the analyze_model call; the call's return
# value is displayed as the cell output
ds.download_artifact('Darwin_analyze_model_artifact')

## Analyze Data

In [None]:
# Start a data-analysis job for the uploaded dataset under fixed job/artifact
# names; the call's return value is displayed as the cell output.
ds.analyze_data(
    dataset_name,
    job_name='Darwin_analyze_data_job',
    artifact_name='Darwin_analyze_data_artifact',
)

In [None]:
# Wait on the job named in the analyze_data call before proceeding
ds.wait_for_job('Darwin_analyze_data_job')

In [None]:
# Show the status of the analyze-data job (displayed as the cell output)
ds.lookup_job_status_name('Darwin_analyze_data_job')

In [None]:
# Download the artifact named in the analyze_data call; the call's return
# value is displayed as the cell output
ds.download_artifact('Darwin_analyze_data_artifact')

## Feature Importance (by rows)

In [None]:
# Upload the data that you are interested in feature importance (max: 500 rows)
# NOTE: this reassigns `path`, so it no longer points at the training dataset.
path = '/Set/the/path/to/the/dataset.csv'
status, response = ds.upload_dataset(path)
print(status)
print(response)
if status:
    dataset_by_row = response['dataset_name']
    # Only start the analysis once the upload succeeded; previously this call
    # ran unconditionally and raised NameError on `dataset_by_row` after a
    # failed upload.
    analysis = ds.analyze_predictions(job_id['model_name'],
                                      dataset_by_row,
                                      job_name='Analyze_prediction_job',
                                      artifact_name='Analyze_prediction_artifact')
    # The call is no longer the cell's last expression, so print its result
    print(analysis)
else:
    print("Upload data failed!")

In [None]:
# Wait on the job named in the analyze_predictions call before proceeding
ds.wait_for_job('Analyze_prediction_job')

In [None]:
# Show the status of the analyze-predictions job (displayed as the cell output)
ds.lookup_job_status_name('Analyze_prediction_job')

In [None]:
# Download the artifact named in the analyze_predictions call; the call's
# return value is displayed as the cell output
ds.download_artifact('Analyze_prediction_artifact')