## Machine Learning Tutorials with Watson Machine Learning
### Part 2 - Multivariate Linear Regression 

for use in IBM Data Science Experience

### 1.1 Create data frames

In [None]:
import random
import numpy as np
import pylab as pl

from pyspark.sql.types import *
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml import Pipeline, Model
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact

import urllib3, requests, json

In [None]:
# Three feature columns, each holding the values 0.0 .. 99.0.
# NOTE: x1, x2 and x3 are identical sequences, so the three features are
# perfectly collinear in this synthetic data set.
x1 = [float(num) for num in range(100)]
x2 = [float(num) for num in range(100)]
x3 = [float(num) for num in range(100)]

# Noisy samples of y = x1 + 2x2 + 3x3; each term gets its own noise draw.
y = [
    x1[i] + random.random() * random.uniform(-1.5, 1.5)
    + 2 * x2[i] + random.random() * random.uniform(-1.5, 1.5)
    + 3 * x3[i] + random.random() * random.uniform(-1.5, 1.5)
    for i in range(0, 100)
]

# One row per sample, label first: (y, x1, x2, x3).
xytuple = zip(y, x1, x2, x3)

# Schema for the Spark dataframe: a label column followed by the features.
fields = [
    StructField("label", DoubleType(), True),
    StructField("x1", DoubleType(), True),
    StructField("x2", DoubleType(), True),
    StructField("x3", DoubleType(), True),
]
schema = StructType(fields)

# Build the dataframe and split it 80/10/10 into training and two test sets.
df = sqlContext.createDataFrame(xytuple, schema)
splits = df.randomSplit([0.8, 0.1, 0.1])
trainingdf, test1df, test2df = splits

trainingdf.show()

### 1.2 Create Spark ML Pipeline
Group the x1, x2, and x3 values into a single feature vector. This is what lets the regression stage work with multiple features at once.

In [None]:
# Pipeline stage that packs columns x1, x2 and x3 into one "features" column.
featuresvector = VectorAssembler(
    inputCols=["x1", "x2", "x3"],
    outputCol="features",
)

Create a linear regression instance.

In [None]:
# Linear regression estimator: at most 10 iterations, with regularization
# controlled by regParam and elasticNetParam.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

Multiple ML steps can be executed at once by using a pipeline. Here we build a pipeline that first combines the features into a single vector and then fits a linear regression model on the resulting (label, features) data. We then run this pipeline on our training data.

In [None]:
# Chain the feature assembler and the regression estimator into one pipeline.
pipeline = Pipeline(stages=[featuresvector, lr])

# Fit the pipeline on the training split. The resulting model is needed by
# the prediction cell that follows, so this call must not be commented out.
pipelinemodel = pipeline.fit(trainingdf)

### 1.3 Use model on test1 dataframe

In [None]:
# Score the first held-out split with the fitted pipeline model.
predictions = pipelinemodel.transform(test1df)

# Show the true label next to the model's prediction.
label_vs_prediction = predictions.select('label', 'prediction')
label_vs_prediction.show()

### 1.4 Exercise
#### The code above performs a linear regression training on data of the form y = x1 + 2x2 + 3x3, plus some noise. Create data frames and deploy a model on the function y = -x1 - 5x2 - 10x3

#### From the above code, what needs to be changed, what can stay the same?

In [None]:
### create data that looks like y = -x1 - 5x2 - 10x3, plus some noise
# NOTE: x1, x2 and x3 are identical sequences (0.0 .. 99.0), so the three
# features are perfectly collinear in this synthetic data set.
x1 = [num / 1.0 for num in range(100)]
x2 = [num / 1.0 for num in range(100)]
x3 = [num / 1.0 for num in range(100)]

# Noisy samples of y = -x1 - 5x2 - 10x3; each term gets its own noise draw.
# The x1 term is negated so the data matches the target function.
y = [
    -x1[i] + random.random() * random.uniform(-1.5, 1.5)
    - 5 * x2[i] + random.random() * random.uniform(-1.5, 1.5)
    - 10 * x3[i] + random.random() * random.uniform(-1.5, 1.5)
    for i in range(100)
]

# One row per sample, label first: (y, x1, x2, x3).
xytuple = zip(y, x1, x2, x3)

### convert to three pyspark dataframes: newtrainingdf, newtest1df, & newtest2df
# Schema for the Spark dataframe: a label column followed by the features.
schema = StructType([
    StructField("label", DoubleType(), True),
    StructField("x1", DoubleType(), True),
    StructField("x2", DoubleType(), True),
    StructField("x3", DoubleType(), True)
])

# Build the dataframe and split it 80/10/10 into training and two test sets.
df = sqlContext.createDataFrame(xytuple, schema)
newtrainingdf, newtest1df, newtest2df = df.randomSplit([0.8, 0.1, 0.1])

newtrainingdf.show()


In [None]:
### build the model by fitting the pipeline: newpipelinemodel
# Hint - the pipeline stages are unchanged; only the training data is new.
newpipelinemodel = pipeline.fit(newtrainingdf)

### predict on the newtest1df data
newpredictions = newpipelinemodel.transform(newtest1df)

### look at the result by calling show() on the new prediction data frame
new_label_vs_prediction = newpredictions.select('label', 'prediction')
new_label_vs_prediction.show()

### 1.5 Save model for y = -x1 - 5x2 - 10x3 to Watson Machine Learning

In [None]:
# Connection details for your own Watson Machine Learning service instance;
# replace the '***' placeholders and your name before running this cell.
service_path = 'https://ibm-watson-ml.mybluemix.net'
instance_id = '***'
username = '***'
password = '***'
yourname = 'Enter Your Name Here'

# Prefix the model name with your name so it can be told apart from others.
modelname = yourname + ' y = -x1 - 5x2 - 10x3'

ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(username, password)

### build model artifact by passing model pipeline, name, and training data as parameters
# Save the model fitted on the y = -x1 - 5x2 - 10x3 data (newpipelinemodel),
# together with the dataframe it was trained on (newtrainingdf).
model_artifact = MLRepositoryArtifact(newpipelinemodel, name=modelname, training_data=newtrainingdf)
saved_model = ml_repository_client.models.save(model_artifact)

# The uid identifies the saved model; a partner needs it to load the model.
print(saved_model.uid)

### 1.6 Using a Watson Machine Learning Model

Ask a partner for the information below, including the model uid for their y = -x1 - 5x2 - 10x3 model.

In [None]:
# Connection details for your partner's Watson Machine Learning service;
# replace the '***' placeholders with the values your partner gives you.
service_path = 'https://ibm-watson-ml.mybluemix.net'
partner_instance_id = '***'
partner_username = '***'
partner_password = '***'
partner_saved_model_uid = '***'

# Rebind the client and authorize with the partner's credentials, so the
# repository queried below is the partner's rather than your own.
ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(partner_username, partner_password)

Load this model and print out its name to verify it is from your partner's WML service.

In [None]:
# Fetch the partner's saved model by uid and print its name as a check.
loadedModel = ml_repository_client.models.get(partner_saved_model_uid)
loaded_model_name = str(loadedModel.name)
print(loaded_model_name)

In [None]:
# Score the second held-out split (newtest2df) with the partner's model and
# show the input features next to the true label and the prediction.
partner_predictions = loadedModel.model_instance().transform(newtest2df)
partner_predictions.select('x1', 'x2', 'x3', 'label', 'prediction').show()