## Machine Learning Tutorials with Watson Machine Learning
### Part 2 - Multivariate Linear Regression 

for use in IBM Data Science Experience

### 1.1 Create data frames

In [1]:
import random
import numpy as np
import pylab as pl

from pyspark.sql.types import *
from pyspark.ml.linalg import SparseVector, VectorUDT
from pyspark.ml import Pipeline, Model
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

from repository.mlrepositoryclient import MLRepositoryClient
from repository.mlrepositoryartifact import MLRepositoryArtifact

import urllib3, requests, json

In [3]:
x1 = [num / 1.0 for num in range(100)]
x2 = [num / 1.0 for num in range(100)]
x3 = [num / 1.0 for num in range(100)]
y = range(100)

for i in range(0,100):
    y[i] = x1[i] + random.random() * random.uniform(-1.5,1.5) + \
           2 * x2[i] + random.random() * random.uniform(-1.5,1.5) + \
           3 * x3[i] + random.random() * random.uniform(-1.5,1.5)

xytuple = zip(y,x1,x2,x3)


schema = StructType([
    StructField("label", DoubleType(), True),
    StructField("x1", DoubleType(), True),
    StructField("x2", DoubleType(), True),
    StructField("x3", DoubleType(), True)
])

df = sqlContext.createDataFrame(xytuple, schema)
trainingdf, test1df, test2df = df.randomSplit([0.8, 0.1, 0.1])

trainingdf.show()

+------------------+----+----+----+
|             label|  x1|  x2|  x3|
+------------------+----+----+----+
|1.2893073437915807| 0.0| 0.0| 0.0|
| 4.598080974185647| 1.0| 1.0| 1.0|
|12.161442572193826| 2.0| 2.0| 2.0|
| 19.58362605290612| 3.0| 3.0| 3.0|
| 24.67758381796672| 4.0| 4.0| 4.0|
|30.486009867208384| 5.0| 5.0| 5.0|
| 35.65789522080228| 6.0| 6.0| 6.0|
| 40.94325750297908| 7.0| 7.0| 7.0|
| 54.51674647593322| 9.0| 9.0| 9.0|
| 66.09696086066532|11.0|11.0|11.0|
|  72.4114916558236|12.0|12.0|12.0|
| 77.19666817685851|13.0|13.0|13.0|
| 85.75529601258764|14.0|14.0|14.0|
| 90.08228553533782|15.0|15.0|15.0|
| 95.90588609542836|16.0|16.0|16.0|
|102.46249621905902|17.0|17.0|17.0|
|108.83463298031684|18.0|18.0|18.0|
|118.55706616173079|20.0|20.0|20.0|
|126.64086531630691|21.0|21.0|21.0|
| 130.7632000502886|22.0|22.0|22.0|
+------------------+----+----+----+
only showing top 20 rows



### 1.2 Create Spark ML Pipeline
group x values into single feature vector this will be beneficial in the next exercise, involving multiple features

In [4]:
featuresvector = VectorAssembler(inputCols=["x1", "x2", "x3"], outputCol="features")

create linear regression instance

In [5]:
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

multiple ML steps can be executed at once by utilizing a pipeline, here we build out a pipeline that combines features in a single vector, then executes a linear regression model on the resulting (label, feature). We then run this pipeline using our training data

In [6]:
pipeline = Pipeline(stages =[featuresvector, lr])
pipelinemodel = pipeline.fit(trainingdf)

### 1.3 Use model on test1 dataframe

In [7]:
predictions = pipelinemodel.transform(test1df)
predictions.select('label','prediction').show()

+------------------+------------------+
|             label|        prediction|
+------------------+------------------+
| 59.55696932084194| 60.45981050560152|
|114.55018658700274|114.34873785121408|
|149.12132404935556| 150.2746894149558|
|228.69183616556722|228.11425113639615|
|247.59997960516515|  246.077226918267|
|449.22299610168005|449.65761911280333|
|473.36216615059374| 473.6082534886311|
|497.49228828006767| 497.5588878644589|
| 568.5446163235997| 569.4107909919423|
+------------------+------------------+



### 1.4 Exercise
#### The code above perform a linear regression training on data of the order y=x1 + 2x2 + 3x3 , plus some noise. Create data frames and deploy a model on the function y = -x1 - 5x2 - 10x3

#### From the above code, what needs to be changed, what can stay the same?

In [None]:
### create data the looks like y = -x1 - 5x2 - 10x3, plus some noise

### convert to three pyspark dataframes: newtrainingdf, newtest1df, & newtest2df



In [None]:
### build model pipeline, pipelinemodel3x
raise NotImplementedError
newpipelinemodel = pipeline.fit(newtrainingdf)

### predict on test1_3x data
newpredictions = newpipelinemodel.transform(newtest1df)


### look at result by calling show() on the new prediction data frame, prediction3x
raise NotImplementedError

### 1.5 Save model for y = -x1 - 5x2 - 10x3 to Watson Machine Learning

In [5]:
service_path = 'https://ibm-watson-ml.mybluemix.net'
instance_id = '***'
username = '***'
password = '***'
yourname = 'Enter Your Name Here'

modelname = modelname = name + ' y = -x1 - 5x2 - 10x3'

ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(username, password)

### build model artifact by pasing model pipeline, name, and training data as parameters
model_artifact = MLRepositoryArtifact(pipelinemodel, name=modelname, training_data=training3x)
saved_model = ml_repository_client.models.save(model_artifact)

print saved_model.uid

bbea871f-a1e0-4e03-8ddd-df80d469af43


### 1.6 Using a Watson Machine Learning Model

Ask a partner for the information below, including the model uid for their y=3x model

In [None]:
service_path = 'https://ibm-watson-ml.mybluemix.net'
partner_instance_id = '***'
partner_username = '***'
partner_password = '***'
partner_saved_model_uid = '***'

ml_repository_client = MLRepositoryClient(service_path)
ml_repository_client.authorize(username, password)

load this model and print out it's name to verify it is from your partner's WML service

In [6]:
loadedModel = ml_repository_client.models.get(partner_saved_model_uid)
print str(loadedModel.name)

xy linear regression


In [7]:
loadedModel.model_instance().transform(test2_3x).select('x','label','prediction').show()

+----+------------------+------------------+
|   x|             label|        prediction|
+----+------------------+------------------+
| 3.0|               3.0| 3.796598221363567|
|16.0|14.556700709207545|16.590994893460135|
|27.0|27.201042858638854| 27.41702284677261|
|64.0| 66.80354453415212| 63.83184414427822|
|69.0|  69.0006978029391| 68.75276594123844|
|72.0|              72.0| 71.70531901941456|
+----+------------------+------------------+

