In [None]:

from pyspark.sql import SparkSession

# @hidden_cell
# This function sets up Spark's access to your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
def set_hadoop_config_with_credentials_b3df59f4cfa4437ba5a0d8341462d910(name):
    """Write the Swift object-storage connection settings into Spark's Hadoop configuration.

    Every key is stored under ``fs.swift.service.<name>``. The Hadoop
    configuration object is obtained from the notebook-global ``sc``, which
    must already exist when this function is called.

    NOTE: the tenant, username and password are embedded below as literals.

    Args:
        name: service label appended to ``fs.swift.service.`` to build the
            key prefix.

    Returns:
        None. The configuration is modified in place.
    """
    key_base = 'fs.swift.service.' + name
    hadoop_conf = sc._jsc.hadoopConfiguration()

    # (setter method, key suffix, value) -- applied in this exact order.
    settings = (
        ('set', '.auth.url', 'https://identity.open.softlayer.com'+'/v3/auth/tokens'),
        ('set', '.auth.endpoint.prefix', 'endpoints'),
        ('set', '.tenant', 'aef925cbc55c424b9be33df912da34c9'),
        ('set', '.username', '482344af2f77465fb158947814a1d548'),
        ('set', '.password', 'ExnZeWW7-2,.,S#t'),
        ('setInt', '.http.port', 8080),
        ('set', '.region', 'dallas'),
        ('setBoolean', '.public', False),
    )
    for setter_name, suffix, value in settings:
        # Typed setters (setInt / setBoolean) are looked up by name on the conf object.
        getattr(hadoop_conf, setter_name)(key_base + suffix, value)

# Any label can be used; the same value goes into the Hadoop config keys
# and into the swift:// URL below.
name = 'keystone'
set_hadoop_config_with_credentials_b3df59f4cfa4437ba5a0d8341462d910(name)

spark = SparkSession.builder.getOrCreate()

# Read white.csv from object storage, using the first row as the header
# and letting Spark infer the column types.
df_data_1 = (
    spark.read
    .format('org.apache.spark.sql.execution.datasources.csv.CSVFileFormat')
    .option('header', 'true')
    .option('inferschema', 'true')
    .load('swift://DefaultProjectsushidharjayaramanmavsutaedu.' + name + '/white.csv')
)
df_data_1.take(5)


In [None]:
# Stratified down-sampling on `quality`: every 'High' row is kept, while 'Low'
# and 'Medium' rows are sampled at 1060/1640 and 1060/2198 respectively.
# NOTE(review): 1060 / 1640 / 2198 look like the per-class row counts of
# white.csv, which would make this a class-balancing step -- confirm.
#
# The seed is fixed (same value as the randomSplit seed used for the
# train/test split) so the sample, and everything cached or derived from it,
# is identical on every Restart & Run All instead of changing per run.
stratified_CV_data = df_data_1.sampleBy(
    'quality',
    fractions={'Low': 1060./1640, 'High': 1.0, 'Medium': 1060./2198},
    seed=100
).cache()

# Per-class row counts after sampling.
stratified_CV_data.groupby('quality').count().toPandas()

In [None]:
# Remove two feature columns from the stratified sample and cache the result.
# Active choice: 'free sulfur dioxide' + 'density'.
# Other pairs that were tried here: 'total sulfur dioxide' + 'density',
# 'total sulfur dioxide' + 'residual sugar', 'free sulfur dioxide' + 'residual sugar'.
final_CV_data = (
    stratified_CV_data
    .drop('free sulfur dioxide')
    .drop('density')
    .cache()
)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
# Encode the string `quality` column as a numeric `qualityIndex`, then drop
# the original column and cache the indexed frame.
indexer = StringIndexer(inputCol="quality", outputCol="qualityIndex")
indexed = (
    indexer.fit(final_CV_data)
    .transform(final_CV_data)
    .drop('quality')
    .cache()
)
indexed.show()

In [None]:

# --- Feature pipeline: index the label, then assemble the feature vector ---

# `quality` (string) -> numeric `label`.
label_stringIdx = StringIndexer(inputCol="quality", outputCol="label")

# All eleven numeric columns go into the feature vector. Earlier experiments
# used nine-column subsets that left out two of: 'residual sugar', 'density',
# 'total sulfur dioxide', 'free sulfur dioxide'.
numericCols = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "total sulfur dioxide",
    "free sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
assembler = VectorAssembler(inputCols=numericCols, outputCol="features")

# Stages run in list order.
stages = [label_stringIdx, assembler]
pipeline = Pipeline(stages=stages)

# Fit and apply the pipeline on the full loaded frame (df_data_1).
cols = df_data_1.columns
pipelineModel = pipeline.fit(df_data_1)

# Keep label + features in front, followed by every original column.
selectedcols = ["label", "features"] + cols
dataset = pipelineModel.transform(df_data_1).select(selectedcols)
dataset.toPandas()

In [None]:
# Import LinearRegression class
from pyspark.ml.regression import LinearRegression
# 70% / 30% train/test split, seeded with 100.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)

# A single LinearRegression estimator is fitted twice; only the
# regularization strength differs between the two models.
lr = LinearRegression()
modelA = lr.fit(trainingData, params={lr.regParam: 0.0})
modelB = lr.fit(trainingData, params={lr.regParam: 100.0})

In [None]:
# Score the test split with the regParam=0.0 model.
predictionsA = modelA.transform(dataset=testData)

In [None]:
# Score the test split with the regParam=100.0 model.
predictionsB = modelB.transform(dataset=testData)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
# RMSE evaluator (constructed with only metricName set), applied to modelA's predictions.
evaluator = RegressionEvaluator(metricName="rmse")
RMSE = evaluator.evaluate(dataset=predictionsA)
print("ModelA: Root Mean Squared Error = " + str(RMSE))

In [None]:

# Same evaluator, applied to modelB's predictions; RMSE is overwritten with the new value.
RMSE = evaluator.evaluate(dataset=predictionsB)
print("ModelB: Root Mean Squared Error = " + str(RMSE))