In [21]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [22]:
data_path = "/Users/Michavillson/Documents/PROJECTS/DS340W-Group10-FA19/sources/ML_model/output/ml_data.csv"

In [23]:
spark = SparkSession.builder.appName('epig_data_aggregation').getOrCreate()
df = spark.read.csv(data_path, header=True, inferSchema=True)
df.printSchema()

root
 |-- avo: double (nullable = true)
 |-- cur: double (nullable = true)
 |-- cell: string (nullable = true)
 |-- mark: string (nullable = true)
 |-- ideas: integer (nullable = true)
 |-- valid: double (nullable = true)



In [24]:
categoricalColumns = ['cell', 'mark', 'ideas']
stages = []
for categoricalColumn in categoricalColumns: 
    stringIndexer = StringIndexer(inputCol=categoricalColumn, outputCol=categoricalColumn+'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalColumn+"classVec"])
    stages += [stringIndexer, encoder]

label_stringIdx = StringIndexer(inputCol='valid', outputCol='label')
stages += [label_stringIdx]

numericCols = ['avo', 'cur']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [25]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
cols = df.columns
selectedCols = ['label', 'features'] + cols
df = df.select(selectedCols)
df.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- avo: double (nullable = true)
 |-- cur: double (nullable = true)
 |-- cell: string (nullable = true)
 |-- mark: string (nullable = true)
 |-- ideas: integer (nullable = true)
 |-- valid: double (nullable = true)
 |-- cellIndex: double (nullable = false)
 |-- cellclassVec: vector (nullable = true)
 |-- markIndex: double (nullable = false)
 |-- markclassVec: vector (nullable = true)
 |-- ideasIndex: double (nullable = false)
 |-- ideasclassVec: vector (nullable = true)
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)



In [27]:
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
label,0,0,0,0,0
features,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
avo,0.246995,0.248117,0.248174,0.247927,0.248312
cur,0.180174,0.180174,0.180174,0.180174,0.180174
cell,C46,C46,C46,C46,C46
mark,M03,M03,M03,M03,M03
ideas,0,92,0,0,0
valid,0.22644,0.22644,0.22644,0.22644,0.22644
cellIndex,1,1,1,1,1
cellclassVec,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"
