In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a Spark session running in local mode on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("T-systems: cuarta sesion. MLib")
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Location of the results CSV. Set the RESULTS_CSV environment variable to run
# the notebook on another machine; the default is the original local path.
import os

results_csv_path = os.environ.get(
    "RESULTS_CSV",
    "C:\\Users\\ruben\\Desktop\\data engineering\\s5-pysparkML\\results.csv",
)

# header=True takes the column names from the first row. No schema is given or
# inferred, so every column is loaded as a string.
df = spark.read.csv(results_csv_path, header = True)

In [3]:
# Preview the first five rows of the raw data.
df.show(n = 5)

+----------+----------+----------+----------+----+----+----+----+-------+-----+
|Question 1|Question 2|Question 3|Question 4|Ex01|Ex02|Ex03|Ex04|Project|Total|
+----------+----------+----------+----------+----+----+----+----+-------+-----+
|        18|        16|        23|        22| 100|  85|  80|  70|     80|   81|
|        20|        13|        11|        22| 100|  85|  80|  90|     93|   79|
|         7|         7|         0|         2| 100| 100|  85|  30|     70|   42|
|        14|        16|         4|        13|  95|  95| 100|  55|     87|   67|
|         4|         6|         0|         0|  65|  95|  65|  25|     70|   39|
+----------+----------+----------+----------+----+----+----+----+-------+-----+
only showing top 5 rows



In [4]:
# Goal: predict Total from Ex01..Ex04 and Project.

In [5]:
import pyspark.sql.functions as sql_f

In [6]:
# Inspect the column types of the freshly loaded DataFrame.
df.printSchema()

root
 |-- Question 1: string (nullable = true)
 |-- Question 2: string (nullable = true)
 |-- Question 3: string (nullable = true)
 |-- Question 4: string (nullable = true)
 |-- Ex01: string (nullable = true)
 |-- Ex02: string (nullable = true)
 |-- Ex03: string (nullable = true)
 |-- Ex04: string (nullable = true)
 |-- Project: string (nullable = true)
 |-- Total: string (nullable = true)



In [7]:
# Keep only the exercise/project scores and the Total target, casting each
# column to float.
numeric_columns = ['Ex01','Ex02','Ex03','Ex04','Project','Total']
df = df.select(*[sql_f.col(column_name).cast('float') for column_name in numeric_columns])

In [8]:
# Check the schema again after the select/cast step.
df.printSchema()

root
 |-- Ex01: float (nullable = true)
 |-- Ex02: float (nullable = true)
 |-- Ex03: float (nullable = true)
 |-- Ex04: float (nullable = true)
 |-- Project: float (nullable = true)
 |-- Total: float (nullable = true)



In [9]:
# Random 70/30 train/test split with a fixed seed.
train_df, test_df = df.randomSplit(weights = [0.7, 0.3], seed = 12345678)

In [10]:
# Row counts of the two splits, as a (train, test) tuple.
tuple(split.count() for split in (train_df, test_df))

(39, 22)

In [11]:
from pyspark.ml.regression import LinearRegression 

In [12]:
# Linear regression estimator with maxIter=10 and regParam=0.3.
lr = LinearRegression().setMaxIter(10).setRegParam(0.3)

In [13]:
# Show the class of the estimator object created above.
type(lr)

pyspark.ml.regression.LinearRegression

In [14]:
# Deliberate failure: train_df has no "features" column yet, so fit() is
# rejected. The error is caught and printed so that a full
# "Restart & Run All" continues past this cell instead of stopping here.
from pyspark.sql.utils import IllegalArgumentException

try:
    model = lr.fit(train_df)
except IllegalArgumentException as fit_error:
    print(f"{type(fit_error).__name__}: {fit_error}")

IllegalArgumentException: features does not exist. Available: Ex01, Ex02, Ex03, Ex04, Project, Total

In [17]:
from pyspark.ml.linalg import Vectors
# Toy training set: four identical (features, label) rows, enough to show the
# input layout that fit() expects.
simple_training = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.1, 0.1]), 1.0) for _row in range(4)],
    ['features', 'label'],
)

In [18]:
# Fit the estimator on the toy data, which has the features/label columns.
model = lr.fit(dataset = simple_training)

In [19]:
# Toy test set with the same four identical (features, label) rows.
simple_test = spark.createDataFrame(
    [(Vectors.dense([0.0, 1.1, 0.1]), 1.0) for _row in range(4)],
    ['features', 'label'],
)

In [20]:
# Score the toy test rows with the fitted model.
pred = model.transform(dataset = simple_test)

In [21]:
pred.show() # transform() adds a new "prediction" column to the DataFrame

+-------------+-----+----------+
|     features|label|prediction|
+-------------+-----+----------+
|[0.0,1.1,0.1]|  1.0|       1.0|
|[0.0,1.1,0.1]|  1.0|       1.0|
|[0.0,1.1,0.1]|  1.0|       1.0|
|[0.0,1.1,0.1]|  1.0|       1.0|
+-------------+-----+----------+



In [22]:
# Look at the first two rows of the training split.
train_df.show(n = 2)

+----+----+----+----+-------+-----+
|Ex01|Ex02|Ex03|Ex04|Project|Total|
+----+----+----+----+-------+-----+
|60.0|80.0|70.0|40.0|    0.0| 26.0|
|60.0|85.0|60.0|20.0|   70.0| 48.0|
+----+----+----+----+-------+-----+
only showing top 2 rows



In [23]:
from pyspark.ml.feature import VectorAssembler

In [24]:
# Predictor columns that will be packed into the feature vector.
columnas = [
    'Ex01',
    'Ex02',
    'Ex03',
    'Ex04',
    'Project',
]

In [25]:
# Transformer that packs the predictor columns into one "features" vector column.
assembler = VectorAssembler(
    outputCol = "features",
    inputCols = columnas,
)

In [26]:
# Training split with the assembled "features" column appended.
df2 = assembler.transform(dataset = train_df)

In [27]:
# Show three assembled rows without truncating the vector column.
df2.show(n = 3, truncate = False)

+----+----+----+----+-------+-----+--------------------------+
|Ex01|Ex02|Ex03|Ex04|Project|Total|features                  |
+----+----+----+----+-------+-----+--------------------------+
|60.0|80.0|70.0|40.0|0.0    |26.0 |[60.0,80.0,70.0,40.0,0.0] |
|60.0|85.0|60.0|20.0|70.0   |48.0 |[60.0,85.0,60.0,20.0,70.0]|
|65.0|90.0|90.0|85.0|69.0   |65.0 |[65.0,90.0,90.0,85.0,69.0]|
+----+----+----+----+-------+-----+--------------------------+
only showing top 3 rows



In [28]:
# Point the estimator at "Total" as its label column. This modifies `lr` in
# place, and the returned estimator is what the cell displays.
lr.setLabelCol(value = "Total")

LinearRegression_16a5e990f186

In [29]:
# Fit the regression on the assembled training data.
model = lr.fit(dataset = df2)

In [30]:
# Assemble the test rows into feature vectors, score them with the fitted
# model, and preview five predictions without truncating columns.
test_features = assembler.transform(test_df)
pred = model.transform(test_features)
pred.show(n = 5, truncate = False)

+----+----+-----+-----+-------+-----+---------------------------+-----------------+
|Ex01|Ex02|Ex03 |Ex04 |Project|Total|features                   |prediction       |
+----+----+-----+-----+-------+-----+---------------------------+-----------------+
|60.0|85.0|75.0 |85.0 |70.0   |53.0 |[60.0,85.0,75.0,85.0,70.0] |63.28826648908446|
|70.0|85.0|65.0 |100.0|65.0   |64.0 |[70.0,85.0,65.0,100.0,65.0]|66.04627665225328|
|80.0|80.0|65.0 |60.0 |55.0   |62.0 |[80.0,80.0,65.0,60.0,55.0] |54.0656664622132 |
|80.0|85.0|100.0|80.0 |92.0   |81.0 |[80.0,85.0,100.0,80.0,92.0]|76.519676196518  |
|80.0|90.0|90.0 |90.0 |67.0   |42.0 |[80.0,90.0,90.0,90.0,67.0] |68.23885473124211|
+----+----+-----+-----+-------+-----+---------------------------+-----------------+
only showing top 5 rows



In [31]:
from pyspark.ml import Pipeline

In [32]:
# Chain feature assembly and the regression into one pipeline, then fit the
# whole pipeline directly on the un-assembled training split.
pipeline = Pipeline().setStages([assembler, lr])
pipeline_model = pipeline.fit(dataset = train_df)

In [33]:
# Score the raw test split; the fitted pipeline runs the assembler itself.
pred = pipeline_model.transform(dataset = test_df)

In [34]:
# Preview five pipeline predictions without truncating columns.
pred.show(n = 5, truncate = False)

+----+----+-----+-----+-------+-----+---------------------------+-----------------+
|Ex01|Ex02|Ex03 |Ex04 |Project|Total|features                   |prediction       |
+----+----+-----+-----+-------+-----+---------------------------+-----------------+
|60.0|85.0|75.0 |85.0 |70.0   |53.0 |[60.0,85.0,75.0,85.0,70.0] |63.28826648908446|
|70.0|85.0|65.0 |100.0|65.0   |64.0 |[70.0,85.0,65.0,100.0,65.0]|66.04627665225328|
|80.0|80.0|65.0 |60.0 |55.0   |62.0 |[80.0,80.0,65.0,60.0,55.0] |54.0656664622132 |
|80.0|85.0|100.0|80.0 |92.0   |81.0 |[80.0,85.0,100.0,80.0,92.0]|76.519676196518  |
|80.0|90.0|90.0 |90.0 |67.0   |42.0 |[80.0,90.0,90.0,90.0,67.0] |68.23885473124211|
+----+----+-----+-----+-------+-----+---------------------------+-----------------+
only showing top 5 rows



In [36]:
# Predicted vs. actual totals side by side for five test rows.
prediction_vs_actual = pred.select("prediction", "Total")
prediction_vs_actual.show(n = 5)


+-----------------+-----+
|       prediction|Total|
+-----------------+-----+
|63.28826648908446| 53.0|
|66.04627665225328| 64.0|
| 54.0656664622132| 62.0|
|  76.519676196518| 81.0|
|68.23885473124211| 42.0|
+-----------------+-----+
only showing top 5 rows



In [35]:

from pyspark.ml.evaluation import RegressionEvaluator

In [37]:
# Evaluator that scores the "prediction" column against the "Total" label
# using the RMSE metric.
evaluator_settings = {
    "metricName": "rmse",
    "labelCol": "Total",
    "predictionCol": "prediction",
}
rmse_evaluator = RegressionEvaluator(**evaluator_settings)

### Ejercicio: calcular el RMSE manualmente con sql_f.pow, sql_f.mean, sql_f.sqrt y agg()

In [38]:
# RMSE of the current predictions.
rmse_evaluator.evaluate(dataset = pred)

10.909428387895298

### Ejercicio: ampliar el pipeline con StandardScaler (assembler -> StandardScaler -> lr), calcular el error y comparar cuál funciona mejor

In [39]:
from pyspark.ml.evaluation import RegressionEvaluator

In [40]:
# RMSE evaluator comparing the 'prediction' column with the 'Total' label
# (same configuration as the evaluator defined earlier in the notebook).
rmse_evaluator = RegressionEvaluator(
    predictionCol = 'prediction',
    labelCol = 'Total',
    metricName = 'rmse',
)

In [42]:
# RMSE of the current predictions.
rmse_evaluator.evaluate(dataset = pred)

10.909428387895298

In [43]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [48]:
# Search grid: 2 regParam values x 4 maxIter values = 8 parameter combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.5, 0.1])
    .addGrid(lr.maxIter, [1,5,10,20])
    .build()
)

In [47]:
# 3-fold cross-validation of the whole pipeline over the parameter grid, scored
# with the RMSE evaluator; the seed fixes how rows are assigned to folds.
cv_settings = dict(
    estimator = pipeline,
    estimatorParamMaps = paramGrid,
    evaluator = rmse_evaluator,
    numFolds = 3,
    seed = 12345678,
)
crossval = CrossValidator(**cv_settings)

In [53]:
# Run the cross-validated grid search on the training split.
cv_model = crossval.fit(dataset = train_df)

In [56]:
# Score the test split with the cross-validated model.
pred = cv_model.transform(dataset = test_df)

In [57]:
# RMSE of the cross-validated model's predictions on the test split.
rmse_evaluator.evaluate(dataset = pred)

10.905107498598843

In [58]:
# The best model is a fitted pipeline; index 1 is the regression stage
# (index 0 is the assembler). Pull out its parameter map.
best_pipeline_model = cv_model.bestModel
bestLRModel = best_pipeline_model.stages[1]
bestParams = bestLRModel.extractParamMap()

In [59]:
# Display the parameter map extracted from the best regression stage.
bestParams

{Param(parent='LinearRegression_16a5e990f186', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LinearRegression_16a5e990f186', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
 Param(parent='LinearRegression_16a5e990f186', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35,
 Param(parent='LinearRegression_16a5e990f186', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_16a5e990f186', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_16a5e990f186', name='labelCol', doc='label column name.'): 'Total',
 Param(parent='LinearRegression_16a5e990f186', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'): 'squaredError

In [60]:
from pyspark.ml.tuning import TrainValidationSplit

In [62]:
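# Disabled alternative to CrossValidator: TrainValidationSplit with the same estimator, grid, evaluator and seed.
# crossval = TrainValidationSplit(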
#     estimator = pipeline,
#     estimatorParamMaps = paramGrid,
#     evaluator = rmse_evaluator,
#     seed = 12345678,
# )