## MLlib 

MLlib es la librería de Machine Learning de PySpark. Su API es similar a la de scikit-learn, pero trabaja sobre DataFrames de Spark.

In [1]:
import seaborn as sns
import pandas as pd

In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName('regresion_diamonds')
spark = session_builder.getOrCreate()

# Load seaborn's "diamonds" dataset and convert it into a Spark DataFrame.
diamonds_pd = sns.load_dataset('diamonds')
df = spark.createDataFrame(diamonds_pd)
df.show(5)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows



In [3]:
# Import the linear regression estimator and print its built-in documentation
# to review the constructor parameters and their defaults
# (e.g. featuresCol='features', labelCol='label', predictionCol='prediction').
from pyspark.ml.regression import LinearRegression
help(LinearRegression)

Help on class LinearRegression in module pyspark.ml.regression:

class LinearRegression(_JavaRegressor, _LinearRegressionParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  LinearRegression(*, featuresCol: str = 'features', labelCol: str = 'label', predictionCol: str = 'prediction', maxIter: int = 100, regParam: float = 0.0, elasticNetParam: float = 0.0, tol: float = 1e-06, fitIntercept: bool = True, standardization: bool = True, solver: str = 'auto', weightCol: Optional[str] = None, aggregationDepth: int = 2, loss: str = 'squaredError', epsilon: float = 1.35, maxBlockSizeInMB: float = 0.0)
 |  
 |  Linear regression.
 |  
 |  The learning objective is to minimize the specified loss function, with regularization.
 |  This supports two kinds of loss:
 |  
 |  * squaredError (a.k.a squared loss)
 |  * huber (a hybrid of squared error for relatively small errors and absolute error for     relatively large ones, and we estimate the scale parameter from training data

In [None]:
# Pack the numeric input columns into a single vector column.
from pyspark.ml.feature import VectorAssembler

# The order of this list is the order of the values inside each feature vector.
feature_cols = ['carat', 'depth', 'x', 'y', 'z', 'table']

# The output column is named 'features' so it matches the default featuresCol
# expected by the estimators used below.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_assembled = assembler.transform(df)
df_assembled.show(3)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|            features|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|[0.23,61.5,3.95,3...|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|[0.21,59.8,3.89,3...|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|[0.23,56.9,4.05,4...|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
only showing top 3 rows



In [None]:
# Keep only the two columns needed for modelling:
#   - input  "X" -> 'features' (the assembled vector)
#   - target "y" -> 'label'    (the diamond price, renamed)
price_as_label = df_assembled['price'].alias('label')
df_features_label = df_assembled.select('features', price_as_label)
df_features_label.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.23,61.5,3.95,3...|  326|
|[0.21,59.8,3.89,3...|  326|
|[0.23,56.9,4.05,4...|  327|
+--------------------+-----+
only showing top 3 rows



In [9]:
# Train/test partition: 80% of the rows for fitting, 20% held out for evaluation.
# The seed is fixed so the split is repeatable.
train_fraction, test_fraction = 0.8, 0.2
splits = df_features_label.randomSplit([train_fraction, test_fraction], seed=42)
df_train, df_test = splits

In [11]:
# Fit a linear regression with all default parameters on the training split.
lr = LinearRegression()
model = lr.fit(df_train)
# Score the held-out test split; transform() adds a 'prediction' column
# next to the existing 'features' and 'label' columns.
df_pred = model.transform(df_test)
# NOTE(review): the rows displayed include a negative predicted price, so this
# unconstrained linear model is a poor fit for the cheapest diamonds.
df_pred.show(4)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.22,59.3,3.91,3...|  404|2.9361297378709423|
|[0.23,56.9,4.05,4...|  327|120.70671988210961|
|[0.23,59.4,4.0,4....|  338| 86.91024750544966|
|[0.23,60.5,3.96,3...|  357|-88.79676058197947|
+--------------------+-----+------------------+
only showing top 4 rows



In [13]:
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator per regression metric. No column names are passed, so each one
# relies on the evaluator defaults matching the 'label' / 'prediction' columns
# of df_pred. The four names are reused by the model cells further down.
evaluator_r2, evaluator_mae, evaluator_mse, evaluator_rmse = (
    RegressionEvaluator(metricName=metric) for metric in ('r2', 'mae', 'mse', 'rmse')
)

# Report every metric for the current predictions (the linear regression).
for metric_label, metric_evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_label, metric_evaluator.evaluate(df_pred))

r2 0.859649017378561
mae 879.5043945810185
mse 2165793.592478833
rmse 1471.6635459502397


In [18]:
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, GBTRegressor

In [None]:
# Single decision tree with default hyperparameters: fit on the training split
# and predict on the held-out test split.
tree = DecisionTreeRegressor()
model = tree.fit(df_train)
df_pred = model.transform(df_test)

# Same four metrics as for the linear regression, for a direct comparison.
for metric_label, metric_evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_label, metric_evaluator.evaluate(df_pred))

r2 0.8762063316673359
mae 792.0061850657814
mse 1910293.2423885928
rmse 1382.133583409575


In [None]:
# Random forest of 200 trees, fit on the training split and scored on the
# held-out test split.
# The seed is pinned (same value as the train/test split) because the forest
# uses random bootstrap sampling and random feature subsetting; without an
# explicit seed the metrics printed below can change from one run to the next.
# Re-run this cell to refresh the recorded metrics after this change.
rf = RandomForestRegressor(numTrees=200, seed=42)
model = rf.fit(df_train)
df_pred = model.transform(df_test)
print('r2', evaluator_r2.evaluate(df_pred))
print('mae', evaluator_mae.evaluate(df_pred))
print('mse', evaluator_mse.evaluate(df_pred))
print('rmse', evaluator_rmse.evaluate(df_pred))

r2 0.8780712896102745
mae 789.0473691260198
mse 1881514.5770196922
rmse 1371.68311829653


In [None]:
# Gradient-boosted trees with default hyperparameters: fit on the training
# split and predict on the held-out test split.
gbt = GBTRegressor()
model = gbt.fit(df_train)
df_pred = model.transform(df_test)

# Same four metrics as for the previous models, for a direct comparison.
for metric_label, metric_evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_label, metric_evaluator.evaluate(df_pred))

r2 0.8819099139029632
mae 773.688666981911
mse 1822279.7377491826
rmse 1349.9184189235966
