* Linear Regression

In [5]:
rentalsDF = spark.createDataFrame([
    ("Monday",1.5,358),
    ("Saturday",1.0,272),
    ("Saturday",0.5,390),
    ("Monday",3.0,120),
    ("Saturday",0.3,439),
    ("Monday",0.9,509),
    ("Saturday",1.9,102),
    ("Saturday",2.7,43),
    ("Monday",0.6,597),
],["weekDay","distanceCenter","rentals"])
rentalsTestDF = spark.createDataFrame([
    ("Monday",0.1,641),
    ("Saturday",2.1,129),
    ("Saturday",1.5,199),
    ("Monday",2.0,231),
    ("Sunday",0.5,393)
],["weekDay","distanceCenter","rentals"])

In [2]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import VectorAssembler

indexer = StringIndexer(inputCol="weekDay",
            outputCol="weekDayIndex", handleInvalid="keep")
indexerModel = indexer.fit(rentalsDF)
indexedDF=indexerModel.transform(rentalsDF)

va=VectorAssembler(inputCols=["weekDayIndex","distanceCenter"],
outputCol="features")
assembledDF=va.transform(indexedDF)

In [3]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(labelCol="rentals",featuresCol="features",maxIter=10)
# Fit the model
lrModel = lr.fit(assembledDF)
# Create the predictions
predictionDF=lrModel.transform(assembledDF)
predictionDF.show()

+--------+--------------+-------+------------+---------+------------------+
| weekDay|distanceCenter|rentals|weekDayIndex| features|        prediction|
+--------+--------------+-------+------------+---------+------------------+
|  Monday|           1.5|    358|         1.0|[1.0,1.5]| 396.0000000000001|
|Saturday|           1.0|    272|         0.0|[0.0,1.0]| 299.7777182645956|
|Saturday|           0.5|    390|         0.0|[0.0,0.5]|390.09507230851636|
|  Monday|           3.0|    120|         1.0|[1.0,3.0]| 125.0479378682378|
|Saturday|           0.3|    439|         0.0|[0.0,0.3]| 426.2220139260847|
|  Monday|           0.9|    509|         1.0|[1.0,0.9]|  504.380824852705|
|Saturday|           1.9|    102|         0.0|[0.0,1.9]|137.20648098553824|
|Saturday|           2.7|     43|         0.0|[0.0,2.7]|-7.301285484735047|
|  Monday|           0.6|    597|         1.0|[1.0,0.6]| 558.5712372790575|
+--------+--------------+-------+------------+---------+------------------+



In [4]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))
# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
trainingSummary.residuals.show()

Coefficients: [186.53963577932527,-180.63470808784155]
Intercept: 480.41242635243714
RMSE: 29.197016
+-------------------+
|          residuals|
+-------------------+
|-38.000000000000114|
|-27.777718264595592|
|-0.0950723085163645|
|-5.0479378682377956|
| 12.777986073915315|
|  4.619175147294982|
|-35.206480985538235|
|  50.30128548473505|
|  38.42876272094247|
+-------------------+



In [7]:
from pyspark.ml.evaluation import RegressionEvaluator
indexedTestDF=indexerModel.transform(rentalsTestDF)
assembledTestDF=va.transform(indexedTestDF)
predictionTestDF=lrModel.transform(assembledTestDF)
predictionTestDF.show()
# compute test error
evaluator = RegressionEvaluator(
 labelCol="rentals", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictionTestDF)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


+--------+--------------+-------+------------+---------+------------------+
| weekDay|distanceCenter|rentals|weekDayIndex| features|        prediction|
+--------+--------------+-------+------------+---------+------------------+
|  Monday|           0.1|    641|         1.0|[1.0,0.1]| 648.8885913229783|
|Saturday|           2.1|    129|         0.0|[0.0,2.1]|101.07953936796986|
|Saturday|           1.5|    199|         0.0|[0.0,1.5]|209.46036422067482|
|  Monday|           2.0|    231|         1.0|[1.0,2.0]|305.68264595607934|
|  Sunday|           0.5|    393|         2.0|[2.0,0.5]|  763.174343867167|
+--------+--------------+-------+------------+---------+------------------+

Root Mean Squared Error (RMSE) on test data = 169.445


* Decision tree regression

In [9]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
# Train a DecisionTree model.
dt = DecisionTreeRegressor(labelCol="rentals",featuresCol="features",maxDepth=4)
# Fit the model
dtModel = dt.fit(assembledDF)
# Predict output
predictionDF=dtModel.transform(assembledDF)
predictionDF.show()
# Compute test error
evaluator = RegressionEvaluator(
 labelCol="rentals", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictionDF)
print("Root Mean Squared Error (RMSE) on training data = %g" % rmse)

+--------+--------------+-------+------------+---------+----------+
| weekDay|distanceCenter|rentals|weekDayIndex| features|prediction|
+--------+--------------+-------+------------+---------+----------+
|  Monday|           1.5|    358|         1.0|[1.0,1.5]|     358.0|
|Saturday|           1.0|    272|         0.0|[0.0,1.0]|     272.0|
|Saturday|           0.5|    390|         0.0|[0.0,0.5]|     390.0|
|  Monday|           3.0|    120|         1.0|[1.0,3.0]|     120.0|
|Saturday|           0.3|    439|         0.0|[0.0,0.3]|     439.0|
|  Monday|           0.9|    509|         1.0|[1.0,0.9]|     509.0|
|Saturday|           1.9|    102|         0.0|[0.0,1.9]|     102.0|
|Saturday|           2.7|     43|         0.0|[0.0,2.7]|      43.0|
|  Monday|           0.6|    597|         1.0|[1.0,0.6]|     597.0|
+--------+--------------+-------+------------+---------+----------+

Root Mean Squared Error (RMSE) on training data = 0


In [10]:
predictionTestDF=lrModel.transform(assembledTestDF)
# compute test error
evaluator = RegressionEvaluator(
 labelCol="rentals", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictionTestDF)
print("Root Mean Squared Error (RMSE) on training data = %g" % rmse)

Root Mean Squared Error (RMSE) on training data = 169.445


* **Unsupervised learning: clustering**

In [23]:
data = spark.createDataFrame([
    (15000,1000,"Paolo"),
    (0,5000,"Luca"),
    (20000,800,"Martino"),
    (6000,1300,"Mike"),
    (50000,2500,"Francesca"),
    (2000,1100,"Steve"),
    (700,1500,"Maria"),
    (75000,0,"Guido"),
    (4000,500,"Roberta"),
    (7000,3000,"Idilio"),
    (3000,900,"Marco"),
    (6000,1200,"Dena"),
],["Savings","Income","User"])
dataNewDF = spark.createDataFrame([
    (10000,1860,"MARIANA"),
    (4500,1100,"Nicola"),
    (27000,1000,"Davide"),
],["Savings","Income","User"])

In [17]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
va=VectorAssembler(inputCols=["Savings","Income"],
                    outputCol="features")
assembledDF=va.transform(data)
scaler = StandardScaler(inputCol="features",
                    outputCol="scaledFeatures", withStd=True, withMean=True)
scalerModel = scaler.fit(assembledDF)
scaledDF=scalerModel.transform(assembledDF)
scaledDF.show()

+-------+------+---------+----------------+--------------------+
|Savings|Income|     User|        features|      scaledFeatures|
+-------+------+---------+----------------+--------------------+
|  15000|  1000|    Paolo|[15000.0,1000.0]|[-0.0312169519277...|
|      0|  5000|     Luca|    [0.0,5000.0]|[-0.6770849228476...|
|  20000|   800|  Martino| [20000.0,800.0]|[0.18407237171215...|
|   6000|  1300|     Mike| [6000.0,1300.0]|[-0.4187377344796...|
|  50000|  2500|Francesca|[50000.0,2500.0]|[1.47580831355180...|
|   2000|  1100|    Steve| [2000.0,1100.0]|[-0.5909691933916...|
|    700|  1500|    Maria|  [700.0,1500.0]|[-0.6469444175380...|
|  75000|     0|    Guido|   [75000.0,0.0]|[2.55225493175152...|
|   4000|   500|  Roberta|  [4000.0,500.0]|[-0.5048534639356...|
|   7000|  3000|   Idilio| [7000.0,3000.0]|[-0.3756798697517...|
|   3000|   900|    Marco|  [3000.0,900.0]|[-0.5479113286636...|
|   6000|  1200|     Dena| [6000.0,1200.0]|[-0.4187377344796...|
+-------+------+---------

* K-means clustering algorithm

In [20]:
from pyspark.ml.clustering import KMeans
# Trains a k-means model.
kmeans = KMeans(k=3,featuresCol="scaledFeatures",initMode="k-means||")
model = kmeans.fit(scaledDF)
# Make predictions
predictionsDF = model.transform(scaledDF)
predictionsDF.show()

+-------+------+---------+----------------+--------------------+----------+
|Savings|Income|     User|        features|      scaledFeatures|prediction|
+-------+------+---------+----------------+--------------------+----------+
|  15000|  1000|    Paolo|[15000.0,1000.0]|[-0.0312169519277...|         0|
|      0|  5000|     Luca|    [0.0,5000.0]|[-0.6770849228476...|         1|
|  20000|   800|  Martino| [20000.0,800.0]|[0.18407237171215...|         0|
|   6000|  1300|     Mike| [6000.0,1300.0]|[-0.4187377344796...|         0|
|  50000|  2500|Francesca|[50000.0,2500.0]|[1.47580831355180...|         2|
|   2000|  1100|    Steve| [2000.0,1100.0]|[-0.5909691933916...|         0|
|    700|  1500|    Maria|  [700.0,1500.0]|[-0.6469444175380...|         0|
|  75000|     0|    Guido|   [75000.0,0.0]|[2.55225493175152...|         2|
|   4000|   500|  Roberta|  [4000.0,500.0]|[-0.5048534639356...|         0|
|   7000|  3000|   Idilio| [7000.0,3000.0]|[-0.3756798697517...|         1|
|   3000|   

In [21]:
from pyspark.ml.evaluation import ClusteringEvaluator
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
print("Size of the clusters: ", model.summary.clusterSizes)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictionsDF)
print("Silhouette with squared euclidean distance = " + str(silhouette))
print("SSE: ",model.computeCost(predictionsDF))

Cluster Centers: 
[-0.37191231 -0.39159297]
[-0.5263824   1.80071098]
[ 2.01403162 -0.2343391 ]
Size of the clusters:  [8, 2, 2]
Silhouette with squared euclidean distance = -0.09641007707651222
SSE:  4.404936191705297


In [24]:
assembledNewDF=va.transform(dataNewDF)
scaledNewDF=scalerModel.transform(assembledNewDF)
# Make predictions
predictionsNewDF = model.transform(scaledNewDF)
predictionsNewDF.show()

+-------+------+-------+----------------+--------------------+----------+
|Savings|Income|   User|        features|      scaledFeatures|prediction|
+-------+------+-------+----------------+--------------------+----------+
|  10000|  1860|MARIANA|[10000.0,1860.0]|[-0.2465062755677...|         0|
|   4500|  1100| Nicola| [4500.0,1100.0]|[-0.4833245315716...|         0|
|  27000|  1000| Davide|[27000.0,1000.0]|[0.48547742480807...|         0|
+-------+------+-------+----------------+--------------------+----------+



* Gaussian mixture model

In [26]:
from pyspark.ml.clustering import GaussianMixture
# Trains a GMM model.
gmm = GaussianMixture(k=3,featuresCol="scaledFeatures")
model = gmm.fit(scaledDF)
# Make predictions
predictionsDF = model.transform(scaledDF)
predictionsDF.show(truncate=False)

+-------+------+---------+----------------+-------------------------------------------+----------+------------------------------------------------------------------+
|Savings|Income|User     |features        |scaledFeatures                             |prediction|probability                                                       |
+-------+------+---------+----------------+-------------------------------------------+----------+------------------------------------------------------------------+
|15000  |1000  |Paolo    |[15000.0,1000.0]|[-0.031216951927791736,-0.4193436518555962]|1         |[2.64050592481616E-16,0.9999999999999994,2.64050592481616E-16]    |
|0      |5000  |Luca     |[0.0,5000.0]    |[-0.6770849228476208,2.5407291847721423]   |2         |[2.473983276643728E-21,2.473983276643728E-21,1.0]                 |
|20000  |800   |Martino  |[20000.0,800.0] |[0.18407237171215127,-0.5673472936869831]  |1         |[9.232934078192929E-16,0.9999999999999981,9.232934078192929E-16]  |
|600

In [27]:
print("Gaussians weights shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)
print("Size of the clusters: ", model.summary.clusterSizes)
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictionsDF)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Gaussians weights shown as a DataFrame: 
+-----------------------------------------+---------------------------------------------------------------------------------------------+
|mean                                     |cov                                                                                          |
+-----------------------------------------+---------------------------------------------------------------------------------------------+
|[0.5500642219000528,0.8756882141690369]  |0.8570021232281327    -0.17126687121043363  
-0.17126687121043363  0.03422668436774328       |
|[-0.37191230658800156,-0.391592969012211]|0.07523188967303805    -0.019421862342788105  
-0.019421862342788105  0.046462724029206745   |
|[0.9375850044519495,0.6906836618798035]  |2.6071589741256043  -2.98721286994946  
-2.98721286994946   3.422668436773975                |
+-----------------------------------------+----------------------------------------------------------------------------------------

In [29]:
assembledNewDF=va.transform(dataNewDF)
scaledNewDF=scalerModel.transform(assembledNewDF)
# Make predictions
predictionsNewDF = model.transform(scaledNewDF)
predictionsNewDF.show()

+-------+------+-------+----------------+--------------------+----------+--------------------+
|Savings|Income|   User|        features|      scaledFeatures|prediction|         probability|
+-------+------+-------+----------------+--------------------+----------+--------------------+
|  10000|  1860|MARIANA|[10000.0,1860.0]|[-0.2465062755677...|         1|[1.84461323121284...|
|   4500|  1100| Nicola| [4500.0,1100.0]|[-0.4833245315716...|         1|[1.27379113530588...|
|  27000|  1000| Davide|[27000.0,1000.0]|[0.48547742480807...|         1|[2.43128540013928...|
+-------+------+-------+----------------+--------------------+----------+--------------------+

