In [None]:
# `spark` must be a SparkSession (the cells below call `spark.read`), not an
# imported module. getOrCreate() reuses an already-running session if the
# environment provides one, and starts a new one otherwise.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

In [None]:
# Read the semicolon-separated CSV; the first row supplies the column names
# and column types are inferred from the data.
df = spark.read.csv(
    '.data/datos.csv',
    sep=';',
    header=True,
    inferSchema=True,
)
df

In [None]:
# Discard every row that has a null in at least one column.
df = df.dropna(how='any')
df

Out[10]: DataFrame[Name: string, Tiempo_en_hospital_horas: double, Edad: double, Facturacion: double, Hospital: string]

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Combine the two predictor columns into one vector column, which is the
# input format the regression stage reads via `featuresCol`.
feature_assembler = VectorAssembler(
    inputCols=['Edad', 'Tiempo_en_hospital_horas'],
    outputCol='Independent features',
)
output = feature_assembler.transform(df)

In [None]:
# Preview the assembled frame: original columns plus 'Independent features'.
output.show()

+--------+------------------------+----+-----------+------------------+--------------------+
|    Name|Tiempo_en_hospital_horas|Edad|Facturacion|          Hospital|Independent features|
+--------+------------------------+----+-----------+------------------+--------------------+
|   Laura|                    68.0|27.0|     9600.0|  Hospital Alberto|         [27.0,68.0]|
|   Jorge|                    42.0|39.0|     6200.0|  Hospital Alberto|         [39.0,42.0]|
|   Pablo|                    44.0|61.0|     6100.0|  Hospital Alberto|         [61.0,44.0]|
|   Mario|                    46.0|46.0|     6500.0|Hospital Alejandro|         [46.0,46.0]|
|  Marcos|                    68.0|27.0|     9200.0|     Hospital Jhon|         [27.0,68.0]|
|   María|                    61.0|36.0|     9500.0|Hospital Alejandro|         [36.0,61.0]|
|    Alba|                    45.0|62.0|     6100.0|Hospital Alejandro|         [62.0,45.0]|
|Estrella|                    65.0|62.0|     8200.0|Hospital Alejandro

In [None]:
# Keep only what the regression needs: the feature vector and the
# 'Facturacion' column used as the label.
finalized_data = output.select(['Independent features', 'Facturacion'])
finalized_data.show()

+--------------------+-----------+
|Independent features|Facturacion|
+--------------------+-----------+
|         [27.0,68.0]|     9600.0|
|         [39.0,42.0]|     6200.0|
|         [61.0,44.0]|     6100.0|
|         [46.0,46.0]|     6500.0|
|         [27.0,68.0]|     9200.0|
|         [36.0,61.0]|     9500.0|
|         [62.0,45.0]|     6100.0|
|         [62.0,65.0]|     8200.0|
|         [30.0,65.0]|     8500.0|
|         [39.0,40.0]|     5300.0|
|         [46.0,50.0]|     8000.0|
|         [33.0,67.0]|     9000.0|
|         [41.0,65.0]|     8400.0|
|         [71.0,44.0]|     6900.0|
|         [53.0,45.0]|     7900.0|
|         [59.0,66.0]|     8400.0|
|         [74.0,47.0]|     5400.0|
|         [40.0,42.0]|     7700.0|
|         [53.0,46.0]|     5100.0|
|         [60.0,65.0]|     8400.0|
+--------------------+-----------+



In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Hold out roughly a quarter of the rows for evaluation. The fixed seed keeps
# the split -- and so the fitted coefficients and the error metrics --
# repeatable across runs on the same input data.
train, test = finalized_data.randomSplit([0.75, 0.25], seed=42)

# The estimator gets its own name so `regressor` is only ever the fitted
# model; re-running this cell then always starts from an unfitted estimator.
linear_regression = LinearRegression(
    featuresCol='Independent features',
    labelCol='Facturacion',
)
regressor = linear_regression.fit(train)

In [None]:
# Fitted weights, one per entry of the feature vector
# (assembled from 'Edad' and 'Tiempo_en_hospital_horas', in that order).
regressor.coefficients

Out[22]: DenseVector([-17.2295, 112.2996])

In [None]:
# Constant term of the fitted linear model.
regressor.intercept

Out[23]: 2168.830196321775

In [None]:
# Score the held-out rows. Despite its name, `prediction` is an evaluation
# summary object: the cells below read its `.predictions` frame and its
# error-metric attributes.
prediction = regressor.evaluate(test)

In [None]:
# Show the held-out rows alongside the model's predicted 'Facturacion'.
prediction.predictions.show()

+--------------------+-----------+-----------------+
|Independent features|Facturacion|       prediction|
+--------------------+-----------+-----------------+
|         [40.0,42.0]|     7700.0|6196.233321630829|
|         [41.0,65.0]|     8400.0|8761.895410704592|
|         [46.0,50.0]|     8000.0|6991.253171684579|
+--------------------+-----------+-----------------+



In [None]:
# Test-set error as a (mean absolute error, mean squared error) tuple.
prediction.meanAbsoluteError, prediction.meanSquaredError

Out[26]: (958.1363057963948, 1136617.5582996393)