In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [2]:
# Display the SparkSession created above to confirm it started correctly.
spark

In [34]:
# Load the cruise-ship CSV: the first row supplies the column names and the
# column types are inferred from the data.
cruise_info = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("cruise_ship_info.csv")
)
cruise_info.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [35]:
# To see which columns relate most strongly to crew (the target we want to
# predict, i.e. the dependent variable), compute the Pearson correlation of
# each candidate column with crew.
from pyspark.sql.functions import corr

# Cruise_line is a string column, so its correlation comes back null; it is
# kept in the list to show why it has to be converted to a number first.
candidate_columns = [
    'Cruise_line',
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]

# One small table per column, each headed "<column> correlation".
for column_name in candidate_columns:
    cruise_info.select(
        corr(column_name, 'crew').alias(f"{column_name} correlation")
    ).show()

+-----------------------+
|Cruise_line correlation|
+-----------------------+
|                   null|
+-----------------------+

+-------------------+
|    Age correlation|
+-------------------+
|-0.5306565039638852|
+-------------------+

+-------------------+
|Tonnage correlation|
+-------------------+
| 0.9275688115449388|
+-------------------+

+----------------------+
|passengers correlation|
+----------------------+
|    0.9152341306065384|
+----------------------+

+------------------+
|length correlation|
+------------------+
| 0.895856627101658|
+------------------+

+------------------+
|cabins correlation|
+------------------+
|0.9508226063578497|
+------------------+

+-----------------------------+
|passenger_density correlation|
+-----------------------------+
|         -0.15550928421699717|
+-----------------------------+



In [36]:
# The previous result shows that a correlation cannot be computed between a
# string column and a numeric one, so Spark's StringIndexer is used to map each
# Cruise_line string to a numeric index in a new Cruise_line_indexer column.
# NOTE(review): the index is a code assigned by the indexer, not a measured
# quantity; confirm that treating it as a numeric feature is intended.
from pyspark.ml.feature import StringIndexer

index = (
    StringIndexer()
    .setInputCol('Cruise_line')
    .setOutputCol('Cruise_line_indexer')
)
df = (
    index
    .fit(cruise_info)
    .transform(cruise_info)
)
df.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_indexer|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.5

In [37]:
# Pack every column the model should consider into a single vector column
# named features.
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=[
        'Cruise_line_indexer',
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
    ],
    outputCol='features',
)
output = assembler.transform(df)
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_indexer|            features|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|[16.0,6.0,30.2769...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|[16.0,6.0,30.2769...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|[1.0,26.0,47.262,...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|[1.0,11.0,110.0,2...|
|    Destiny|   Carnival| 17|     

In [38]:
# Keep only what the regression needs: the assembled features vector and crew,
# the value being predicted (the label / dependent variable).
final_data = output.select(['features', 'crew'])
final_data.show()

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
|[1.0,22.0,70.367,...| 9.2|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,23.0,70.367,...| 9.2|
|[1.0,19.0,70.367,...| 9.2|
|[1.0,6.0,110.2389...|11.5|
|[1.0,10.0,110.0,2...|11.6|
|[1.0,28.0,46.052,...| 6.6|
|[1.0,18.0,70.367,...| 9.2|
|[1.0,17.0,70.367,...| 9.2|
|[1.0,11.0,86.0,21...| 9.3|
|[1.0,8.0,110.0,29...|11.6|
|[1.0,9.0,88.5,21....|10.3|
|[1.0,15.0,70.367,...| 9.2|
|[1.0,12.0,88.5,21...| 9.3|
|[1.0,20.0,70.367,...| 9.2|
+--------------------+----+
only showing top 20 rows



In [39]:
# Split the data in two: roughly 70% for training and 30% for testing.
# randomSplit draws rows at random, so a fixed seed is passed to make the
# split, and every metric computed from it below, repeatable across runs.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [40]:
# Show summary statistics (count, mean, stddev, min, max) for each split,
# training set first and test set second.
for split in (train_data, test_data):
    split.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              114|
|   mean|7.828771929824567|
| stddev|3.553629614105743|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                44|
|   mean| 7.704545454545456|
| stddev|3.4085995586713036|
|    min|              0.59|
|    max|             13.13|
+-------+------------------+



In [41]:
# Configure a linear regression that predicts crew (the label / dependent
# variable) from the assembled features vector.
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    featuresCol='features',  # spelled out; this is also the estimator's default
    labelCol='crew',
)

In [42]:
# fit() builds the model from the training DataFrame.
lr_model = lr.fit(dataset=train_data)

In [43]:
# Evaluate how the model behaves on the held-out test rows.
test_results = lr_model.evaluate(dataset=test_data)

In [44]:
# Residuals of the model on the labeled test rows
# (per Spark's regression summary: label minus predicted value).
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| -0.3143566882048692|
| -0.9972066817469507|
| -1.6495102374573118|
| -0.9696645393885994|
|-0.38722561223823426|
|    0.80222180758979|
| -0.5039421100212582|
| 0.37999422244067915|
|  0.5873349169921074|
| -0.7022575244772273|
|  0.6023498378136125|
|  0.6173647586351159|
|  1.1260997295797246|
|  1.0072080793421687|
|  0.2088602279699785|
| 0.30182099155396847|
|-0.07910486895578472|
|-0.06663932704881681|
|  0.2228988439423052|
|  0.0919665003543706|
+--------------------+
only showing top 20 rows



In [45]:
# Root mean squared error (RMSE) of the model on the test split.
test_results.rootMeanSquaredError

0.885785163916936

In [46]:
# R-squared (coefficient of determination) of the model on the test split.
test_results.r2

0.9308982480918682

In [47]:
# Keep only the features column of the test split, leaving out crew, so the
# model can be applied to rows that carry no label.
unlabeled_data = test_data.select('features')

In [48]:
# Preview the unlabeled feature vectors.
unlabeled_data.show()

+--------------------+
|            features|
+--------------------+
|[0.0,10.0,90.09,2...|
|[0.0,14.0,138.0,3...|
|[0.0,15.0,78.491,...|
|[0.0,16.0,74.137,...|
|[0.0,17.0,70.0,20...|
|[1.0,9.0,88.5,21....|
|[1.0,9.0,110.0,29...|
|[1.0,13.0,101.509...|
|[1.0,17.0,70.367,...|
|[1.0,17.0,101.353...|
|[1.0,18.0,70.367,...|
|[1.0,19.0,70.367,...|
|[2.0,9.0,113.0,26...|
|[2.0,11.0,108.977...|
|[2.0,12.0,108.865...|
|[2.0,14.0,30.2769...|
|[2.0,16.0,77.499,...|
|[2.0,22.0,69.845,...|
|[3.0,16.0,59.652,...|
|[3.0,17.0,55.451,...|
+--------------------+
only showing top 20 rows



In [49]:
# Generate predictions for the unlabeled rows using the fitted model.
predictions = lr_model.transform(dataset=unlabeled_data)

In [50]:
# Show the predictions alongside the feature vectors they were computed from.
predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[0.0,10.0,90.09,2...|  8.89435668820487|
|[0.0,14.0,138.0,3...| 12.75720668174695|
|[0.0,15.0,78.491,...| 8.249510237457311|
|[0.0,16.0,74.137,...| 8.569664539388599|
|[0.0,17.0,70.0,20...| 7.587225612238234|
|[1.0,9.0,88.5,21....|  9.49777819241021|
|[1.0,9.0,110.0,29...|12.103942110021258|
|[1.0,13.0,101.509...| 11.12000577755932|
|[1.0,17.0,70.367,...| 8.612665083007892|
|[1.0,17.0,101.353...|10.702257524477227|
|[1.0,18.0,70.367,...| 8.597650162186387|
|[1.0,19.0,70.367,...| 8.582635241364883|
|[2.0,9.0,113.0,26...|11.253900270420276|
|[2.0,11.0,108.977...|10.992791920657831|
|[2.0,12.0,108.865...|10.791139772030022|
|[2.0,14.0,30.2769...|3.4281790084460315|
|[2.0,16.0,77.499,...| 9.079104868955785|
|[2.0,22.0,69.845,...| 7.026639327048817|
|[3.0,16.0,59.652,...| 6.217101156057695|
|[3.0,17.0,55.451,...| 5.788033499645629|
+--------------------+------------