In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://mirrors.sonic.net/apache/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xzf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q findspark


import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"


import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# Load the cruise-ship CSV: first row is the header, column types are inferred.
csv_path = '/content/drive/MyDrive/Colab Notebooks/Crew/cruise_ship_info.csv'
df = spark.read.csv(csv_path, header=True, inferSchema=True)

In [None]:
# Print the column names and the types Spark inferred for them (inferSchema=True on load).
df.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [None]:
# Summary statistics for every column. describe() also covers the string columns
# (Ship_name, Cruise_line), so their mean/stddev entries are not meaningful.
df.describe().show()

+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|summary|Ship_name|Cruise_line|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|
+-------+---------+-----------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+
|  count|      158|        158|               158|               158|              158|              158|               158|              158|              158|
|   mean| Infinity|       null|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|
| stddev|     null|       null| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|
|    min|Adventure|    Azamara|   

In [None]:
# Preview the first five rows of the raw data.
df.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



In [None]:
# Number of ships per cruise line.
(
    df
    .groupBy('Cruise_line')
    .count()
    .show()
)

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [None]:
from pyspark.ml.feature import StringIndexer

# Encode the Cruise_line strings as numeric indices in a new Cruise_lineIndex column.
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_lineIndex')
# Fit learns the label-to-index mapping; transform appends the index column.
indexer_model = indexer.fit(df)
indexed = indexer_model.transform(df)

In [None]:
# Show one row to confirm the Cruise_lineIndex column was appended.
indexed.show(1)

+---------+-----------+---+------------------+----------+------+------+-----------------+----+----------------+
|Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_lineIndex|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+----------------+
|  Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|            16.0|
+---------+-----------+---+------------------+----------+------+------+-----------------+----+----------------+
only showing top 1 row



In [None]:
# List the column names of the indexed DataFrame.
indexed.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_lineIndex']

In [None]:
from pyspark.ml.feature import VectorAssembler

# Input columns that get packed into the single 'features' vector column.
feature_cols = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'Cruise_lineIndex',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [None]:
# Append the assembled 'features' vector column to the indexed data.
output = assembler.transform(indexed)

In [None]:
# Keep only the assembled feature vector and the target column 'crew'.
final_data = output.select(['features','crew'])
# 70/30 train/test split. The fixed seed makes the split, and therefore every
# metric computed from it, reproducible across kernel restarts.
train,test = final_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Summary statistics of the training split (the output lists only the 'crew' column).
train.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               112|
|   mean| 7.594285714285723|
| stddev|3.6942099118196925|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [None]:
from pyspark.ml.regression import LinearRegression

# Linear regression predicting 'crew' from the assembled 'features' vector.
lr = LinearRegression(featuresCol='features', labelCol='crew')
model = lr.fit(train)

In [None]:
# Evaluate the fitted model on the test split; the result exposes the predictions
# and the metrics (rootMeanSquaredError, r2) read in the following cells.
result = model.evaluate(test)

In [None]:
# Show the test rows side by side with the model's predictions.
result.predictions.show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[5.0,115.0,35.74,...| 12.2|12.201331015897093|
|[6.0,90.0,20.0,9....|  9.0|10.281607025554854|
|[6.0,112.0,38.0,9...| 10.9|11.622999669733208|
|[7.0,89.6,25.5,9....| 9.87|11.384326270595423|
|[7.0,116.0,31.0,9...| 12.0|12.774416775842035|
|[8.0,110.0,29.74,...| 11.6|  12.2258460250553|
|[9.0,81.0,21.44,9...| 10.0| 9.684645433315179|
|[9.0,85.0,19.68,9...| 8.69| 9.456044903335886|
|[10.0,86.0,21.14,...|  9.2|  9.78779979818717|
|[10.0,91.62700000...|  9.0| 9.156591836732177|
|[10.0,110.0,29.74...| 11.6|12.173979571980174|
|[11.0,86.0,21.24,...|  9.3| 9.541853963414377|
|[11.0,108.977,26....| 12.0| 11.05281578352847|
|[12.0,42.0,14.8,7...|  6.8| 7.010633747354531|
|[12.0,50.0,7.0,7....| 4.45| 4.422851657320788|
|[12.0,58.6,15.66,...|  7.0|7.5886929239827285|
|[12.0,77.104,20.0...| 9.59| 8.867174096992981|
|[12.0,88.5,21.24,...|10.29| 9.461241913

In [None]:
# Root mean squared error of the predictions on the test split.
result.rootMeanSquaredError

0.7892726321215242

In [None]:
# R² (coefficient of determination) on the test split.
result.r2

0.9277964694383092

In [None]:
# Summary statistics of the test split; the mean and stddev of 'crew' give a scale
# against which the RMSE above can be judged.
test.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               46|
|   mean|8.280869565217392|
| stddev|2.969754068446264|
|    min|              1.6|
|    max|            13.13|
+-------+-----------------+



In [None]:
from pyspark.sql.functions import corr

# Pearson correlation of 'crew' with each of the listed columns, computed over the full dataset.
df.select(*[corr('crew', other) for other in ('passengers', 'cabins')]).show()

+----------------------+------------------+
|corr(crew, passengers)|corr(crew, cabins)|
+----------------------+------------------+
|    0.9152341306065384|0.9508226063578497|
+----------------------+------------------+



In [None]:
# Score the test split with the fitted model and display the resulting predictions.
model.transform(test).show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[5.0,86.0,21.04,9...|  8.0| 9.293595502226285|
|[5.0,133.5,39.59,...|13.13|13.210292404474307|
|[6.0,112.0,38.0,9...| 10.9|11.535156467269488|
|[9.0,59.058,17.0,...|  7.4| 7.583910638924453|
|[9.0,81.0,21.44,9...| 10.0| 9.563525641013848|
|[9.0,90.09,25.01,...| 8.69| 9.403498608364483|
|[9.0,113.0,26.74,...|12.38|11.300512568834295|
|[9.0,113.0,26.74,...|12.38|11.300512568834295|
|[10.0,81.76899999...| 8.42| 8.803536478146047|
|[10.0,105.0,27.2,...|10.68|11.211376781826578|
|[10.0,151.4,26.2,...|12.53|10.826016272371906|
|[11.0,58.6,15.66,...|  7.6| 7.414907169813991|
|[11.0,90.09,25.01...| 8.48| 9.009265666743765|
|[11.0,91.0,20.32,...| 9.99| 9.142237596344817|
|[11.0,108.977,26....| 12.0|11.048323468936674|
|[12.0,25.0,3.88,5...| 2.87|3.1347598206120235|
|[12.0,77.104,20.0...| 9.59| 8.796912502102877|
|[12.0,90.09,25.01...| 8.68| 8.995948757