In [None]:
%cd /content
!rm -rf sample_data

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 28 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 50.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=615b64e1f877faac1f191fff65618a26e02344c9e2e0d462ebd1088f411f65c9
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1
/content


In [52]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('lrex').getOrCreate()

In [53]:
data = spark.read.csv('cruise_ship_info.csv', inferSchema=True, header=True)

In [58]:
data.printSchema()
data.show(10)

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|  

In [60]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors

In [82]:
data.groupBy("Cruise_line").count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [68]:
stringIndexer = StringIndexer(inputCol="Cruise_line", outputCol="ind_Cruise_line")
ind_data = stringIndexer.fit(data).transform(data)
ind_data.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|ind_Cruise_line|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+

In [70]:
ind_data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'ind_Cruise_line']

In [69]:
assembler = VectorAssembler(inputCols=['ind_Cruise_line', 'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density', 'crew'],
                            outputCol='features')

In [72]:
output = assembler.transform(ind_data)
output.select('features').show(5, truncate=False)

+-------------------------------------------------------+
|features                                               |
+-------------------------------------------------------+
|[16.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64,3.55]|
|[16.0,6.0,30.276999999999997,6.94,5.94,3.55,42.64,3.55]|
|[1.0,26.0,47.262,14.86,7.22,7.43,31.8,6.7]             |
|[1.0,11.0,110.0,29.74,9.53,14.88,36.99,19.1]           |
|[1.0,17.0,101.353,26.42,8.92,13.21,38.36,10.0]         |
+-------------------------------------------------------+
only showing top 5 rows



In [73]:
final_data = output.select('features','crew')
final_data.show(5)

+--------------------+----+
|            features|crew|
+--------------------+----+
|[16.0,6.0,30.2769...|3.55|
|[16.0,6.0,30.2769...|3.55|
|[1.0,26.0,47.262,...| 6.7|
|[1.0,11.0,110.0,2...|19.1|
|[1.0,17.0,101.353...|10.0|
+--------------------+----+
only showing top 5 rows



In [74]:
train, test = final_data.randomSplit([0.7,0.3])
train.describe().show()
test.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              110|
|   mean|7.659272727272734|
| stddev|3.556732783057374|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                48|
|   mean| 8.103333333333332|
| stddev|3.3946330189481944|
|    min|              0.88|
|    max|              19.1|
+-------+------------------+



In [75]:
lr = LinearRegression(labelCol='crew')
lr_model = lr.fit(train)

In [76]:
test_results = lr_model.evaluate(test)
test_results.residuals.show()



+--------------------+
|           residuals|
+--------------------+
|1.598721155460225...|
|1.776356839400250...|
|-3.55271367880050...|
|-5.32907051820075...|
|-2.30926389122032...|
|-2.66453525910037...|
|-8.88178419700125...|
|-2.66453525910037...|
|1.953992523340275...|
|5.506706202140776...|
|7.105427357601002...|
|1.598721155460225...|
|1.953992523340275...|
|-3.55271367880050...|
|1.598721155460225...|
|1.065814103640150...|
|8.881784197001252...|
|-1.24344978758017...|
|1.421085471520200...|
|7.993605777301127...|
+--------------------+
only showing top 20 rows



In [77]:
test_results.rootMeanSquaredError

3.1464002577508116e-14

In [78]:
test_results.r2

1.0