In [1]:
#Consulting project for crew count specification for Hyundai Cruise ships manufacturing.
#The data was acquired from the UCI machine learning repo.

import os

import findspark

#Locate the Spark installation: use SPARK_HOME when it is set, otherwise fall back to
#the original local install path so the existing setup keeps working unchanged.
findspark.init(os.environ.get('SPARK_HOME', '/home/shoby/spark-2.4.0-bin-hadoop2.7'))
import pyspark

In [2]:
#Importing SparkSession

from pyspark.sql import SparkSession

In [3]:
#Building (or reusing) the Spark session for this analysis.

session_builder = SparkSession.builder.appName('Cruise_Consulting')
spark = session_builder.getOrCreate()

In [4]:
#Loading the cruise ship CSV with the Spark reader: the first row supplies the
#column names and the column types are inferred from the values.

data = spark.read.csv(
    'cruise_ship_info.csv',
    header=True,
    inferSchema=True,
)

In [5]:
#Previewing the first rows of the loaded data.

data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [6]:
#Printing the inferred schema to confirm the column names and types.

data.printSchema()

#'crew' (double) is the column the model will be trained to predict.

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [7]:
#Dropping every row that contains at least one missing value.

data = data.na.drop(how='any')

In [8]:
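#Importing StringIndexer, which maps each distinct string label to a numeric index (comparable to sklearn's LabelEncoder; unlike pandas get_dummies it does not one-hot encode).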

from pyspark.ml.feature import StringIndexer

In [9]:
#Configuring a StringIndexer that reads 'Cruise_line' and writes the resulting
#index to a new 'Indexed_Cruise_line' column.

si = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Indexed_Cruise_line',
)

In [10]:
#Fitting the StringIndexer on the cleaned data; the result is the fitted model used for the transform below.

si_indexed_cruiseline = si.fit(data)

In [11]:
#Applying the fitted StringIndexer model to the cleaned data to add the indexed column.

transformed_cruiseline = si_indexed_cruiseline.transform(data)

In [12]:
#Checking that the transformation worked.

transformed_cruiseline.show()

#The Indexed_Cruise_line column is now available in the dataframe.

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Indexed_Cruise_line|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.5

In [13]:
#Listing the columns of the transformed dataframe so the ones needed for modelling can be selected into final_data.

transformed_cruiseline.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Indexed_Cruise_line']

In [14]:
#Building final_data from the numeric columns only: the string columns
#'Ship_name' and 'Cruise_line' are left out.

numeric_columns = [
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'crew',
    'Indexed_Cruise_line',
]
final_data = transformed_cruiseline.select(*numeric_columns)

In [15]:
#Checking how the final_data dataframe looks.

final_data.show()

#Next, the feature columns have to be assembled into a single vector column so that the Spark algorithm can process them.

+---+------------------+----------+------+------+-----------------+----+-------------------+
|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Indexed_Cruise_line|
+---+------------------+----------+------+------+-----------------+----+-------------------+
|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|                1.0|
| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|                1.0|
| 23|            70.367|     20.56|  8.55| 10.22|            34.23| 9.

In [16]:
#to use the ML lib in Spark we need to import the following.

#from pyspark ml linear algebra import vectors.
from pyspark.ml.linalg import Vectors 

#from pyspark ml features import Vector Assembler.
from pyspark.ml.feature import VectorAssembler 

#for now we will only use numerical data.

In [17]:
#Listing the final_data columns to pick the VectorAssembler inputs from.

final_data.columns

['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Indexed_Cruise_line']

In [18]:
#Initializing VectorAssembler with the feature columns of final_data as inputs and
#'vectorized_data' as the output column. The label column 'crew' is left out of the inputs.
#NOTE(review): 'Indexed_Cruise_line' is a category index but is assembled here as an
#ordinary number, which implies an ordering between cruise lines - consider one-hot
#encoding it before it is used by the linear regression.

vs = VectorAssembler(
    inputCols = ['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
                 'Indexed_Cruise_line'],
    outputCol = 'vectorized_data')

#Transforming final_data with vs. Any existing 'vectorized_data' column is dropped first
#(a no-op when the column is absent) so that this cell can be re-run: without the drop,
#a second run fails because the output column already exists.

final_data = vs.transform(final_data.drop('vectorized_data'))

In [19]:
#Checking final_data now that the 'vectorized_data' column has been added.

final_data.show()

+---+------------------+----------+------+------+-----------------+----+-------------------+--------------------+
|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Indexed_Cruise_line|     vectorized_data|
+---+------------------+----------+------+------+-----------------+----+-------------------+--------------------+
|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|[6.0,30.276999999...|
|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|[6.0,30.276999999...|
| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|[26.0,47.262,14.8...|
| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|[11.0,110.0,29.74...|
| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|[17.0,101.353,26....|
| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|                

In [20]:
#Splitting the data into train and test sets at a 70/30 ratio.
#A fixed seed keeps the split, and every metric computed from it, repeatable across runs.

train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [21]:
#Summarising the training split to sanity-check it.

train_summary = train_data.describe()
train_summary.show()

#The table looks a bit messy because it is wider than the available space.

+-------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+-------------------+
|summary|              Age|          Tonnage|        passengers|            length|           cabins| passenger_density|             crew|Indexed_Cruise_line|
+-------+-----------------+-----------------+------------------+------------------+-----------------+------------------+-----------------+-------------------+
|  count|              109|              109|               109|               109|              109|               109|              109|                109|
|   mean|16.44954128440367|67.70890825688073|17.475596330275234| 7.983577981651367|8.532660550458719|40.097889908256874|7.572201834862394|  5.330275229357798|
| stddev|8.077731436362686|37.53552165456602| 9.475284069932277|1.8451269769292808|4.498223238165683| 9.197777416048673|3.602790677043172|  4.872551704059443|
|    min|                4|            2.329| 

In [22]:
#importing linear Regression.

from pyspark.ml.regression import LinearRegression

In [23]:
#Setting up the LinearRegression estimator: 'vectorized_data' is the features column,
#'crew' is the label column and predictions are written to 'prediction'.

lr = LinearRegression(
    labelCol='crew',
    featuresCol='vectorized_data',
    predictionCol='prediction',
)

In [24]:
#Fitting the linear regression on the training split.

lr_Model = lr.fit(train_data)

In [25]:
#Evaluating the fitted model on the held-out test split.
test_results = lr_Model.evaluate(test_data)

In [28]:
#Root mean squared error on the test split.
test_results.rootMeanSquaredError

0.7232038863827399

In [27]:
#R-squared on the test split.
test_results.r2

0.9495423850444498

In [29]:
#Mean absolute error on the test split.
test_results.meanAbsoluteError

0.595579093425735