# Predicting the number of crew members needed for future ships built by the ship manufacturing company

### Initialize and create a spark session

In [1]:
import org.apache.spark.sql.SparkSession
// Obtain the session for this notebook: reuse the active one if the kernel already
// created it, otherwise start a new one registered under the name "Cruise_Ship".
val spark = {
  val sessionBuilder = SparkSession.builder().appName("Cruise_Ship")
  sessionBuilder.getOrCreate()
}

2019-12-26 16:49:42 WARN  SparkContext:66 - Using an existing SparkContext; some configuration may not take effect.


import org.apache.spark.sql.SparkSession
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7dc32aca


### Initialize Logger

In [2]:
import org.apache.log4j._
// Raise the threshold of every logger under the "org" hierarchy to ERROR so that
// INFO/WARN messages from org.apache.* classes do not clutter the cell output.
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.log4j._


### Import statements to setup ML for Linear Regression

In [3]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


### Using Spark to read in the Cruise Ship Info csv file

In [4]:
// Load the CSV, taking column names from its first line and letting Spark infer column types.
val data = spark.read.option("header", "true").option("inferSchema", "true").csv("cruise_ship_info.csv")

data: org.apache.spark.sql.DataFrame = [Ship_name: string, Cruise_line: string ... 7 more fields]


### Print the count of the Dataframe

In [5]:
// Total number of data rows read from the CSV (the header line is not counted because header=true).
data.count()

res1: Long = 158


### Print the count of the Dataframe after dropping rows with null values

In [6]:
// na.drop() removes rows that contain a null/NaN in any column (it does not de-duplicate);
// a count equal to data.count() means the data set has no missing values.
data.na.drop().count()

res2: Long = 158


### Printing the first 5 rows of the dataframe

In [7]:
// Fetch the first 5 rows to the driver and print each Row on its own line.
data.head(5).foreach(row => println(row))

[Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55]
[Quest,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55]
[Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7]
[Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1]
[Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0]


### Printing the schema of the dataframe

In [8]:
// Print the column names, inferred types and nullability as a tree.
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



### Show the first 5 rows of the dataframe as a table

In [9]:
// Render the first 5 rows as an ASCII table.
data.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
only showing top 5 rows



### See an example of what the data looks like by printing out a Row

In [10]:
// Column names in schema order, and the first row of the data set.
val colnames = data.columns
val firstRow = data.head(1)(0)

// Walk the columns in order and print each name next to the first row's value at that position.
colnames.zipWithIndex.foreach { case (columnName, position) =>
  println(s"Column Name: $columnName")
  println(s"Column Data: ${firstRow(position)}")
  println()
}

Column Name: Ship_name
Column Data: Journey

Column Name: Cruise_line
Column Data: Azamara

Column Name: Age
Column Data: 6

Column Name: Tonnage
Column Data: 30.276999999999997

Column Name: passengers
Column Data: 6.94

Column Name: length
Column Data: 5.94

Column Name: cabins
Column Data: 3.55

Column Name: passenger_density
Column Data: 42.64

Column Name: crew
Column Data: 3.55



colnames: Array[String] = Array(Ship_name, Cruise_line, Age, Tonnage, passengers, length, cabins, passenger_density, crew)
firstRow: org.apache.spark.sql.Row = [Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55]


**Filtering the string columns and converting the dataframe to ML acceptable format ---> i.e., ("label","features")**

**Dealing with String and Categorical Columns**

### Ship Name is a useless arbitrary string, but the cruise_line itself may be useful. So making it into a categorical variable!

In [12]:
// Count rows per distinct Cruise_line value to see how many categories exist and how balanced they are.
data.groupBy("Cruise_line").count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



### Converting 'Cruise_line' column from string to a categorical numerical value using String Indexer

In [13]:
import org.apache.spark.ml.feature.StringIndexer

import org.apache.spark.ml.feature.StringIndexer


In [14]:
// StringIndexer maps each distinct Cruise_line string to a double index; by default the most
// frequent label gets 0.0, the next most frequent 1.0, and so on.
// NOTE(review): Cruise_line_Ind is later fed to LinearRegression as a plain numeric feature,
// which imposes an arbitrary ordering on cruise lines — consider one-hot encoding it instead.
val indexer = new StringIndexer().setInputCol("Cruise_line").setOutputCol("Cruise_line_Ind")

indexer: org.apache.spark.ml.feature.StringIndexer = strIdx_2a3cca60b43f


In [16]:
// Learn the label -> index mapping from the Cruise_line values present in `data`.
val indexer_model = indexer.fit(data)

indexer_model: org.apache.spark.ml.feature.StringIndexerModel = strIdx_2a3cca60b43f


In [17]:
// Append the Cruise_line_Ind column; all original columns are kept.
val indexed_df = indexer_model.transform(data)

indexed_df: org.apache.spark.sql.DataFrame = [Ship_name: string, Cruise_line: string ... 8 more fields]


In [18]:
// Inspect the first 5 rows to check the newly added index column.
indexed_df.show(5)

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_Ind|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|           16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|            1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|            1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|            1.0|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+---------------+

### Using Assembler, converting the input cols to a vector col

In [19]:
// Combine the numeric predictors (plus the indexed cruise line) into a single vector column
// named "features"; the vector keeps the order in which the columns are listed here.
val assembler = {
  val featureColumns = Array(
    "Age",
    "Tonnage",
    "passengers",
    "length",
    "cabins",
    "passenger_density",
    "Cruise_line_Ind"
  )
  new VectorAssembler().setInputCols(featureColumns).setOutputCol("features")
}

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_5891c1791e6a


In [20]:
// Add the assembled `features` vector column to every row of indexed_df.
val output = assembler.transform(indexed_df)

output: org.apache.spark.sql.DataFrame = [Ship_name: string, Cruise_line: string ... 9 more fields]


In [21]:
// Keep only what the regressor needs: the label column (crew) and the feature vector.
val final_data = output.select("crew","features")

final_data: org.apache.spark.sql.DataFrame = [crew: double, features: vector]


In [23]:
// Show the first 10 rows without truncating the (long) feature vectors.
final_data.show(numRows = 10, truncate = false)

+----+--------------------------------------------------+
|crew|features                                          |
+----+--------------------------------------------------+
|3.55|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0]|
|3.55|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0]|
|6.7 |[26.0,47.262,14.86,7.22,7.43,31.8,1.0]            |
|19.1|[11.0,110.0,29.74,9.53,14.88,36.99,1.0]           |
|10.0|[17.0,101.353,26.42,8.92,13.21,38.36,1.0]         |
|9.2 |[22.0,70.367,20.52,8.55,10.2,34.29,1.0]           |
|9.2 |[15.0,70.367,20.52,8.55,10.2,34.29,1.0]           |
|9.2 |[23.0,70.367,20.56,8.55,10.22,34.23,1.0]          |
|9.2 |[19.0,70.367,20.52,8.55,10.2,34.29,1.0]           |
|11.5|[6.0,110.23899999999999,37.0,9.51,14.87,29.79,1.0]|
+----+--------------------------------------------------+
only showing top 10 rows



### Splitting the resultant data into training data and testing data

<code>
<b>Training data is to train the model</b>
<b>Testing data is to test the built model</b>
</code>

In [24]:
// Split the data 70% / 30% into training and testing sets respectively.
// A fixed seed makes the split reproducible: without it every run of the notebook puts
// different rows in each set, so the fitted coefficients and all reported metrics change
// from run to run and cannot be compared.
val Array(train_data, test_data) = final_data.randomSplit(Array(0.7, 0.3), seed = 42L)

train_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [crew: double, features: vector]
test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [crew: double, features: vector]


In [25]:
// Summary statistics for the full data set; only `crew` is listed because describe()
// covers numeric and string columns, not the `features` vector column.
final_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              158|
|   mean|7.794177215189873|
| stddev|3.503486564627034|
|    min|             0.59|
|    max|             21.0|
+-------+-----------------+



In [26]:
// Same statistics for the training split, to compare its label distribution with the full data set.
train_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|              105|
|   mean|7.957523809523809|
| stddev| 3.36896462665871|
|    min|             0.59|
|    max|             19.1|
+-------+-----------------+



In [27]:
// Same statistics for the test split, to compare its label distribution with the full data set.
test_data.describe().show()

+-------+-----------------+
|summary|             crew|
+-------+-----------------+
|  count|               53|
|   mean|7.470566037735849|
| stddev|3.768134824824498|
|    min|              0.6|
|    max|             21.0|
+-------+-----------------+



### Creating a linear regression model object

In [28]:
// Estimator configuration: predict the `crew` column from the assembled `features` vector.
val lr = {
  val estimator = new LinearRegression()
  estimator.setFeaturesCol("features").setLabelCol("crew")
}

lr: org.apache.spark.ml.regression.LinearRegression = linReg_a08434fdd16f


### Creating a linear regression model and fitting the training data to it

In [29]:
// Fit on the training split only, so that the test split stays unseen for evaluation.
val lrModel = lr.fit(train_data)

2019-12-26 17:16:21 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2019-12-26 17:16:21 WARN  BLAS:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
2019-12-26 17:16:21 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
2019-12-26 17:16:21 WARN  LAPACK:61 - Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK


lrModel: org.apache.spark.ml.regression.LinearRegressionModel = linReg_a08434fdd16f


### Getting the training summary of the created model

In [30]:
// Metrics computed on the training data during fit (residuals, MAE, MSE, RMSE, r2 are read from it).
val trainingSummary = lrModel.summary

trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@7bfbcecc


### Residuals

In [31]:
// Residual = label - prediction for each training row; show the first 5.
trainingSummary.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|0.24395580140449524|
|0.23949456128490698|
|-0.4593602314340878|
|-0.4004285696495945|
|0.48236108881225515|
+-------------------+
only showing top 5 rows



### Errors

In [33]:
// Fit quality on the training data. Note that r2 is a goodness-of-fit score (closer to 1
// is better), unlike the three error metrics above it, where lower is better.
Seq(
  "Mean Absolute Error" -> trainingSummary.meanAbsoluteError,
  "Mean Squared Error" -> trainingSummary.meanSquaredError,
  "Root Mean Squared Error" -> trainingSummary.rootMeanSquaredError,
  "R Squared Error" -> trainingSummary.r2
).foreach { case (label, value) => println(s"$label: $value") }

Mean Absolute Error: 0.6050960895951126
Mean Squared Error: 0.8916442795139045
Root Mean Squared Error: 0.944269177466841
R Squared Error: 0.9206851174663102


### Evaluating the model against test data

In [34]:
// Score the held-out test split with the fitted model and collect its evaluation metrics.
val test_results = lrModel.evaluate(test_data)

test_results: org.apache.spark.ml.regression.LinearRegressionSummary = org.apache.spark.ml.regression.LinearRegressionSummary@13457e83


### Getting the coefficients and intercept

In [35]:
// One coefficient per entry of the `features` vector, in the order the columns were given to
// the VectorAssembler (Age, Tonnage, passengers, length, cabins, passenger_density, Cruise_line_Ind).
println(s"Coefficients: ${lrModel.coefficients}")
// Constant term of the fitted linear model.
println(s"Intercept: ${lrModel.intercept}")

Coefficients: [-0.016319964334064182,-0.0015206493724506448,-0.10056647424652906,0.44612401195887796,0.8291686417330928,0.00947293979899868,0.039733289233948685]
Intercept: -1.5391603951331094


### Getting the residuals

In [37]:
// Residual = label - prediction for each test row; show the first 5.
test_results.residuals.show(5)

+--------------------+
|           residuals|
+--------------------+
|  0.2662818975210043|
|-0.04204844629073201|
|0.006911446711460156|
| -1.0091376218505708|
| -0.2055175103656497|
+--------------------+
only showing top 5 rows



### Evaluating the model by checking the different types of error

In [38]:
// Fit quality on the held-out test data. Note that r2 is a goodness-of-fit score (closer
// to 1 is better), unlike the three error metrics above it, where lower is better.
Seq(
  "Mean Absolute Error" -> test_results.meanAbsoluteError,
  "Mean Squared Error" -> test_results.meanSquaredError,
  "Root Mean Squared Error" -> test_results.rootMeanSquaredError,
  "R Squared Error" -> test_results.r2
).foreach { case (label, value) => println(s"$label: $value") }

Mean Absolute Error: 0.6267717833657082
Mean Squared Error: 0.9210875244473135
Root Mean Squared Error: 0.9597330485334521
R Squared Error: 0.9338818704744329


### Checking the correlation between crew members with other features

In [39]:
// Pearson correlation between crew size and passenger count.
// `corr` lives in org.apache.spark.sql.functions and is not imported anywhere else in this
// notebook; import it here so the cell does not depend on the kernel pre-importing it.
import org.apache.spark.sql.functions.corr
data.select(corr("crew", "passengers")).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [41]:
// Pearson correlation between crew size and number of cabins.
// `corr` lives in org.apache.spark.sql.functions and is not imported anywhere else in this
// notebook; import it here so the cell does not depend on the kernel pre-importing it.
import org.apache.spark.sql.functions.corr
data.select(corr("crew", "cabins")).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



### Getting the predictions from the built model without the label column

In [42]:
// Drop the label so that only the feature vectors are passed to the model.
// (The misspelled name is kept as-is because the next cell refers to it.)
val unlablled_data = test_data.select("features")

unlablled_data: org.apache.spark.sql.DataFrame = [features: vector]


In [43]:
// transform() appends a `prediction` column (predicted crew) to each input row.
val predictions = lrModel.transform(unlablled_data)

predictions: org.apache.spark.sql.DataFrame = [features: vector, prediction: double]


In [45]:
// truncate = false prints the full feature vectors instead of cutting long cells short.
predictions.show(false)

+---------------------------------------------------+------------------+
|features                                           |prediction        |
+---------------------------------------------------+------------------+
|[12.0,2.329,0.94,2.96,0.45,24.78,6.0]              |0.3337181024789957|
|[24.0,10.0,2.08,4.4,1.04,48.08,13.0]               |1.642048446290732 |
|[27.0,10.0,2.08,4.4,1.04,48.08,13.0]               |1.59308855328854  |
|[23.0,14.745,3.08,6.17,1.56,47.87,14.0]            |2.809137621850571 |
|[13.0,25.0,3.82,5.97,1.94,65.45,11.0]              |3.15551751036565  |
|[36.0,16.852,9.52,5.41,3.83,17.7,7.0]              |2.9253529324042784|
|[14.0,33.0,4.9,5.6,2.45,67.35,10.0]                |3.25449597810897  |
|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0] |4.252131876502602 |
|[14.0,30.276999999999997,6.88,5.93,3.44,44.01,2.0] |3.488648237823999 |
|[40.0,28.0,11.5,6.74,4.0,24.35,4.0]                |3.322098043817242 |
|[23.0,25.0,7.76,6.22,3.86,32.22,5.0]              

### Stopping the created spark session

In [46]:
// Shut down the session and its underlying SparkContext; run this last, since the
// DataFrames created above cannot be used afterwards.
spark.stop()

## Thank You!