__Prediction of the number of crew members needed for future ships built by the ship manufacturing company__

In [46]:
# Initialize pyspark: findspark.init() is called before `import pyspark`
# (presumably it locates the local Spark installation and makes pyspark importable -- confirm against the findspark docs)
import findspark
findspark.init()
import pyspark

In [47]:
# Create (or reuse an existing) Spark session for this notebook, named 'Cruise_Ship'
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Cruise_Ship').getOrCreate()

In [48]:
# Import statements to setup ML
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.linalg import Vectors

In [49]:
# Read the cruise ship CSV file into a Spark DataFrame.
# header=True takes the column names from the first row; inferSchema=True lets Spark infer the column types.
data = spark.read.csv('cruise_ship_info.csv', header=True, inferSchema=True)

In [50]:
# Display the first row of the DataFrame (returned as a Row object)
data.head()

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55)

In [51]:
# Print the schema of the DataFrame: column names, inferred types and nullability
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



__Filtering the string columns and converting the dataframe to ML acceptable format
             --->    i.e., ("label","features")__

__Dealing with String and Categorical Columns__

#Ship_name is an arbitrary string with no predictive value, but Cruise_line itself may be useful, so we convert it into a categorical variable.

In [52]:
# Count how many ships belong to each cruise line
data.groupBy('Cruise_line').count().show()

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|            Costa|   11|
|              P&O|    6|
|           Cunard|    3|
|Regent_Seven_Seas|    5|
|              MSC|    8|
|         Carnival|   22|
|          Crystal|    2|
|           Orient|    1|
|         Princess|   17|
|        Silversea|    4|
|         Seabourn|    3|
| Holland_American|   14|
|         Windstar|    3|
|           Disney|    2|
|        Norwegian|   13|
|          Oceania|    3|
|          Azamara|    2|
|        Celebrity|   10|
|             Star|    6|
|  Royal_Caribbean|   23|
+-----------------+-----+



In [53]:
# Convert the 'Cruise_line' string column into a numeric category index using StringIndexer.
# fit() learns the label -> index mapping from the data; transform() appends it as 'Cruise_line_Cat'.
# NOTE(review): this index is later fed to a linear regression as a plain numeric feature, which
# implies an ordering between cruise lines -- consider one-hot encoding it instead; confirm intent.
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_line_Cat')
indexed_model = indexer.fit(data)
filtered_data = indexed_model.transform(data)

In [54]:
# Inspect the first row to confirm the new 'Cruise_line_Cat' column was added
filtered_data.head()

Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, Cruise_line_Cat=16.0)

In [55]:
# List all columns available after indexing, to choose the model's input features
filtered_data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'Cruise_line_Cat']

In [56]:
# Combine the numeric predictors (including the indexed cruise line) into a
# single vector column named 'features', as expected by Spark ML estimators.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Age',
        'Tonnage',
        'passengers',
        'length',
        'cabins',
        'passenger_density',
        'Cruise_line_Cat',
    ],
)

In [57]:
# Append the assembled 'features' vector column to the indexed data
output = assembler.transform(filtered_data)

In [58]:
# Keep only the label ('crew') and the 'features' vector for modelling
final_data = output.select('crew','features')

In [59]:
# Preview the first 3 rows without truncating the feature vectors
final_data.show(3, truncate=False)

+----+--------------------------------------------------+
|crew|features                                          |
+----+--------------------------------------------------+
|3.55|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0]|
|3.55|[6.0,30.276999999999997,6.94,5.94,3.55,42.64,16.0]|
|6.7 |[26.0,47.262,14.86,7.22,7.43,31.8,1.0]            |
+----+--------------------------------------------------+
only showing top 3 rows



#Splitting the resulting data into training data and testing data
#Training data is used to train the model
#Testing data is used to test the built model

In [60]:
# Split the data roughly 70% / 30% into training and testing sets.
# randomSplit samples randomly, so a fixed seed is passed to keep the split
# (and every metric computed from it below) reproducible between runs.
train_data,test_data = final_data.randomSplit([0.7,0.3], seed=42)

In [61]:
# Summary statistics of the label column in the training split
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               105|
|   mean| 7.995809523809522|
| stddev|3.5537539147283246|
|    min|              0.59|
|    max|              21.0|
+-------+------------------+



In [62]:
# Summary statistics of the label column in the testing split
test_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|                53|
|   mean| 7.394716981132077|
| stddev|3.3994465634262747|
|    min|              0.59|
|    max|              13.6|
+-------+------------------+



In [63]:
# Create the linear regression estimator with 'crew' as the label column.
# The features column is not set explicitly, so the estimator's default is used.
lr = LinearRegression(labelCol='crew')

In [64]:
# Fit the linear regression estimator on the training split to obtain the trained model
lrModel = lr.fit(train_data)

__Getting the training summary of the created model__

In [65]:
# Summary of the fit on the training data; residuals and error metrics are read from it
training_summary = lrModel.summary

In [66]:
# Show the first 3 residuals on the training data
training_summary.residuals.show(3)

+--------------------+
|           residuals|
+--------------------+
|   0.318789155630844|
|-0.09447659838001365|
| -0.8139626752580021|
+--------------------+
only showing top 3 rows



In [67]:
# Fit metrics on the training data. MAE, MSE and RMSE are errors (lower is better);
# r2 is the coefficient of determination (closer to 1 is better), so it is
# labelled "R Squared" rather than as an error.
print("Mean Absolute Error: ",training_summary.meanAbsoluteError)
print("Mean Squared Error: ",training_summary.meanSquaredError)
print("Root Mean Squared Error: ",training_summary.rootMeanSquaredError)
print("R Squared: ",training_summary.r2)

Mean Absolute Error:  0.66974095706352
Mean Squared Error:  1.1450566377341058
Root Mean Squared Error:  1.0700731926995022
R Squared Error:  0.9084605652845587


In [68]:
# Evaluate the trained model on the held-out test split;
# the result exposes residuals and error metrics for that split
test_results = lrModel.evaluate(test_data)

In [69]:
# Print the fitted coefficients (one per entry of the 'features' vector) and the intercept
print('Coeffecients: {}, Intercept: {}'.format(lrModel.coefficients,lrModel.intercept))

Coeffecients: [-0.008234192619860093,0.011901430260234547,-0.16230132388734037,0.3805018120002111,0.8916250170617811,-0.013683750095525747,0.06152547425588149], Intercept: -0.6394544161702387


In [70]:
# Show the first 3 residuals on the test data
test_results.residuals.show(3)

+-------------------+
|          residuals|
+-------------------+
| 0.3225941737508462|
|-0.8077915960422782|
|0.18778968740657942|
+-------------------+
only showing top 3 rows



In [71]:
# Evaluate the model on the test data. MAE, MSE and RMSE are errors (lower is better);
# r2 is the coefficient of determination (closer to 1 is better), so it is
# labelled "R Squared" rather than as an error.

print("Mean Absolute Error: ",test_results.meanAbsoluteError)
print("Mean Squared Error: ",test_results.meanSquaredError)
print("Root Mean Squared Error: ",test_results.rootMeanSquaredError)
print("R Squared: ",test_results.r2)

Mean Absolute Error:  0.5043804667646195
Mean Squared Error:  0.4227015886844005
Root Mean Squared Error:  0.6501550497261407
R Squared Error:  0.9627187926557267


__Checking the correlation between the number of crew members and other features__

In [72]:
# Correlation coefficient between 'crew' and 'passengers', computed over the full dataset
from pyspark.sql.functions import corr

data.select(corr('crew','passengers')).show()

+----------------------+
|corr(crew, passengers)|
+----------------------+
|    0.9152341306065384|
+----------------------+



In [73]:
# Correlation coefficient between 'crew' and 'cabins', computed over the full dataset
data.select(corr('crew','cabins')).show()

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



__Getting predictions from the built model for data without the label column__

In [74]:
# Drop the label so only the feature vectors remain, mimicking unlabelled data
unlabelled_data = test_data.select('features')

In [75]:
# Apply the trained model to the unlabelled features; the result gains a 'prediction' column
predictions = lrModel.transform(unlabelled_data)

In [76]:
# Show the first 5 predicted crew values alongside their feature vectors
predictions.show(5)

+--------------------+-------------------+
|            features|         prediction|
+--------------------+-------------------+
|[22.0,3.341,0.66,...|0.26740582624915377|
|[27.0,5.35,1.67,4...| 1.6877915960422782|
|[27.0,12.5,3.94,4...| 1.2722103125934205|
|[19.0,16.8,2.96,5...| 2.0991029340430085|
|[16.0,19.2,3.2,5....|  2.110745786156217|
+--------------------+-------------------+
only showing top 5 rows



In [77]:
# Stop the Spark session created at the top of the notebook
spark.stop()