# Linear Regression Consulting Project

    Description: Measurements of ship size, capacity, crew, and age for 158 cruise
    ships.


    Variables/Columns
    Ship Name     1-20
    Cruise Line   21-40
    Age (as of 2013)   46-48
    Tonnage (1000s of tons)   50-56
    Passengers (100s)   58-64
    Length (100s of feet)  66-72
    Cabins  (100s)   74-80
    Passenger Density   82-88
    Crew  (100s)   90-96


In [1]:
# Mount Google Drive into the Colab filesystem under /content/drive.
# NOTE(review): the CSV is later read from /content/cruise_ship_info.csv, which is
# outside the mounted folder - confirm whether this mount is actually required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=728387c50b42bd33047931b88fddcb9f5ebaf195b9cd2dd4a0354d7522c2c358
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [4]:
from pyspark.sql.session import SparkSession

In [6]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('consult_project')
    .getOrCreate()
)

In [7]:
from pyspark.ml.regression import LinearRegression

In [98]:
# Load the cruise ship CSV: the first row holds the column names and the
# column types are inferred from the values.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/cruise_ship_info.csv')
)

data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [99]:
## Print the column names, inferred types and nullability of the loaded DataFrame

data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [106]:
# Number of ships per cruise line, listed from the most to the least frequent line
(
    data
    .groupBy('Cruise_line')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

+-----------------+-----+
|      Cruise_line|count|
+-----------------+-----+
|  Royal_Caribbean|   23|
|         Carnival|   22|
|         Princess|   17|
| Holland_American|   14|
|        Norwegian|   13|
|            Costa|   11|
|        Celebrity|   10|
|              MSC|    8|
|              P&O|    6|
|             Star|    6|
|Regent_Seven_Seas|    5|
|        Silversea|    4|
|           Cunard|    3|
|         Seabourn|    3|
|         Windstar|    3|
|          Oceania|    3|
|          Crystal|    2|
|           Disney|    2|
|          Azamara|    2|
|           Orient|    1|
+-----------------+-----+



In [58]:
from pyspark.ml.feature import StringIndexer

In [108]:
## Encode the string-valued Cruise_line column as a numeric column
## (Cruise_line_indexed) so that it can be fed to the model as an input
stringindexer = StringIndexer(
    inputCol='Cruise_line',
    outputCol='Cruise_line_indexed',
)

## Fit the indexer on the data, then use the fitted model to append the indexed column
indexerModel = stringindexer.fit(data)
indexed_data = indexerModel.transform(data)

indexed_data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|Cruise_line_indexed|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+-------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|               16.0|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|                1.0|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|                1.0|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|                1.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.5

In [64]:
# Drop the original string column now that its numeric counterpart exists.
# NOTE: this rebinds `data`; the earlier cells that read 'Cruise_line' from `data`
# need the CSV to be re-loaded before they can be re-run.
data = indexed_data.drop('Cruise_line')

In [70]:
# List the columns currently held in `data`.
# NOTE(review): the recorded output has no 'Ship_name', although only 'Cruise_line'
# was dropped above - it looks like this cell was last executed after the select
# cell that follows it (execution counts 70 vs 67). Re-run top to bottom to refresh.
data.columns

['Cruise_line_indexed',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [67]:
# Keep the indexed cruise line, the numeric ship measurements and the target (crew);
# Ship_name is not selected and therefore no longer part of `data`.
data = data.select([
    'Cruise_line_indexed',
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
    'crew',
])

In [69]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [73]:
## Combine the predictor columns into a single vector column called 'features'.
## 'crew' is left out of the vector because it is the label the model is fitted on.
# NOTE(review): Cruise_line_indexed is a category index, yet it enters the regression
# as an ordinary number, which implies an ordering between cruise lines - consider
# one-hot encoding it; confirm with the modelling intent.

feature_cols = [
    'Cruise_line_indexed',
    'Age',
    'Tonnage',
    'passengers',
    'length',
    'cabins',
    'passenger_density',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

final_data = assembler.transform(data)

In [74]:
# Confirm that the assembled 'features' vector column was appended to the DataFrame
final_data.printSchema()

root
 |-- Cruise_line_indexed: double (nullable = false)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)
 |-- features: vector (nullable = true)



In [77]:
## Narrow the DataFrame to the two columns the model needs:
## the assembled feature vector and the target column (crew)

final_data = final_data.select(['features', 'crew'])

In [120]:
# Train test split (80% train / 20% test)
#
# A fixed seed makes the split repeatable for the same input DataFrame, so the
# fitted model and the metrics reported below can be reproduced on a re-run.

train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

In [121]:
## Set up a linear regression that predicts 'crew' from the 'features' vector,
## then fit it on the training split only

lr = LinearRegression(featuresCol='features', labelCol='crew')
lr_model = lr.fit(train_data)

In [122]:
## Evaluate the fitted model on the held-out test split; the returned summary
## exposes the error metrics and the residuals that are inspected below

test_results = lr_model.evaluate(test_data)

In [123]:
# Summary statistics of the training split (the recorded output covers the crew
# column), which give a sense of scale for the error metrics reported below
train_data.describe().show()

+-------+------------------+
|summary|              crew|
+-------+------------------+
|  count|               129|
|   mean| 7.740775193798456|
| stddev|3.2904270743901667|
|    min|              0.59|
|    max|              19.1|
+-------+------------------+



### Looking at the metrics and performance of our Linear Regression model

In [127]:
# Report the test-set metrics, one line per metric
for metric_name, metric_value in [
    ('Root Mean Squared Error', test_results.rootMeanSquaredError),
    ('R2 score', test_results.r2),
    ('Mean Squared Error', test_results.meanSquaredError),
    ('Mean Absolute Error', test_results.meanAbsoluteError),
]:
    print(f'{metric_name} is: {metric_value}')

Root Mean Squared Error is: 0.6943137945993951
R2 score is: 0.974074230079063
Mean Squared Error is: 0.48207164537101105
Mean Absolute Error is: 0.5247238820434306


In [128]:
## Residuals on the test split: the per-row difference between the actual crew
## value and the model's prediction (per the Spark docs, label minus prediction)

test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
| 0.09329990313355552|
| -1.5152401758803844|
|-0.32742917621478185|
| -0.2544301294111708|
| -0.4291002152505392|
|-0.30312356813270114|
|  0.7415240799863483|
| -1.1675075314339765|
|  0.9959407238707776|
|0.047703039884721576|
| -1.3625177244248103|
|0.050477878675664556|
| 0.06595382962433938|
| -0.6076144903573439|
|   0.734349609990046|
| -1.2844364425519936|
| -1.2689604916033188|
|-0.09255386098968099|
|  0.3966507768456591|
| -0.8335280995168368|
+--------------------+
only showing top 20 rows



In [131]:
## Check how strongly the target (crew) is linearly related to one of the predictors (cabins)

from pyspark.sql.functions import corr

data.select(corr('crew', 'cabins')).show()

# The recorded correlation is about 0.95: crew size rises almost linearly with the number of cabins,
# so a physical attribute of the ship is a strong indicator of how many crew members are needed.
# NOTE(review): only the crew/cabins pair is checked here; the other features are not examined.

+------------------+
|corr(crew, cabins)|
+------------------+
|0.9508226063578497|
+------------------+



In [126]:
## Score every row of final_data with the fitted model and display the predictions.
## The label column is removed first, so the model only receives the feature vectors.
## NOTE: final_data contains the training rows as well as the test rows, so these
## predictions are not purely out-of-sample.

unlabeled_data = final_data.select('features')

predictions = lr_model.transform(unlabeled_data)

predictions.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[16.0,6.0,30.2769...| 4.424684760400513|
|[16.0,6.0,30.2769...| 4.424684760400513|
|[1.0,26.0,47.262,...| 6.251233440389353|
|[1.0,11.0,110.0,2...|12.171519943598192|
|[1.0,17.0,101.353...|10.788431609632536|
|[1.0,22.0,70.367,...| 8.612898195770308|
|[1.0,15.0,70.367,...|  8.72122985241103|
|[1.0,23.0,70.367,...|  8.60904696259588|
|[1.0,19.0,70.367,...|  8.65932604861633|
|[1.0,6.0,110.2389...|11.177389337872441|
|[1.0,10.0,110.0,2...| 12.16947607795056|
|[1.0,28.0,46.052,...| 6.137471047165583|
|[1.0,18.0,70.367,...| 8.674801999565005|
|[1.0,17.0,70.367,...|  8.69027795051368|
|[1.0,11.0,86.0,21...| 9.603123568132702|
|[1.0,8.0,110.0,29...|12.200427979847909|
|[1.0,9.0,88.5,21....| 9.646924430109882|
|[1.0,15.0,70.367,...|  8.72122985241103|
|[1.0,12.0,88.5,21...|10.467507531433977|
|[1.0,20.0,70.367,...| 8.643850097667656|
+--------------------+------------