In [3]:
!pip3 install koalas

Collecting koalas
  Downloading koalas-1.8.2-py3-none-any.whl (390 kB)
[?25l[K     |▉                               | 10 kB 20.5 MB/s eta 0:00:01[K     |█▊                              | 20 kB 9.0 MB/s eta 0:00:01[K     |██▌                             | 30 kB 7.8 MB/s eta 0:00:01[K     |███▍                            | 40 kB 3.6 MB/s eta 0:00:01[K     |████▏                           | 51 kB 3.6 MB/s eta 0:00:01[K     |█████                           | 61 kB 4.3 MB/s eta 0:00:01[K     |█████▉                          | 71 kB 4.6 MB/s eta 0:00:01[K     |██████▊                         | 81 kB 4.8 MB/s eta 0:00:01[K     |███████▌                        | 92 kB 5.3 MB/s eta 0:00:01[K     |████████▍                       | 102 kB 4.3 MB/s eta 0:00:01[K     |█████████▎                      | 112 kB 4.3 MB/s eta 0:00:01[K     |██████████                      | 122 kB 4.3 MB/s eta 0:00:01[K     |███████████                     | 133 kB 4.3 MB/s eta 0:00:01[K    

In [4]:
!pip install pyarrow==0.15.1

Collecting pyarrow==0.15.1
  Downloading pyarrow-0.15.1-cp37-cp37m-manylinux2010_x86_64.whl (59.2 MB)
[K     |████████████████████████████████| 59.2 MB 1.2 MB/s 
Installing collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 6.0.1
    Uninstalling pyarrow-6.0.1:
      Successfully uninstalled pyarrow-6.0.1
Successfully installed pyarrow-0.15.1


In [5]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz
!tar xf spark-3.0.3-bin-hadoop3.2.tgz
!pip install -q findspark

In [6]:
!ls /usr/lib/jvm/

default-java		   java-11-openjdk-amd64     java-8-openjdk-amd64
java-1.11.0-openjdk-amd64  java-1.8.0-openjdk-amd64


In [7]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.3-bin-hadoop3.2"

In [8]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [9]:
import sys
from pyspark.sql.functions import *

In [10]:
data=spark.read.csv("/content/cruise_ship_info.csv",inferSchema=True,header=True)

In [11]:
data.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|
|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|
|    Fantasy|   Carnival| 23| 

In [12]:
data.printSchema()

root
 |-- Ship_name: string (nullable = true)
 |-- Cruise_line: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tonnage: double (nullable = true)
 |-- passengers: double (nullable = true)
 |-- length: double (nullable = true)
 |-- cabins: double (nullable = true)
 |-- passenger_density: double (nullable = true)
 |-- crew: double (nullable = true)



In [13]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [14]:
feat=VectorAssembler(inputCols=["Age","Tonnage","passengers","length","cabins","passenger_density"],outputCol="independent")

In [15]:
output=feat.transform(data)

In [16]:
output.show()

+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------------+
|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|         independent|
+-----------+-----------+---+------------------+----------+------+------+-----------------+----+--------------------+
|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|[6.0,30.276999999...|
|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|[6.0,30.276999999...|
|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|[26.0,47.262,14.8...|
|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|[11.0,110.0,29.74...|
|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|[17.0,101.353,26....|
|    Ecstasy|   Carnival| 22|            70.367|     20.

In [17]:
output.select("independent").show()

+--------------------+
|         independent|
+--------------------+
|[6.0,30.276999999...|
|[6.0,30.276999999...|
|[26.0,47.262,14.8...|
|[11.0,110.0,29.74...|
|[17.0,101.353,26....|
|[22.0,70.367,20.5...|
|[15.0,70.367,20.5...|
|[23.0,70.367,20.5...|
|[19.0,70.367,20.5...|
|[6.0,110.23899999...|
|[10.0,110.0,29.74...|
|[28.0,46.052,14.5...|
|[18.0,70.367,20.5...|
|[17.0,70.367,20.5...|
|[11.0,86.0,21.24,...|
|[8.0,110.0,29.74,...|
|[9.0,88.5,21.24,9...|
|[15.0,70.367,20.5...|
|[12.0,88.5,21.24,...|
|[20.0,70.367,20.5...|
+--------------------+
only showing top 20 rows



In [18]:
output.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew',
 'independent']

In [19]:
final_data=output.select("independent","crew")

In [20]:
final_data.show()

+--------------------+----+
|         independent|crew|
+--------------------+----+
|[6.0,30.276999999...|3.55|
|[6.0,30.276999999...|3.55|
|[26.0,47.262,14.8...| 6.7|
|[11.0,110.0,29.74...|19.1|
|[17.0,101.353,26....|10.0|
|[22.0,70.367,20.5...| 9.2|
|[15.0,70.367,20.5...| 9.2|
|[23.0,70.367,20.5...| 9.2|
|[19.0,70.367,20.5...| 9.2|
|[6.0,110.23899999...|11.5|
|[10.0,110.0,29.74...|11.6|
|[28.0,46.052,14.5...| 6.6|
|[18.0,70.367,20.5...| 9.2|
|[17.0,70.367,20.5...| 9.2|
|[11.0,86.0,21.24,...| 9.3|
|[8.0,110.0,29.74,...|11.6|
|[9.0,88.5,21.24,9...|10.3|
|[15.0,70.367,20.5...| 9.2|
|[12.0,88.5,21.24,...| 9.3|
|[20.0,70.367,20.5...| 9.2|
+--------------------+----+
only showing top 20 rows



In [21]:
train,test=final_data.randomSplit([0.80,0.20])

In [22]:
train.show()

+--------------------+-----+
|         independent| crew|
+--------------------+-----+
|[4.0,220.0,54.0,1...| 21.0|
|[5.0,86.0,21.04,9...|  8.0|
|[5.0,122.0,28.5,1...|  6.7|
|[5.0,133.5,39.59,...|13.13|
|[5.0,160.0,36.34,...| 13.6|
|[6.0,30.276999999...| 3.55|
|[6.0,90.0,20.0,9....|  9.0|
|[6.0,110.23899999...| 11.5|
|[6.0,112.0,38.0,9...| 10.9|
|[6.0,113.0,37.82,...| 12.0|
|[6.0,158.0,43.7,1...| 13.6|
|[7.0,89.6,25.5,9....| 9.87|
|[7.0,116.0,31.0,9...| 12.0|
|[7.0,158.0,43.7,1...| 13.6|
|[8.0,91.0,22.44,9...| 11.0|
|[8.0,110.0,29.74,...| 11.6|
|[9.0,90.09,25.01,...| 8.69|
|[9.0,105.0,27.2,8...|10.68|
|[9.0,110.0,29.74,...| 11.6|
|[9.0,113.0,26.74,...|12.38|
+--------------------+-----+
only showing top 20 rows



In [23]:
test.show()

+--------------------+-----+
|         independent| crew|
+--------------------+-----+
|[5.0,115.0,35.74,...| 12.2|
|[6.0,30.276999999...| 3.55|
|[6.0,93.0,23.94,9...|11.09|
|[8.0,77.499,19.5,...|  9.0|
|[9.0,59.058,17.0,...|  7.4|
|[9.0,81.0,21.44,9...| 10.0|
|[9.0,85.0,19.68,9...| 8.69|
|[9.0,88.5,21.24,9...| 10.3|
|[9.0,113.0,26.74,...|12.38|
|[10.0,68.0,10.8,7...| 6.36|
|[10.0,86.0,21.14,...|  9.2|
|[10.0,110.0,29.74...| 11.6|
|[11.0,91.62700000...|  9.0|
|[11.0,110.0,29.74...| 19.1|
|[12.0,91.0,20.32,...| 9.99|
|[12.0,138.0,31.14...|11.85|
|[13.0,91.0,20.32,...| 9.99|
|[14.0,33.0,4.9,5....| 3.24|
|[15.0,75.33800000...| 13.0|
|[16.0,78.491,24.3...| 7.65|
+--------------------+-----+
only showing top 20 rows



In [24]:
from pyspark.ml.regression import LinearRegression

In [25]:
reg=LinearRegression(featuresCol="independent",labelCol="crew")

In [26]:
reg=reg.fit(train)

In [27]:
reg.coefficients

DenseVector([-0.0066, 0.0165, -0.1248, 0.4187, 0.7043, -0.002])

In [28]:
reg.intercept

-0.596919059520528

In [29]:
train_sum=reg.summary

In [31]:
train_sum.rootMeanSquaredError

0.7135342183789438

In [32]:
train_sum.r2

0.9575128670646698

In [33]:
train_sum.r2adj

0.9553340397346528

In [34]:
pred=reg.evaluate(test)

In [35]:
pred.predictions.show()

+--------------------+-----+------------------+
|         independent| crew|        prediction|
+--------------------+-----+------------------+
|[5.0,115.0,35.74,...| 12.2|11.302259546238174|
|[6.0,30.276999999...| 3.55|3.8996148147663403|
|[6.0,93.0,23.94,9...|11.09|10.304748532385274|
|[8.0,77.499,19.5,...|  9.0| 8.567961770023214|
|[9.0,59.058,17.0,...|  7.4|7.3089323531692605|
|[9.0,81.0,21.44,9...| 10.0| 9.336203773345696|
|[9.0,85.0,19.68,9...| 8.69| 9.050075982191895|
|[9.0,88.5,21.24,9...| 10.3| 9.582766277505204|
|[9.0,113.0,26.74,...|12.38|11.186337538070083|
|[10.0,68.0,10.8,7...| 6.36| 6.168039948749584|
|[10.0,86.0,21.14,...|  9.2| 9.494460279825315|
|[10.0,110.0,29.74...| 11.6|11.822556181890292|
|[11.0,91.62700000...|  9.0| 9.274925193221536|
|[11.0,110.0,29.74...| 19.1| 11.83133647891337|
|[12.0,91.0,20.32,...| 9.99| 9.108452448592697|
|[12.0,138.0,31.14...|11.85|12.864436757014532|
|[13.0,91.0,20.32,...| 9.99|  9.10181568859402|
|[14.0,33.0,4.9,5....| 3.24|3.1799384249

In [36]:
from pyspark.ml.regression import RandomForestRegressor

In [37]:
rf=RandomForestRegressor(featuresCol="independent",labelCol="crew")

In [38]:
rfmodel=rf.fit(train)

In [39]:
predictions=rfmodel.transform(test)

In [41]:
from pyspark.ml.evaluation import RegressionEvaluator # Evaluate Regression model

In [42]:
evaluator=RegressionEvaluator(labelCol="crew",predictionCol="prediction",metricName="r2")
r2=evaluator.evaluate(predictions)
print("r2 on test data=%g"%r2)

r2 on test data=0.807392


In [48]:
from pyspark.ml.regression import DecisionTreeRegressor

In [49]:
dt = DecisionTreeRegressor(featuresCol="independent",labelCol="crew")

In [50]:
dt=dt.fit(train)

In [51]:
dtpred = dt.transform(test)

In [52]:
dtpred.show()

+--------------------+-----+------------------+
|         independent| crew|        prediction|
+--------------------+-----+------------------+
|[5.0,115.0,35.74,...| 12.2|11.619999999999997|
|[6.0,30.276999999...| 3.55|3.9511111111111106|
|[6.0,93.0,23.94,9...|11.09|             11.25|
|[8.0,77.499,19.5,...|  9.0| 9.067222222222222|
|[9.0,59.058,17.0,...|  7.4| 6.536666666666666|
|[9.0,81.0,21.44,9...| 10.0| 9.067222222222222|
|[9.0,85.0,19.68,9...| 8.69| 9.067222222222222|
|[9.0,88.5,21.24,9...| 10.3| 9.067222222222222|
|[9.0,113.0,26.74,...|12.38|11.619999999999997|
|[10.0,68.0,10.8,7...| 6.36|              5.45|
|[10.0,86.0,21.14,...|  9.2| 9.067222222222222|
|[10.0,110.0,29.74...| 11.6|11.619999999999997|
|[11.0,91.62700000...|  9.0| 9.990000000000002|
|[11.0,110.0,29.74...| 19.1|11.619999999999997|
|[12.0,91.0,20.32,...| 9.99| 9.990000000000002|
|[12.0,138.0,31.14...|11.85|11.619999999999997|
|[13.0,91.0,20.32,...| 9.99| 9.990000000000002|
|[14.0,33.0,4.9,5....| 3.24|            

In [53]:
dtpred = dt.transform(train)

In [54]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(dtpred)
print ("Root Mean Squared Error (RMSE) on train data = %g" % rmse)

Root Mean Squared Error (RMSE) on train data = 0.512577


In [55]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(dtpred)
print ("Root Mean Squared Error (RMSE) on train data = %g" % r2)

Root Mean Squared Error (RMSE) on train data = 0.978075


In [56]:
evaluator = RegressionEvaluator(
    labelCol="crew", predictionCol="prediction", metricName="r2")
r2 = evaluator.evaluate(dtpred)
print ("r2 on test data = %g" % r2)

r2 on test data = 0.978075
