# Installing and Setting Up Spark on Google Colab

In [3]:
# Mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')


Mounted at /content/drive


In [4]:
# downloading and installing jvm and spark-3.1.1
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#!wget -q https://www.apache.org/dyn/closer.lua/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!wget -q https://www-us.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!gunzip -c spark-3.1.1-bin-hadoop2.7.tgz | tar xvf -

In [5]:
# Sanity checks on the working directory and the unpacked Spark distribution.
import os

# These two results are evaluated but not displayed (only a cell's last
# expression is rendered).
os.getcwd()
os.listdir()

# Contents of the extracted Spark folder; this is the value the cell shows.
os.listdir("/content/spark-3.1.1-bin-hadoop2.7")

['README.md',
 'sbin',
 'kubernetes',
 'jars',
 'licenses',
 'data',
 'RELEASE',
 'yarn',
 'conf',
 'python',
 'LICENSE',
 'bin',
 'R',
 'examples',
 'NOTICE']

In [6]:
# Export the Java and Spark install locations through the process environment.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop2.7",
})


In [7]:
# installing spark necessary modules
!pip install findspark
!pip install pyspark

Collecting findspark
  Downloading https://files.pythonhosted.org/packages/fc/2d/2e39f9a023479ea798eed4351cd66f163ce61e00c717e03c37109f00c0f2/findspark-1.4.2-py2.py3-none-any.whl
Installing collected packages: findspark
Successfully installed findspark-1.4.2
Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 67kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 12.6MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=eb39f7112f55726fd32f4fec84f73dc4ea301f

In [8]:
# Ask findspark which Spark installation it resolves.
import findspark

spark_home = findspark.find()
spark_home

'/content/spark-3.1.1-bin-hadoop2.7'

In [10]:
# Create a Spark session and time the read of an example CSV file.

# findspark.init() runs before the pyspark import, as in the original ordering.
findspark.init()
from pyspark.sql import SparkSession
from datetime import datetime, timedelta

# Local master, application name "Colab", Spark UI port set to 4050.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Colab")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Wall-clock timestamps taken immediately before and after the read call.
dt_ini = datetime.now()
df_sp = spark.read.csv('/content/drive/MyDrive/orders.csv', header=True)
dt_end = datetime.now()

print(dt_end - dt_ini)


0:00:07.584366


In [11]:
# Preview the first ten rows of the CSV-backed DataFrame as a list of Row objects.
df_sp.head(n=10)


[Row(_c0='0', order_creation_time='2020-02-26 13:29:00', order_id='102523405', device_id=None),
 Row(_c0='1', order_creation_time='2020-02-26 21:13:24', order_id='102641477', device_id=None),
 Row(_c0='2', order_creation_time='2020-02-26 17:04:33', order_id='102563229', device_id=None),
 Row(_c0='3', order_creation_time='2020-02-26 20:07:45', order_id='102622016', device_id=None),
 Row(_c0='4', order_creation_time='2020-02-26 21:19:25', order_id='102643455', device_id=None),
 Row(_c0='5', order_creation_time='2020-02-26 00:08:19', order_id='102452116', device_id='d0bc996f-72d2-4ec2-8f40-d82d81120862'),
 Row(_c0='6', order_creation_time='2020-02-26 10:34:36', order_id='102492697', device_id='2bb11f99-ab21-4628-abe6-b919da8fbf34'),
 Row(_c0='7', order_creation_time='2020-02-26 11:32:47', order_id='102500373', device_id='2aec0e20-e1d8-4323-9b12-f066856488a7'),
 Row(_c0='8', order_creation_time='2020-02-26 11:46:17', order_id='102503173', device_id='37638585-a181-4265-aeb2-d9e284bb30c3'),


In [12]:
# Take the SparkContext from the active session and display it.
sc = spark.sparkContext
sc

In [13]:
# Distribute the integers 1..1000 as an RDD, then report its partition count
# and the first ten elements.
data = range(1, 1001)
rdd = sc.parallelize(data)

print(f"Number of Partitions: {rdd.getNumPartitions()}")
print(f"Action: First element: {rdd.take(10)}")

Number of Partitions: 1
Action: First element: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [14]:
# Read the orders CSV as raw text lines through the SparkContext.
path = '/content/drive/MyDrive/orders.csv'
data = sc.textFile(path)  # rebinds `data`, which previously held a range

# Print the line count, then display the first twenty lines as the cell output.
line_total = data.count()
print(line_total)

data.take(20)


2358


[',order_creation_time,order_id,device_id',
 '0,2020-02-26 13:29:00,102523405,',
 '1,2020-02-26 21:13:24,102641477,',
 '2,2020-02-26 17:04:33,102563229,',
 '3,2020-02-26 20:07:45,102622016,',
 '4,2020-02-26 21:19:25,102643455,',
 '5,2020-02-26 00:08:19,102452116,d0bc996f-72d2-4ec2-8f40-d82d81120862',
 '6,2020-02-26 10:34:36,102492697,2bb11f99-ab21-4628-abe6-b919da8fbf34',
 '7,2020-02-26 11:32:47,102500373,2aec0e20-e1d8-4323-9b12-f066856488a7',
 '8,2020-02-26 11:46:17,102503173,37638585-a181-4265-aeb2-d9e284bb30c3',
 '9,2020-02-26 11:38:40,102501909,53d16c33-5980-4ad8-9032-f85d686d2855',
 '10,2020-02-26 11:57:53,102504038,3c533ccb-8739-49c8-bfee-cf467898fe9d',
 '11,2020-02-26 12:26:11,102510286,82fb1d95-c894-4499-825e-8774d0247087',
 '12,2020-02-26 12:46:31,102514893,7995c021-8983-4f53-902f-2a089944c501',
 '13,2020-02-26 12:52:04,102515379,1cdc8ef5-3752-4200-8b84-3a1df38b195f',
 '14,2020-02-26 13:40:44,102525697,e7dd9fe8-03e8-41ae-abc7-8a4b0b098971',
 '15,2020-02-26 14:03:08,102529774,4

In [15]:
# Register the orders CSV as a SQL table, then query it.
# The DDL is a triple-quoted string so the statement can span lines without
# backslash continuations inside the literal. Those break if any character follows
# the backslash, and they embed long runs of spaces in the SQL text.
# No trailing `;` is included: spark.sql() takes a single statement, so a
# terminator is unnecessary.
# NOTE(review): the value bound to `create_table_sql` is whatever spark.sql()
# returns for the DDL, not SQL text; the name is kept in case other cells use it.
create_table_sql = spark.sql("""
    CREATE TABLE IF NOT EXISTS orders_table
    USING CSV
    OPTIONS (path "/content/drive/MyDrive/orders.csv", header "true", inferSchema "true")
""")

select_table = spark.sql("select * from orders_table")

In [16]:
# Print the query result as a text table (20 rows, long values truncated).
select_table.show(n=20, truncate=True)

+---+-------------------+---------+--------------------+
|_c0|order_creation_time| order_id|           device_id|
+---+-------------------+---------+--------------------+
|  0|2020-02-26 13:29:00|102523405|                null|
|  1|2020-02-26 21:13:24|102641477|                null|
|  2|2020-02-26 17:04:33|102563229|                null|
|  3|2020-02-26 20:07:45|102622016|                null|
|  4|2020-02-26 21:19:25|102643455|                null|
|  5|2020-02-26 00:08:19|102452116|d0bc996f-72d2-4ec...|
|  6|2020-02-26 10:34:36|102492697|2bb11f99-ab21-462...|
|  7|2020-02-26 11:32:47|102500373|2aec0e20-e1d8-432...|
|  8|2020-02-26 11:46:17|102503173|37638585-a181-426...|
|  9|2020-02-26 11:38:40|102501909|53d16c33-5980-4ad...|
| 10|2020-02-26 11:57:53|102504038|3c533ccb-8739-49c...|
| 11|2020-02-26 12:26:11|102510286|82fb1d95-c894-449...|
| 12|2020-02-26 12:46:31|102514893|7995c021-8983-4f5...|
| 13|2020-02-26 12:52:04|102515379|1cdc8ef5-3752-420...|
| 14|2020-02-26 13:40:44|102525

In [17]:
import numpy as np

# Mixing booleans with integers yields integer arrays (True -> 1, False -> 0),
# so the element-wise sum is [4, 5, 2].
lhs = np.array([True, 1, 2])
rhs = np.array([3, 4, False])
lhs + rhs


array([4, 5, 2])

In [None]:
# Stop the Spark session created earlier in this notebook.
spark.stop()