In [None]:
# Setup Spark
# ===============
# Installing Spark needs to be done once each time you re-open this notebook. It should take about 10 seconds.
# ===============
# install java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://dlcdn.apache.org/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz

# unzip the spark file to the current folder
!tar xf spark-3.3.1-bin-hadoop3.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop3"

# install findspark using pip
!pip install -q findspark
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [None]:
# Download the UCI credit-screening dataset; wget saves it as crx.data in the current folder.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data

--2022-11-10 13:17:38--  https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32218 (31K) [application/x-httpd-php]
Saving to: ‘crx.data’


2022-11-10 13:17:39 (203 KB/s) - ‘crx.data’ saved [32218/32218]



In [None]:
# Load crx.data as a CSV without a header row: Spark auto-names the columns
# (_c0, _c1, ...) and infers each column's type from the data.
# NOTE(review): the file uses "?" for missing values, which is presumably why
# numeric-looking columns such as _c1 end up inferred as string — confirm.
dat = spark.read.csv(
    'crx.data',
    header=False,
    inferSchema=True,
)

In [None]:
# Access the RDD that backs the DataFrame.
dat.rdd

AttributeError: ignored

In [None]:
# Redistribute the rows across 4 partitions. From here on `dat` refers to the
# repartitioned DataFrame, so every plan built on it includes this shuffle step.
dat = dat.repartition(4)

In [None]:
# select() is a transformation: it returns a new DataFrame (the cell output is
# just its schema) without computing any rows.
dat.select('*')

DataFrame[_c0: string, _c1: string, _c2: double, _c3: string, _c4: string, _c5: string, _c6: string, _c7: double, _c8: string, _c9: string, _c10: int, _c11: string, _c12: string, _c13: string, _c14: int, _c15: string]

In [None]:
# toPandas() is an action: it runs the query and collects every row to the
# driver as a pandas DataFrame. Fine for this small file; avoid on large data.
dat.select("*").toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15
0,b,18.67,5.000,u,g,q,v,0.375,t,t,2,f,g,00000,38,-
1,b,38.42,0.705,u,g,c,v,0.375,f,t,2,f,g,00225,500,-
2,a,37.33,2.500,u,g,i,h,0.210,f,f,0,f,g,00260,246,-
3,b,38.92,1.750,u,g,k,v,0.500,f,f,0,t,g,00300,2,-
4,b,25.67,2.210,y,p,aa,v,4.000,t,f,0,f,g,00188,0,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,34.83,4.000,u,g,d,bb,12.500,t,f,0,t,g,?,0,-
686,b,?,10.500,u,g,x,v,6.500,t,f,0,f,g,00000,0,+
687,a,41.17,6.500,u,g,q,v,0.500,t,t,3,t,g,00145,0,+
688,b,43.25,3.000,u,g,q,h,6.000,t,t,11,f,g,00080,0,+


In [None]:
# Keep only the rows whose first column (_c0) equals "a", then collect the
# result to the driver as a pandas DataFrame.
dat.filter(dat._c0 == "a").toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15
0,a,37.33,2.500,u,g,i,h,0.210,f,f,0,f,g,00260,246,-
1,a,45.00,4.585,u,g,k,h,1.000,f,f,0,t,s,00240,0,-
2,a,18.92,9.250,y,p,c,v,1.000,t,t,4,t,g,00080,500,+
3,a,22.58,10.750,u,g,q,v,0.415,t,t,5,t,g,00000,560,+
4,a,36.00,1.000,u,g,c,v,2.000,t,t,11,f,g,00000,456,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,a,18.25,10.000,u,g,w,v,1.000,f,t,1,f,g,00120,1,-
206,a,21.25,2.335,u,g,i,bb,0.500,t,t,4,f,s,00080,0,+
207,a,23.25,5.875,u,g,q,v,3.170,t,t,10,f,g,00120,245,+
208,a,52.83,15.000,u,g,c,v,5.500,t,t,14,f,g,00000,2200,+


In [None]:
# NOTE(review): the star import pulls every pyspark.sql.functions name into the
# notebook namespace, including names such as `max` and `sum` that shadow the
# Python builtins. Only `col` is used in the cells visible here; prefer an
# explicit import once the full list of used names is confirmed.
from pyspark.sql.functions import *

In [None]:
# To increase the number of partitions you have to use repartition(), which incurs a shuffle.
# getNumPartitions() reports the partition count of the resulting RDD.
dat.rdd.repartition(4).getNumPartitions()

4

In [None]:
# To decrease the number of partitions you can use coalesce(), which combines
# existing partitions while minimizing shuffles.
dat.rdd.coalesce(2).getNumPartitions()

2

In [None]:
 # Filter using a Column expression built with col(); show() prints the matching rows.
 dat.where(col('_c1') == 30.83).show()

+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+
|_c0|  _c1|_c2|_c3|_c4|_c5|_c6| _c7|_c8|_c9|_c10|_c11|_c12| _c13|_c14|_c15|
+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+
|  b|30.83|0.0|  u|  g|  w|  v|1.25|  t|  t|   1|   f|   g|00202|   0|   +|
+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+



In [None]:
 # Comparing a Column with == evaluates nothing: it builds a new Column
 # expression that Spark applies later, as the repr in the output shows.
 col('_c1') == 30.83

Column<'(_c1 = 30.83)'>

In [None]:
# The same filter, written as a SQL expression string instead of a Column.
dat.where('_c1 = 30.83').show()

+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+
|_c0|  _c1|_c2|_c3|_c4|_c5|_c6| _c7|_c8|_c9|_c10|_c11|_c12| _c13|_c14|_c15|
+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+
|  b|30.83|0.0|  u|  g|  w|  v|1.25|  t|  t|   1|   f|   g|00202|   0|   +|
+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+



In [None]:
# Register the DataFrame as a temporary view named 'datview' so it can be
# referenced by name from SQL, then run the same filter as a full SQL query.
dat.createOrReplaceTempView('datview')
spark.sql('SELECT * FROM datview WHERE _c1 = 30.83').show()

+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+
|_c0|  _c1|_c2|_c3|_c4|_c5|_c6| _c7|_c8|_c9|_c10|_c11|_c12| _c13|_c14|_c15|
+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+
|  b|30.83|0.0|  u|  g|  w|  v|1.25|  t|  t|   1|   f|   g|00202|   0|   +|
+---+-----+---+---+---+---+---+----+---+---+----+----+----+-----+----+----+



In [None]:
# explain(True) prints the logical plans (parsed, analyzed, optimized) in
# addition to the physical plan, showing how Spark rewrites the query.
spark.sql('SELECT max(_c2) FROM datview WHERE _c1 = 30.83 GROUP BY _c3').explain(True)

== Parsed Logical Plan ==
'Aggregate ['_c3], [unresolvedalias('max('_c2), None)]
+- 'Filter ('_c1 = 30.83)
   +- 'UnresolvedRelation [datview], [], false

== Analyzed Logical Plan ==
max(_c2): double
Aggregate [_c3#446], [max(_c2#445) AS max(_c2)#816]
+- Filter (cast(_c1#444 as double) = cast(30.83 as double))
   +- SubqueryAlias datview
      +- View (`datview`, [_c0#443,_c1#444,_c2#445,_c3#446,_c4#447,_c5#448,_c6#449,_c7#450,_c8#451,_c9#452,_c10#453,_c11#454,_c12#455,_c13#456,_c14#457,_c15#458])
         +- Repartition 4, true
            +- Relation [_c0#443,_c1#444,_c2#445,_c3#446,_c4#447,_c5#448,_c6#449,_c7#450,_c8#451,_c9#452,_c10#453,_c11#454,_c12#455,_c13#456,_c14#457,_c15#458] csv

== Optimized Logical Plan ==
Aggregate [_c3#446], [max(_c2#445) AS max(_c2)#816]
+- Repartition 4, true
   +- Project [_c2#445, _c3#446]
      +- Filter (isnotnull(_c1#444) AND (cast(_c1#444 as double) = 30.83))
         +- Relation [_c0#443,_c1#444,_c2#445,_c3#446,_c4#447,_c5#448,_c6#449,_c7#450,_c

In [None]:
 # Print the physical plan of the DataFrame-API filter and of the equivalent SQL
 # query to compare them: both show the same Exchange -> Filter -> FileScan steps.
 dat.where(col('_c1') == 30.83).explain()
spark.sql('SELECT * FROM datview WHERE _c1 = 30.83').explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(4), REPARTITION_BY_NUM, [id=#530]
   +- Filter (isnotnull(_c1#444) AND (cast(_c1#444 as double) = 30.83))
      +- FileScan csv [_c0#443,_c1#444,_c2#445,_c3#446,_c4#447,_c5#448,_c6#449,_c7#450,_c8#451,_c9#452,_c10#453,_c11#454,_c12#455,_c13#456,_c14#457,_c15#458] Batched: false, DataFilters: [isnotnull(_c1#444), (cast(_c1#444 as double) = 30.83)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/content/crx.data], PartitionFilters: [], PushedFilters: [IsNotNull(_c1)], ReadSchema: struct<_c0:string,_c1:string,_c2:double,_c3:string,_c4:string,_c5:string,_c6:string,_c7:double,_c...


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Exchange RoundRobinPartitioning(4), REPARTITION_BY_NUM, [id=#543]
   +- Filter (isnotnull(_c1#444) AND (cast(_c1#444 as double) = 30.83))
      +- FileScan csv [_c0#443,_c1#444,_c2#445,_c3#446,_c4#447,_c5#448,_c6#449,_c7#450,_c8#451,_c9#452,_c10#453,_c11#