# Spark typical operations

Initializing Spark using pyspark.sql is so easy.

Remember Spark Session is the window to the Spark World

In [1]:
from pyspark.sql import SparkSession

sc = SparkSession.builder.appName('Hello').getOrCreate()

data_frame = sc.read.csv('creditcard.csv', header=True)
data_frame.select('V19', 'V2', 'V3').show(10)

+-------------------+-------------------+------------------+
|                V19|                 V2|                V3|
+-------------------+-------------------+------------------+
|  0.403992960255733|-0.0727811733098497|  2.53634673796914|
| -0.145783041325259|   0.26615071205963|  0.16648011335321|
|  -2.26185709530414|  -1.34016307473609|  1.77320934263119|
|   -1.2326219700892| -0.185226008082898|  1.79299333957872|
|  0.803486924960175|  0.877736754848451|    1.548717846511|
|-0.0331937877876282|  0.960523044882985|  1.14110934232219|
|-0.0455750446637976|  0.141003507049326|0.0453707735899449|
|  0.324504731321494|   1.41796354547385|   1.0743803763556|
|   0.57032816746536|  0.286157196276544|-0.113192212729871|
|  0.451772964394125|   1.11959337641566|  1.04436655157316|
+-------------------+-------------------+------------------+
only showing top 10 rows



Printing schema 

In [2]:
print(type(data_frame))
data_frame.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- Time: string (nullable = true)
 |-- V1: string (nullable = true)
 |-- V2: string (nullable = true)
 |-- V3: string (nullable = true)
 |-- V4: string (nullable = true)
 |-- V5: string (nullable = true)
 |-- V6: string (nullable = true)
 |-- V7: string (nullable = true)
 |-- V8: string (nullable = true)
 |-- V9: string (nullable = true)
 |-- V10: string (nullable = true)
 |-- V11: string (nullable = true)
 |-- V12: string (nullable = true)
 |-- V13: string (nullable = true)
 |-- V14: string (nullable = true)
 |-- V15: string (nullable = true)
 |-- V16: string (nullable = true)
 |-- V17: string (nullable = true)
 |-- V18: string (nullable = true)
 |-- V19: string (nullable = true)
 |-- V20: string (nullable = true)
 |-- V21: string (nullable = true)
 |-- V22: string (nullable = true)
 |-- V23: string (nullable = true)
 |-- V24: string (nullable = true)
 |-- V25: string (nullable = true)
 |-- V26: string (nullable = true)
 |-- V27: string

Selecting specific columns using the traditional SQL select command


In [3]:
data_frame.select(data_frame['V11'], data_frame['V17']-5.0).show(5)

+------------------+------------------+
|               V11|       (V17 - 5.0)|
+------------------+------------------+
|-0.551599533260813|-4.792028758070758|
|  1.61272666105479|-5.114804663102346|
| 0.624501459424895| -3.89003062130401|
|-0.226487263835401|-5.684092786345479|
|-0.822842877946363|-5.237033239362776|
+------------------+------------------+
only showing top 5 rows



In [4]:
sliced_credit_card = data_frame.select(data_frame['V11'], data_frame['V17']-5.0)
sliced_credit_card.show()

+------------------+-------------------+
|               V11|        (V17 - 5.0)|
+------------------+-------------------+
|-0.551599533260813| -4.792028758070758|
|  1.61272666105479| -5.114804663102346|
| 0.624501459424895|  -3.89003062130401|
|-0.226487263835401| -5.684092786345479|
|-0.822842877946363| -5.237033239362776|
|  1.34126198001957| -5.058132823364013|
| -1.41690724314928| -4.997179487527653|
|-0.619467796121913|   -6.2221273453247|
|-0.705116586646536| -5.499767968800267|
|  1.01761446783262| -5.540979921943059|
|   1.1996439495421| -4.746585284136803|
|-0.259115563735702|-5.8099789259635894|
| 0.227666231237246| -4.126063552385561|
|-0.773656930526689|   -4.8759945848181|
| 0.844555470974377| -5.155868714793874|
|-0.793980602837221| -5.279265373246772|
|-0.450311279515466| -5.009212377727073|
|  0.32409781346169|  -5.92870926272403|
| 0.917229867699146| -5.725480944982201|
|  1.07754241162743| -4.695758581385647|
+------------------+-------------------+
only showing top

In [5]:
type(sliced_credit_card)

pyspark.sql.dataframe.DataFrame

To convert a spark dataframe to  Pandas


In [6]:
import pandas as pd

sliced_credit = sliced_credit_card.toPandas()
type(sliced_credit)

pandas.core.frame.DataFrame

Using the group by function


In [7]:
data_frame.select('V2', 'V3', 'V5', 'V7').orderBy('V3').show()

+------------------+--------------------+------------------+------------------+
|                V2|                  V3|                V5|                V7|
+------------------+--------------------+------------------+------------------+
| -1.38871082819139|-0.00010859127517...| -1.41492072744198|-0.457119649774328|
| -1.84948493639859|-0.00011670159095...| -1.19728040298154|-0.120453321663396|
| 0.186196995304608|-0.00017638604916...|  3.42746540667419| 0.137308571596394|
|-0.417284896874103|-0.00020070463513...|-0.693182937835376|-0.281240772583621|
| 0.414098144629859|-0.00020901858431...|  1.20605337449993|  1.08451867577036|
|-0.197058254955254|-0.00022689493209...|  1.81390525841971|  1.45308792456891|
| 0.230488756164041|-0.00023936616654...| 0.382902132748936|0.0611252613406751|
|-0.157070757658583|-0.00023955256168...| -0.40091068904697|  0.42439026993488|
|-0.646553190215571|-0.00024697440131...|-0.555782412817417| -0.69866094456342|
| 0.839594758150574|-0.00026916160835...

To run SQL queries directly we need to create an SQL temporary view. 

In [8]:
data_frame.createOrReplaceTempView('oganesson')
sql_type = sc.sql('SELECT V2, V3, V5, V11 FROM oganesson')
sql_type.show()


+-------------------+------------------+-------------------+------------------+
|                 V2|                V3|                 V5|               V11|
+-------------------+------------------+-------------------+------------------+
|-0.0727811733098497|  2.53634673796914| -0.338320769942518|-0.551599533260813|
|   0.26615071205963|  0.16648011335321| 0.0600176492822243|  1.61272666105479|
|  -1.34016307473609|  1.77320934263119| -0.503198133318193| 0.624501459424895|
| -0.185226008082898|  1.79299333957872|-0.0103088796030823|-0.226487263835401|
|  0.877736754848451|    1.548717846511| -0.407193377311653|-0.822842877946363|
|  0.960523044882985|  1.14110934232219|   0.42098688077219|  1.34126198001957|
|  0.141003507049326|0.0453707735899449|  0.191880988597645| -1.41690724314928|
|   1.41796354547385|   1.0743803763556|  0.948934094764157|-0.619467796121913|
|  0.286157196276544|-0.113192212729871|    2.6695986595986|-0.705116586646536|
|   1.11959337641566|  1.04436655157316|

In [9]:
type(sql_type)

pyspark.sql.dataframe.DataFrame

In [10]:
sql_changed = sql_type.toPandas()
type(sql_changed)

pandas.core.frame.DataFrame

If you need an RDD, you can use the command below to convert a dataframe to an RDD

In [10]:
rdd1 = data_frame.rdd
type(rdd1)

pyspark.rdd.RDD

In [11]:
rdd1.take(5)

[Row(Time='0', V1='-1.3598071336738', V2='-0.0727811733098497', V3='2.53634673796914', V4='1.37815522427443', V5='-0.338320769942518', V6='0.462387777762292', V7='0.239598554061257', V8='0.0986979012610507', V9='0.363786969611213', V10='0.0907941719789316', V11='-0.551599533260813', V12='-0.617800855762348', V13='-0.991389847235408', V14='-0.311169353699879', V15='1.46817697209427', V16='-0.470400525259478', V17='0.207971241929242', V18='0.0257905801985591', V19='0.403992960255733', V20='0.251412098239705', V21='-0.018306777944153', V22='0.277837575558899', V23='-0.110473910188767', V24='0.0669280749146731', V25='0.128539358273528', V26='-0.189114843888824', V27='0.133558376740387', V28='-0.0210530534538215', Amount='149.62', Class='0'),
 Row(Time='0', V1='1.19185711131486', V2='0.26615071205963', V3='0.16648011335321', V4='0.448154078460911', V5='0.0600176492822243', V6='-0.0823608088155687', V7='-0.0788029833323113', V8='0.0851016549148104', V9='-0.255425128109186', V10='-0.166974414

In [12]:
rdd1.count()


284807

In [38]:
rdd1.persist()

MapPartitionsRDD[42] at javaToPython at NativeMethodAccessorImpl.java:0

In [39]:
rdd1.unpersist()

MapPartitionsRDD[42] at javaToPython at NativeMethodAccessorImpl.java:0