# Spark DataFrame Basic Operations

**Imports**

In [2]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook runs on machines other than the author's; fall back to the
# original local install path when the variable is not set.
findspark.init(os.environ.get('SPARK_HOME', '/home/sedat/spark-3.3.2-bin-hadoop3'))

# pyspark is imported only after findspark.init() has run.
from pyspark.sql import SparkSession

**Creating a SparkSession and reading the CSV file**

In [4]:
# Build a session named 'ops', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('ops')
    .getOrCreate()
)

In [10]:
# Load the stock prices: the first line holds the column names, and Spark is
# asked to infer each column's type from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('appl_stock.csv')
)

In [12]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [11]:
# Preview the data; these arguments are show()'s defaults, written out explicitly.
df.show(n=20, truncate=True, vertical=False)

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [16]:
# head(2) returns a Python list of Row objects; index 0 picks the first row.
first_two_rows = df.head(2)
first_two_rows[0]

Row(Date=datetime.datetime(2010, 1, 4, 0, 0), Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)

**Total number of rows**

In [34]:
# Number of rows in the DataFrame.
df.count()

1762

**Filtering data**

In [18]:
# filter() accepts a SQL-style predicate written as a string.
closes_under_500 = df.filter('Close < 500')
closes_under_500.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [25]:
# Keep rows with Close below 500, then show only the Open column.
# The Column-expression form gives the same result:
#   df.filter(df['Close'] < 500).select('Open').show()
(
    df
    .filter('Close < 500')
    .select('Open')
    .show()
)

+------------------+
|              Open|
+------------------+
|        213.429998|
|        214.599998|
|        214.379993|
|            211.75|
|        210.299994|
|212.79999700000002|
|209.18999499999998|
|        207.870005|
|210.11000299999998|
|210.92999500000002|
|        208.330002|
|        214.910006|
|        212.079994|
|206.78000600000001|
|202.51000200000001|
|205.95000100000001|
|        206.849995|
|        204.930004|
|        201.079996|
|192.36999699999998|
+------------------+
only showing top 20 rows



In [27]:
# Same filter, this time projecting several columns at once.
# The Column-expression form gives the same result:
#   df.filter(df['Close'] < 500).select(['Open', 'Close', 'Volume']).show()
(
    df
    .filter('Close < 500')
    .select('Open', 'Close', 'Volume')
    .show()
)

+------------------+------------------+---------+
|              Open|             Close|   Volume|
+------------------+------------------+---------+
|        213.429998|        214.009998|123432400|
|        214.599998|        214.379993|150476200|
|        214.379993|        210.969995|138040000|
|            211.75|            210.58|119282800|
|        210.299994|211.98000499999998|111902700|
|212.79999700000002|210.11000299999998|115557400|
|209.18999499999998|        207.720001|148614900|
|        207.870005|        210.650002|151473000|
|210.11000299999998|            209.43|108223500|
|210.92999500000002|            205.93|148516900|
|        208.330002|        215.039995|182501900|
|        214.910006|            211.73|153038200|
|        212.079994|        208.069996|152038600|
|206.78000600000001|            197.75|220441900|
|202.51000200000001|        203.070002|266424900|
|205.95000100000001|        205.940001|466777500|
|        206.849995|        207.880005|430642100|


In [32]:
# Two conditions combined with AND inside the SQL string.
# With Column expressions, each comparison is wrapped in its own parentheses
# and the two are joined with `&`:
#   df.filter((df['Close'] < 500) & (df['Open'] > 250)).select(['Open', 'Close', 'Volume']).show()
(
    df
    .filter('Close < 500 AND Open > 250')
    .select(['Open', 'Close', 'Volume'])
    .show()
)

+------------------+------------------+---------+
|              Open|             Close|   Volume|
+------------------+------------------+---------+
|        258.799995|259.22000499999996|245597800|
|        258.240009|266.46999700000003|198356200|
|        267.990005|         270.82999|199238900|
|        271.880001|             269.5|119767200|
|        267.269989|        262.039989|177335900|
|        263.250004|        261.600006|189600600|
|        263.020012|268.64000699999997|139710200|
|        269.310009|        261.090008|135615900|
|        263.840004|266.34999500000004|113585500|
|        262.889996|        258.679993|180954900|
|         253.03001|255.98999799999999|220775800|
|        253.830002|        246.249989|321465200|
|            250.25|253.99000499999997|246076600|
|251.83999599999999|256.51999700000005|212226700|
|        259.240005|        262.090004|163594900|
|         263.21999|        258.360012|149928100|
|        255.159992|        253.819988|189840700|


In [35]:
# Exact-equality predicate on the Low column, built as a Column expression.
low_is_197_16 = df['Low'] == 197.16
df.filter(low_is_197_16).show()

+-------------------+------------------+----------+------+------+---------+---------+
|               Date|              Open|      High|   Low| Close|   Volume|Adj Close|
+-------------------+------------------+----------+------+------+---------+---------+
|2010-01-22 00:00:00|206.78000600000001|207.499996|197.16|197.75|220441900|25.620401|
+-------------------+------------------+----------+------+------+---------+---------+



In [39]:
# collect() brings the matching rows back to the driver as a Python list of Row objects.
result = (
    df
    .filter(df['Low'] == 197.16)
    .collect()
)

In [41]:
# Take the first collected row; this raises IndexError if the filter matched nothing.
row = result[0]

In [44]:
# Convert the Row to a plain dict keyed by column name (recursive=False is the default).
row.asDict(recursive=False)

{'Date': datetime.datetime(2010, 1, 22, 0, 0),
 'Open': 206.78000600000001,
 'High': 207.499996,
 'Low': 197.16,
 'Close': 197.75,
 'Volume': 220441900,
 'Adj Close': 25.620401}

In [45]:
# A Row supports lookup by column name, like a dict.
row['Date']

datetime.datetime(2010, 1, 22, 0, 0)