# Chapter 4 : Spark Dataframes and Operations Code

Create Dataframe Operations

In [0]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

data_df = spark.createDataFrame([
    Row(col_1=100, col_2=200., col_3='string_test_1', col_4=date(2023, 1, 1), col_5=datetime(2023, 1, 1, 12, 0)),
    Row(col_1=200, col_2=300., col_3='string_test_2', col_4=date(2023, 2, 1), col_5=datetime(2023, 1, 2, 12, 0)),
    Row(col_1=400, col_2=500., col_3='string_test_3', col_4=date(2023, 3, 1), col_5=datetime(2023, 1, 3, 12, 0))
])


In [0]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

data_df = spark.createDataFrame([
    Row(col_1=100, col_2=200., col_3='string_test_1', col_4=date(2023, 1, 1), col_5=datetime(2023, 1, 1, 12, 0)),
    Row(col_1=200, col_2=300., col_3='string_test_2', col_4=date(2023, 2, 1), col_5=datetime(2023, 1, 2, 12, 0)),
    Row(col_1=400, col_2=500., col_3='string_test_3', col_4=date(2023, 3, 1), col_5=datetime(2023, 1, 3, 12, 0))
], schema=' col_1 long, col_2 double, col_3 string, col_4 date, col_5 timestamp')


In [0]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

pandas_df = pd.DataFrame({
    'col_1': [100, 200, 400],
    'col_2': [200., 300., 500.],
    'col_3': ['string_test_1', 'string_test_2', 'string_test_3'],
    'col_4': [date(2023, 1, 1), date(2023, 2, 1), date(2023, 3, 1)],
    'col_5': [datetime(2023, 1, 1, 12, 0), datetime(2023, 1, 2, 12, 0), datetime(2023, 1, 3, 12, 0)]
})
df = spark.createDataFrame(pandas_df)


In [0]:
import pandas as pd
from datetime import datetime, date
from pyspark.sql import Row

rdd = spark.sparkContext.parallelize([
    (100, 200., 'string_test_1', date(2023, 1, 1), datetime(2023, 1, 1, 12, 0)),
    (200, 300., 'string_test_2', date(2023, 2, 1), datetime(2023, 1, 2, 12, 0)),
    (300, 400., 'string_test_3', date(2023, 3, 1), datetime(2023, 1, 3, 12, 0))
])
data_df = spark.createDataFrame(rdd, schema=['col_1', 'col_2', 'col_3', 'col_4', 'col_5'])


How to View the Dataframes

In [0]:
data_df.show()

+-----+-----+-------------+----------+-------------------+
|col_1|col_2|        col_3|     col_4|              col_5|
+-----+-----+-------------+----------+-------------------+
|  100|200.0|string_test_1|2023-01-01|2023-01-01 12:00:00|
|  200|300.0|string_test_2|2023-02-01|2023-01-02 12:00:00|
|  300|400.0|string_test_3|2023-03-01|2023-01-03 12:00:00|
+-----+-----+-------------+----------+-------------------+



In [0]:
data_df.show(2)

+-----+-----+-------------+----------+-------------------+
|col_1|col_2|        col_3|     col_4|              col_5|
+-----+-----+-------------+----------+-------------------+
|  100|200.0|string_test_1|2023-01-01|2023-01-01 12:00:00|
|  200|300.0|string_test_2|2023-02-01|2023-01-02 12:00:00|
+-----+-----+-------------+----------+-------------------+
only showing top 2 rows



In [0]:
data_df.printSchema()

root
 |-- col_1: long (nullable = true)
 |-- col_2: double (nullable = true)
 |-- col_3: string (nullable = true)
 |-- col_4: date (nullable = true)
 |-- col_5: timestamp (nullable = true)



In [0]:
data_df.show(1, vertical=True)

-RECORD 0--------------------
 col_1 | 100                 
 col_2 | 200.0               
 col_3 | string_test_1       
 col_4 | 2023-01-01          
 col_5 | 2023-01-01 12:00:00 
only showing top 1 row



In [0]:
data_df.columns

['col_1', 'col_2', 'col_3', 'col_4', 'col_5']

In [0]:
data_df.count()

3

In [0]:
data_df.select('col_1', 'col_2', 'col_3').describe().show()

+-------+-----+-----+-------------+
|summary|col_1|col_2|        col_3|
+-------+-----+-----+-------------+
|  count|    3|    3|            3|
|   mean|200.0|300.0|         NULL|
| stddev|100.0|100.0|         NULL|
|    min|  100|200.0|string_test_1|
|    max|  300|400.0|string_test_3|
+-------+-----+-----+-------------+



In [0]:
data_df.collect()

[Row(col_1=100, col_2=200.0, col_3='string_test_1', col_4=datetime.date(2023, 1, 1), col_5=datetime.datetime(2023, 1, 1, 12, 0)),
 Row(col_1=200, col_2=300.0, col_3='string_test_2', col_4=datetime.date(2023, 2, 1), col_5=datetime.datetime(2023, 1, 2, 12, 0)),
 Row(col_1=400, col_2=500.0, col_3='string_test_3', col_4=datetime.date(2023, 3, 1), col_5=datetime.datetime(2023, 1, 3, 12, 0))]

In [0]:
data_df.take(1)

[Row(col_1=100, col_2=200.0, col_3='string_test_1', col_4=datetime.date(2023, 1, 1), col_5=datetime.datetime(2023, 1, 1, 12, 0))]

In [0]:
data_df.tail(1)

[Row(col_1=400, col_2=500.0, col_3='string_test_3', col_4=datetime.date(2023, 3, 1), col_5=datetime.datetime(2023, 1, 3, 12, 0))]

In [0]:
data_df.head(1)

[Row(col_1=100, col_2=200.0, col_3='string_test_1', col_4=datetime.date(2023, 1, 1), col_5=datetime.datetime(2023, 1, 1, 12, 0))]

In [0]:
#  error here
data_df.toPandas()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5
0,100,200.0,string_test_1,2023-01-01,2023-01-01 12:00:00
1,200,300.0,string_test_2,2023-02-01,2023-01-02 12:00:00
2,400,500.0,string_test_3,2023-03-01,2023-01-03 12:00:00


How to do Data Manipulation - Rows and Columns

In [0]:
from pyspark.sql import Column

data_df.select(data_df.col_3).show()


+-------------+
|        col_3|
+-------------+
|string_test_1|
|string_test_2|
|string_test_3|
+-------------+



In [0]:
from pyspark.sql import functions as F
data_df = data_df.withColumn("col_6", F.lit("A"))
data_df.show()


+-----+-----+-------------+----------+-------------------+-----+
|col_1|col_2|        col_3|     col_4|              col_5|col_6|
+-----+-----+-------------+----------+-------------------+-----+
|  100|200.0|string_test_1|2023-01-01|2023-01-01 12:00:00|    A|
|  200|300.0|string_test_2|2023-02-01|2023-01-02 12:00:00|    A|
|  400|500.0|string_test_3|2023-03-01|2023-01-03 12:00:00|    A|
+-----+-----+-------------+----------+-------------------+-----+



In [0]:
data_df = data_df.drop("col_5")
data_df.show()


+-----+-----+-------------+----------+-----+
|col_1|col_2|        col_3|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
|  400|500.0|string_test_3|2023-03-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
data_df.withColumn("col_2", F.col("col_2") / 100).show()

+-----+-----+-------------+----------+-----+
|col_1|col_2|        col_3|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|  2.0|string_test_1|2023-01-01|    A|
|  200|  3.0|string_test_2|2023-02-01|    A|
|  400|  5.0|string_test_3|2023-03-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
data_df = data_df.withColumnRenamed("col_3", "string_col")
data_df.show()


+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
|  400|500.0|string_test_3|2023-03-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
data_df.select("col_6").distinct().show()

+-----+
|col_6|
+-----+
|    A|
+-----+



In [0]:
data_df.select(F.countDistinct("col_6").alias("Total_Unique")).show()

+------------+
|Total_Unique|
+------------+
|           1|
+------------+



In [0]:
from pyspark.sql.functions import upper

data_df.withColumn('upper_string_col', upper(data_df.string_col)).show()


+-----+-----+-------------+----------+-----+----------------+
|col_1|col_2|   string_col|     col_4|col_6|upper_string_col|
+-----+-----+-------------+----------+-----+----------------+
|  100|200.0|string_test_1|2023-01-01|    A|   STRING_TEST_1|
|  200|300.0|string_test_2|2023-02-01|    A|   STRING_TEST_2|
|  400|500.0|string_test_3|2023-03-01|    A|   STRING_TEST_3|
+-----+-----+-------------+----------+-----+----------------+



In [0]:
data_df.filter(data_df.col_1 == 100).show()

+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
data_df.filter((data_df.col_1 == 100)
		& (data_df.col_6 == 'A')).show()


+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
data_df.filter((data_df.col_1 == 100)
		| (data_df.col_2 == 300.00)).show()


+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
list = [100, 200]
data_df.filter(data_df.col_1.isin(list)).show()


+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
#  error here
from pyspark.sql.functions import col
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType

data_df_2 = data_df.withColumn("col_4",col("col_4").cast(StringType())) \
    .withColumn("col_1",col("col_1").cast(IntegerType()))
data_df_2.printSchema()
data_df.show()



root
 |-- col_1: integer (nullable = true)
 |-- col_2: double (nullable = true)
 |-- string_col: string (nullable = true)
 |-- col_4: string (nullable = true)
 |-- col_6: string (nullable = false)

+-----+-----+-------------+----------+-----+
|col_1|col_2|   string_col|     col_4|col_6|
+-----+-----+-------------+----------+-----+
|  100|200.0|string_test_1|2023-01-01|    A|
|  200|300.0|string_test_2|2023-02-01|    A|
|  400|500.0|string_test_3|2023-03-01|    A|
+-----+-----+-------------+----------+-----+



In [0]:
data_df_3 = data_df_2.selectExpr("cast(col_4 as date) col_4",
    "cast(col_1 as long) col_1")
data_df_3.printSchema()


root
 |-- col_4: date (nullable = true)
 |-- col_1: long (nullable = true)



In [0]:
data_df_3.createOrReplaceTempView("CastExample")
data_df_4 = spark.sql("SELECT DOUBLE(col_1), DATE(col_4) from CastExample")
data_df_4.printSchema()
data_df_4.show(truncate=False)


root
 |-- col_1: double (nullable = true)
 |-- col_4: date (nullable = true)

+-----+----------+
|col_1|col_4     |
+-----+----------+
|100.0|2023-01-01|
|200.0|2023-02-01|
|400.0|2023-03-01|
+-----+----------+



In [0]:
salary_data = [("John", "Field-eng", 3500), 
    ("Michael", "Field-eng", 4500), 
    ("Robert", None, 4000), 
    ("Maria", "Finance", 3500), 
    ("John", "Sales", 3000), 
    ("Kelly", "Finance", 3500), 
    ("Kate", "Finance", 3000), 
    ("Martin", None, 3500), 
    ("Kiran", "Sales", 2200), 
    ("Michael", "Field-eng", 4500) 
  ]
columns= ["Employee", "Department", "Salary"]
salary_data = spark.createDataFrame(data = salary_data, schema = columns)
salary_data.printSchema()
salary_data.show()


root
 |-- Employee: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|    John| Field-eng|  3500|
| Michael| Field-eng|  4500|
|  Robert|      NULL|  4000|
|   Maria|   Finance|  3500|
|    John|     Sales|  3000|
|   Kelly|   Finance|  3500|
|    Kate|   Finance|  3000|
|  Martin|      NULL|  3500|
|   Kiran|     Sales|  2200|
| Michael| Field-eng|  4500|
+--------+----------+------+



In [0]:
salary_data.dropna().show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|    John| Field-eng|  3500|
| Michael| Field-eng|  4500|
|   Maria|   Finance|  3500|
|    John|     Sales|  3000|
|   Kelly|   Finance|  3500|
|    Kate|   Finance|  3000|
|   Kiran|     Sales|  2200|
| Michael| Field-eng|  4500|
+--------+----------+------+



In [0]:
new_salary_data = salary_data.dropDuplicates().show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|    John| Field-eng|  3500|
| Michael| Field-eng|  4500|
|  Robert|      NULL|  4000|
|    John|     Sales|  3000|
|   Maria|   Finance|  3500|
|    Kate|   Finance|  3000|
|   Kelly|   Finance|  3500|
|   Kiran|     Sales|  2200|
|  Martin|      NULL|  3500|
+--------+----------+------+



Using Aggregrates in a Dataframe

In [0]:
from pyspark.sql.functions import countDistinct, avg
salary_data.select(avg('Salary')).show()


+-----------+
|avg(Salary)|
+-----------+
|     3520.0|
+-----------+



In [0]:
salary_data.agg({'Salary':'count'}).show()

+-------------+
|count(Salary)|
+-------------+
|           10|
+-------------+



In [0]:
salary_data.select(countDistinct("Salary").alias("Distinct Salary")).show()

+---------------+
|Distinct Salary|
+---------------+
|              5|
+---------------+



In [0]:
salary_data.agg({'Salary':'max'}).show() 

+-----------+
|max(Salary)|
+-----------+
|       4500|
+-----------+



In [0]:
salary_data.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      35200|
+-----------+



In [0]:
salary_data.orderBy("Salary").show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
|   Kiran|     Sales|  2200|
|    John|     Sales|  3000|
|    Kate|   Finance|  3000|
|    John| Field-eng|  3500|
|   Maria|   Finance|  3500|
|   Kelly|   Finance|  3500|
|  Martin|      NULL|  3500|
|  Robert|      NULL|  4000|
| Michael| Field-eng|  4500|
| Michael| Field-eng|  4500|
+--------+----------+------+



In [0]:
salary_data.orderBy(salary_data["Salary"].desc()).show()

+--------+----------+------+
|Employee|Department|Salary|
+--------+----------+------+
| Michael| Field-eng|  4500|
| Michael| Field-eng|  4500|
|  Robert|      NULL|  4000|
|    John| Field-eng|  3500|
|   Maria|   Finance|  3500|
|   Kelly|   Finance|  3500|
|  Martin|      NULL|  3500|
|    John|     Sales|  3000|
|    Kate|   Finance|  3000|
|   Kiran|     Sales|  2200|
+--------+----------+------+



In [0]:
salary_data.groupby('Department')

GroupedData[grouping expressions: [Department], value: [Employee: string, Department: string, Salary: bigint], type: GroupBy]

In [0]:
salary_data.groupby('Department').avg().show()

+----------+------------------+
|Department|       avg(Salary)|
+----------+------------------+
| Field-eng| 4166.666666666667|
|     Sales|            2600.0|
|      NULL|            3750.0|
|   Finance|3333.3333333333335|
+----------+------------------+



In [0]:
# error here
from pyspark.sql.functions import col, round

salary_data.groupBy('Department')\
  .sum('Salary')\
  .withColumn('sum(Salary)',round(col('sum(Salary)'), 2))\
  .withColumnRenamed('sum(Salary)', 'Salary')\
  .orderBy('Department')\
  .show()


[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-4145428256909246>, line 3[0m
[1;32m      1[0m salary_data[38;5;241m.[39mgroupBy([38;5;124m'[39m[38;5;124mDepartment[39m[38;5;124m'[39m)\
[1;32m      2[0m   [38;5;241m.[39msum([38;5;124m'[39m[38;5;124mSalary[39m[38;5;124m'[39m)\
[0;32m----> 3[0m   [38;5;241m.[39mwithColumn([38;5;124m'[39m[38;5;124msum(Salary)[39m[38;5;124m'[39m,[38;5;28;43mround[39;49m[43m([49m[43mcol[49m[43m([49m[38;5;124;43m'[39;49m[38;5;124;43msum(Salary)[39;49m[38;5;124;43m'[39;49m[43m)[49m[43m,[49m[43m [49m[38;5;241;43m2[39;49m[43m)[49m)\
[1;32m      4[0m   [38;5;241m.[39mwithColumnRenamed([38;5;124m'[39m[38;5;124msum(Salary)[39m[38;5;124m'[39m, [38;5;124m'[39m[38;5;124mSalary[39m[38;5;124m'[39m)\
[1;32m      5[0m   [38;5;241m.[39morderBy([38;5;124m

In [0]:
salary_data_with_id = [(1, "John", "Field-eng", 3500), \
    (2, "Robert", "Sales", 4000), \
    (3, "Maria", "Finance", 3500), \
    (4, "Michael", "Sales", 3000), \
    (5, "Kelly", "Finance", 3500), \
    (6, "Kate", "Finance", 3000), \
    (7, "Martin", "Finance", 3500), \
    (8, "Kiran", "Sales", 2200), \
  ]
columns= ["ID", "Employee", "Department", "Salary"]
salary_data_with_id = spark.createDataFrame(data = salary_data_with_id, schema = columns)
salary_data_with_id.show()


+---+--------+----------+------+
| ID|Employee|Department|Salary|
+---+--------+----------+------+
|  1|    John| Field-eng|  3500|
|  2|  Robert|     Sales|  4000|
|  3|   Maria|   Finance|  3500|
|  4| Michael|     Sales|  3000|
|  5|   Kelly|   Finance|  3500|
|  6|    Kate|   Finance|  3000|
|  7|  Martin|   Finance|  3500|
|  8|   Kiran|     Sales|  2200|
+---+--------+----------+------+



In [0]:
employee_data = [(1, "NY", "M"), \
    (2, "NC", "M"), \
    (3, "NY", "F"), \
    (4, "TX", "M"), \
    (5, "NY", "F"), \
    (6, "AZ", "F") \
  ]
columns= ["ID", "State", "Gender"]
employee_data = spark.createDataFrame(data = employee_data, schema = columns)
employee_data.show()


+---+-----+------+
| ID|State|Gender|
+---+-----+------+
|  1|   NY|     M|
|  2|   NC|     M|
|  3|   NY|     F|
|  4|   TX|     M|
|  5|   NY|     F|
|  6|   AZ|     F|
+---+-----+------+



In [0]:
salary_data_with_id.join(employee_data,salary_data_with_id.ID ==  employee_data.ID,"inner").show()

+---+--------+----------+------+---+-----+------+
| ID|Employee|Department|Salary| ID|State|Gender|
+---+--------+----------+------+---+-----+------+
|  1|    John| Field-eng|  3500|  1|   NY|     M|
|  2|  Robert|     Sales|  4000|  2|   NC|     M|
|  3|   Maria|   Finance|  3500|  3|   NY|     F|
|  4| Michael|     Sales|  3000|  4|   TX|     M|
|  5|   Kelly|   Finance|  3500|  5|   NY|     F|
|  6|    Kate|   Finance|  3000|  6|   AZ|     F|
+---+--------+----------+------+---+-----+------+



In [0]:
salary_data_with_id.join(employee_data,salary_data_with_id.ID ==  employee_data.ID,"outer").show()

+---+--------+----------+------+----+-----+------+
| ID|Employee|Department|Salary|  ID|State|Gender|
+---+--------+----------+------+----+-----+------+
|  1|    John| Field-eng|  3500|   1|   NY|     M|
|  2|  Robert|     Sales|  4000|   2|   NC|     M|
|  3|   Maria|   Finance|  3500|   3|   NY|     F|
|  4| Michael|     Sales|  3000|   4|   TX|     M|
|  5|   Kelly|   Finance|  3500|   5|   NY|     F|
|  6|    Kate|   Finance|  3000|   6|   AZ|     F|
|  7|  Martin|   Finance|  3500|NULL| NULL|  NULL|
|  8|   Kiran|     Sales|  2200|NULL| NULL|  NULL|
+---+--------+----------+------+----+-----+------+



In [0]:
salary_data_with_id.join(employee_data,salary_data_with_id.ID ==  employee_data.ID,"left").show()

+---+--------+----------+------+----+-----+------+
| ID|Employee|Department|Salary|  ID|State|Gender|
+---+--------+----------+------+----+-----+------+
|  1|    John| Field-eng|  3500|   1|   NY|     M|
|  2|  Robert|     Sales|  4000|   2|   NC|     M|
|  3|   Maria|   Finance|  3500|   3|   NY|     F|
|  4| Michael|     Sales|  3000|   4|   TX|     M|
|  5|   Kelly|   Finance|  3500|   5|   NY|     F|
|  6|    Kate|   Finance|  3000|   6|   AZ|     F|
|  7|  Martin|   Finance|  3500|NULL| NULL|  NULL|
|  8|   Kiran|     Sales|  2200|NULL| NULL|  NULL|
+---+--------+----------+------+----+-----+------+



In [0]:
salary_data_with_id.join(employee_data,salary_data_with_id.ID ==  employee_data.ID,"right").show()

+---+--------+----------+------+---+-----+------+
| ID|Employee|Department|Salary| ID|State|Gender|
+---+--------+----------+------+---+-----+------+
|  1|    John| Field-eng|  3500|  1|   NY|     M|
|  2|  Robert|     Sales|  4000|  2|   NC|     M|
|  3|   Maria|   Finance|  3500|  3|   NY|     F|
|  4| Michael|     Sales|  3000|  4|   TX|     M|
|  5|   Kelly|   Finance|  3500|  5|   NY|     F|
|  6|    Kate|   Finance|  3000|  6|   AZ|     F|
+---+--------+----------+------+---+-----+------+



In [0]:
salary_data_with_id_2 = [(1, "John", "Field-eng", 3500), \
    (2, "Robert", "Sales", 4000), \
    (3, "Aliya", "Finance", 3500), \
    (4, "Nate", "Sales", 3000), \
  ]
columns2= ["ID", "Employee", "Department", "Salary"]

salary_data_with_id_2 = spark.createDataFrame(data = salary_data_with_id_2, schema = columns2)

salary_data_with_id_2.printSchema()
salary_data_with_id_2.show(truncate=False)



root
 |-- ID: long (nullable = true)
 |-- Employee: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: long (nullable = true)

+---+--------+----------+------+
|ID |Employee|Department|Salary|
+---+--------+----------+------+
|1  |John    |Field-eng |3500  |
|2  |Robert  |Sales     |4000  |
|3  |Aliya   |Finance   |3500  |
|4  |Nate    |Sales     |3000  |
+---+--------+----------+------+



In [0]:
unionDF = salary_data_with_id.union(salary_data_with_id_2)
unionDF.show(truncate=False)


+---+--------+----------+------+
|ID |Employee|Department|Salary|
+---+--------+----------+------+
|1  |John    |Field-eng |3500  |
|2  |Robert  |Sales     |4000  |
|3  |Maria   |Finance   |3500  |
|4  |Michael |Sales     |3000  |
|5  |Kelly   |Finance   |3500  |
|6  |Kate    |Finance   |3000  |
|7  |Martin  |Finance   |3500  |
|8  |Kiran   |Sales     |2200  |
|1  |John    |Field-eng |3500  |
|2  |Robert  |Sales     |4000  |
|3  |Aliya   |Finance   |3500  |
|4  |Nate    |Sales     |3000  |
+---+--------+----------+------+



Reading and Writing Data

In [0]:
# error here
salary_data_with_id.write.csv('salary_data.csv', header=True)
spark.read.csv('/salary_data.csv', header=True).show()


2024-05-05 00:16:22,603 269964 ERROR _handle_rpc_error GRPC Error received
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/sql/connect/client/core.py", line 1543, in _execute_and_fetch_as_iterator
    for b in generator:
  File "/usr/lib/python3.10/_collections_abc.py", line 330, in __next__
    return self.send(None)
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 135, in send
    if not self._has_next():
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 196, in _has_next
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 168, in _has_next
    self._current = self._call_iter(
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 288, in _call_iter
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 271, in _call_iter
    return iter_fun()
  File "/databricks/spark/python/pyspark/sql/connect/clien

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-4145428256909257>, line 2[0m
[1;32m      1[0m salary_data_with_id[38;5;241m.[39mwrite[38;5;241m.[39mcsv([38;5;124m'[39m[38;5;124msalary_data.csv[39m[38;5;124m'[39m, header[38;5;241m=[39m[38;5;28;01mTrue[39;00m)
[0;32m----> 2[0m spark[38;5;241m.[39mread[38;5;241m.[39mcsv([38;5;124m'[39m[38;5;124msalary_data.csv[39m[38;5;124m'[39m, header[38;5;241m=[39m[38;5;28;01mTrue[39;00m)[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/sql/connect/dataframe.py:1158[0m, in [0;36mDataFrame.show[0;34m(self, n, truncate, vertical)[0m
[1;32m   1157[0m [38;5;28;01mdef[39;00m [38;5;21mshow[39m([38;5;28mself[39m, n: [38;5;28mint[39m [38;5;241m=[39m [38;5;241m20[39m, truncate: Union[[38;5;28mbool[39m, [38;5;28mint[39m] [38;5;241m=[39m [38;

In [0]:
from pyspark.sql.types import *

filePath = 'salary_data.csv'
columns= ["ID", "State", "Gender"] 
schema = StructType([
      StructField("ID", IntegerType(),True),
  StructField("State",  StringType(),True),
  StructField("Gender",  StringType(),True)
])
 
read_data = spark.read.format("csv").option("header","true").schema(schema).load(filePath)
read_data.show()


2024-05-05 00:18:27,594 269964 ERROR _handle_rpc_error GRPC Error received
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/sql/connect/client/core.py", line 1543, in _execute_and_fetch_as_iterator
    for b in generator:
  File "/usr/lib/python3.10/_collections_abc.py", line 330, in __next__
    return self.send(None)
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 135, in send
    if not self._has_next():
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 196, in _has_next
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 168, in _has_next
    self._current = self._call_iter(
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 288, in _call_iter
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 271, in _call_iter
    return iter_fun()
  File "/databricks/spark/python/pyspark/sql/connect/clien

Error in callback <bound method UserNamespaceCommandHook.post_run_cell of <dbruntime.DatasetInfo.UserNamespaceCommandHook object at 0x7fe41c339c00>> (for post_run_cell):


2024-05-05 00:18:27,888 269964 ERROR _handle_rpc_error GRPC Error received
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/sql/connect/client/core.py", line 1389, in _analyze
    resp = self._stub.AnalyzePlan(req, metadata=self._builder.metadata())
  File "/databricks/python/lib/python3.10/site-packages/grpc/_channel.py", line 946, in __call__
    return _end_unary_response_blocking(state, call, False, None)
  File "/databricks/python/lib/python3.10/site-packages/grpc/_channel.py", line 849, in _end_unary_response_blocking
    raise _InactiveRpcError(state)
grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.INTERNAL
	details = "Path must be absolute: salary_data.csv"
	debug_error_string = "UNKNOWN:Error received from peer unix:/databricks/sparkconnect/grpc.sock {grpc_message:"Path must be absolute: salary_data.csv", grpc_status:13, created_time:"2024-05-05T00:18:27.888074753+00:00"}"
>


[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m/databricks/python_shell/dbruntime/DatasetInfo.py:22[0m, in [0;36mUserNamespaceCommandHook.post_run_cell[0;34m(self, result)[0m
[1;32m     21[0m [38;5;28;01mdef[39;00m [38;5;21mpost_run_cell[39m([38;5;28mself[39m, result):
[0;32m---> 22[0m     new_dataframe_info [38;5;241m=[39m [38;5;28;43mself[39;49m[38;5;241;43m.[39;49m[43muser_ns[49m[38;5;241;43m.[39;49m[43mget_new_dataframe_infos_json[49m[43m([49m[43m)[49m
[1;32m     23[0m     [38;5;28;01mif[39;00m new_dataframe_info:
[1;32m     24[0m         data [38;5;241m=[39m {[38;5;124m"[39m[38;5;124mapplication/vnd.databricks.v1+datasetInfo[39m[38;5;124m"[39m: new_dataframe_info}

File [0;32m/databricks/python_shell/dbruntime/DatasetInfo.py:135[0m, in [0;36mUserNamespaceDict.get_new_dataframe_infos_json[0;34m(sel

In [0]:
salary_data_with_id.write.parquet('salary_data.parquet')
spark.read.parquet('salary_data.parquet').show()


2024-05-05 00:18:50,805 269964 ERROR _handle_rpc_error GRPC Error received
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/sql/connect/client/core.py", line 1543, in _execute_and_fetch_as_iterator
    for b in generator:
  File "/usr/lib/python3.10/_collections_abc.py", line 330, in __next__
    return self.send(None)
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 135, in send
    if not self._has_next():
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 196, in _has_next
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 168, in _has_next
    self._current = self._call_iter(
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 288, in _call_iter
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 271, in _call_iter
    return iter_fun()
  File "/databricks/spark/python/pyspark/sql/connect/clien

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-4145428256909259>, line 2[0m
[1;32m      1[0m salary_data_with_id[38;5;241m.[39mwrite[38;5;241m.[39mparquet([38;5;124m'[39m[38;5;124msalary_data.parquet[39m[38;5;124m'[39m)
[0;32m----> 2[0m spark[38;5;241m.[39mread[38;5;241m.[39mparquet([38;5;124m'[39m[38;5;124msalary_data.parquet[39m[38;5;124m'[39m)[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/sql/connect/dataframe.py:1158[0m, in [0;36mDataFrame.show[0;34m(self, n, truncate, vertical)[0m
[1;32m   1157[0m [38;5;28;01mdef[39;00m [38;5;21mshow[39m([38;5;28mself[39m, n: [38;5;28mint[39m [38;5;241m=[39m [38;5;241m20[39m, truncate: Union[[38;5;28mbool[39m, [38;5;28mint[39m] [38;5;241m=[39m [38;5;28;01mTrue[39;00m, vertical: [38;5;28mbool[39m [38;5;241m=[39m [38;5;28;01mF

In [0]:
salary_data_with_id.write.orc('salary_data.orc')
spark.read.orc('salary_data.orc').show()

2024-05-05 00:20:59,682 269964 ERROR _handle_rpc_error GRPC Error received
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/sql/connect/client/core.py", line 1543, in _execute_and_fetch_as_iterator
    for b in generator:
  File "/usr/lib/python3.10/_collections_abc.py", line 330, in __next__
    return self.send(None)
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 135, in send
    if not self._has_next():
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 196, in _has_next
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 168, in _has_next
    self._current = self._call_iter(
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 288, in _call_iter
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 271, in _call_iter
    return iter_fun()
  File "/databricks/spark/python/pyspark/sql/connect/clien

[0;31m---------------------------------------------------------------------------[0m
[0;31mIllegalArgumentException[0m                  Traceback (most recent call last)
File [0;32m<command-4145428256909260>, line 2[0m
[1;32m      1[0m salary_data_with_id[38;5;241m.[39mwrite[38;5;241m.[39morc([38;5;124m'[39m[38;5;124msalary_data.orc[39m[38;5;124m'[39m)
[0;32m----> 2[0m spark[38;5;241m.[39mread[38;5;241m.[39morc([38;5;124m'[39m[38;5;124msalary_data.orc[39m[38;5;124m'[39m)[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/sql/connect/dataframe.py:1158[0m, in [0;36mDataFrame.show[0;34m(self, n, truncate, vertical)[0m
[1;32m   1157[0m [38;5;28;01mdef[39;00m [38;5;21mshow[39m([38;5;28mself[39m, n: [38;5;28mint[39m [38;5;241m=[39m [38;5;241m20[39m, truncate: Union[[38;5;28mbool[39m, [38;5;28mint[39m] [38;5;241m=[39m [38;5;28;01mTrue[39;00m, vertical: [38;5;28mbool[39m [38;5;241m=[39m [38;5;28;01mFalse[39;00m) [

In [0]:
salary_data_with_id.write.format("delta").save("/FileStore/tables/salary_data_with_id")
df = spark.read.load("/FileStore/tables/salary_data_with_id")
df.show()


2024-05-05 00:20:49,787 269964 ERROR _handle_rpc_error GRPC Error received
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/sql/connect/client/core.py", line 1543, in _execute_and_fetch_as_iterator
    for b in generator:
  File "/usr/lib/python3.10/_collections_abc.py", line 330, in __next__
    return self.send(None)
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 135, in send
    if not self._has_next():
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 196, in _has_next
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 168, in _has_next
    self._current = self._call_iter(
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 288, in _call_iter
    raise e
  File "/databricks/spark/python/pyspark/sql/connect/client/reattach.py", line 271, in _call_iter
    return iter_fun()
  File "/databricks/spark/python/pyspark/sql/connect/clien

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4145428256909265>, line 1[0m
[0;32m----> 1[0m salary_data_with_id[38;5;241m.[39mwrite[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mdelta[39m[38;5;124m"[39m)[38;5;241m.[39msave([38;5;124m"[39m[38;5;124m/FileStore/tables/salary_data_with_id[39m[38;5;124m"[39m)
[1;32m      2[0m df [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mload([38;5;124m"[39m[38;5;124m/FileStore/tables/salary_data_with_id[39m[38;5;124m"[39m)
[1;32m      3[0m df[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/sql/connect/readwriter.py:670[0m, in [0;36mDataFrameWriter.save[0;34m(self, path, format, mode, partitionBy, **options)[0m
[1;32m    668[0m     [38;5;28mself[39m[38;5;241m.[39mformat([38;5;28mformat[39m)
[1;32m    669[0m [38;5;28mself[39m[

In [0]:
salary_data_with_id.createOrReplaceTempView("SalaryTable")
spark.sql("SELECT count(*) from SalaryTable").show()


+--------+
|count(1)|
+--------+
|       8|
+--------+

