# Installing Libraries and Importing

In [18]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan,when,count,col,lit
from pyspark.sql.types import *
import pandas as pd
import numpy as np
# Start (or reuse) a single-node local Spark session for this notebook
spark = (
    SparkSession.builder
    .master("local")
    .appName("sql_learning")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)



# Creating a DataFrame from a pandas DataFrame

In [19]:

# Download the sample employees CSV into a pandas DataFrame (requires network access)
df_employee=pd.read_csv("https://gist.githubusercontent.com/kevin336/acbb2271e66c10a5b73aacf82ca82784/raw/e38afe62e088394d61ed30884dd50a6826eee0a8/employees.csv")
def create_hive_table(dataframe,Tablename='default_table'):
  """Register a pandas DataFrame as a Spark temporary view.

  A Spark schema is derived from the pandas column dtypes, the frame is
  converted with the notebook-level ``spark`` session, and the result is
  registered under ``Tablename`` with ``createOrReplaceTempView`` so it can
  be read back via ``spark.table`` / ``spark.sql``. Despite the function
  name, only a temporary view is created here.

  Args:
    dataframe: pandas DataFrame to convert. Only ``object``, ``float64`` and
      ``int64`` columns are supported.
    Tablename: name of the temporary view; an existing view with the same
      name is replaced.

  Raises:
    TypeError: if a column has a dtype that has no Spark type mapping.
  """
  # pandas dtype name -> Spark type class; every field is declared nullable.
  # NOTE(review): float64/int64 are mapped to FloatType/IntegerType (32-bit)
  # exactly as before; DoubleType/LongType would be lossless -- confirm
  # before using this helper on wide-ranging numeric data.
  spark_types={'object':StringType,'float64':FloatType,'int64':IntegerType}

  fields=[]
  for column_name,dtype in zip(dataframe.columns,dataframe.dtypes):
    spark_type=spark_types.get(str(dtype))
    if spark_type is None:
      # Fail early with the offending column instead of handing a bogus
      # placeholder type to StructField.
      raise TypeError(
          f"Column {column_name!r} has unsupported dtype {dtype}; "
          f"supported dtypes: {sorted(spark_types)}")
    fields.append(StructField(column_name,spark_type(),True))

  sparkdf=spark.createDataFrame(dataframe,schema=StructType(fields))
  sparkdf.createOrReplaceTempView(str(Tablename))

# Register the pandas employee data as the Spark temporary view 'employee'
create_hive_table(df_employee,Tablename='employee')

# Creating a DataFrame using Spark's CSV reader

In [4]:
# Save the pandas frame to a local CSV. The index is written as an unnamed
# first column, which shows up as `_c0` when Spark reads the file back.
df_employee.to_csv('employee.csv')

In [8]:
# Read the CSV saved as 'employee.csv' back with Spark's CSV reader.
# A relative path is used so the file is looked up in the kernel's working
# directory (where pandas wrote it) instead of assuming Colab's /content.
# header=True takes column names from the first line; inferSchema=True lets
# Spark guess the column types. Note: df_employee now refers to a Spark
# DataFrame rather than the pandas one.
df_employee=spark.read.csv("employee.csv",header=True,inferSchema=True)
df_employee.show(5)

+---+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|_c0|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+---+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|  0|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|  1|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
|  2|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03| AD_ASST|  4400|            - |       101|           10|
|  3|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|  MK_MAN| 13000|            - |       100|           20|
|  4|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|  MK_REP|  6000|            - |       201

# Creating a DataFrame from a Spark RDD

In [9]:
# Build a small RDD of (name, age) tuples
data = [("Alice", 29), ("Bob", 34), ("Cathy", 28)]
rdd = spark.sparkContext.parallelize(data)

# Describe the two columns explicitly; both are nullable
schema = (
    StructType()
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
)

# Apply the schema to the RDD to obtain a DataFrame, then display it
df_rdd = spark.createDataFrame(rdd, schema)
df_rdd.show()

+-----+---+
| Name|Age|
+-----+---+
|Alice| 29|
|  Bob| 34|
|Cathy| 28|
+-----+---+



In [10]:
# Load the 'employee' temporary view as a Spark DataFrame
df_pyspark=spark.table('employee')

In [None]:
### Check the schema (column names, Spark types and nullability)
df_pyspark.printSchema()

root
 |-- EMPLOYEE_ID: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- EMAIL: string (nullable = true)
 |-- PHONE_NUMBER: string (nullable = true)
 |-- HIRE_DATE: string (nullable = true)
 |-- JOB_ID: string (nullable = true)
 |-- SALARY: integer (nullable = true)
 |-- COMMISSION_PCT: string (nullable = true)
 |-- MANAGER_ID: string (nullable = true)
 |-- DEPARTMENT_ID: integer (nullable = true)



In [None]:
# Python class of the object returned by spark.table
type(df_pyspark)

In [11]:
# Summary statistics (count, mean, stddev, min, ...) for every column;
# columns whose values are not numeric show NULL for mean and stddev
df_pyspark.describe().show()

+-------+------------------+----------+---------+------+------------+---------+----------+-----------------+--------------+------------------+-----------------+
|summary|       EMPLOYEE_ID|FIRST_NAME|LAST_NAME| EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|           SALARY|COMMISSION_PCT|        MANAGER_ID|    DEPARTMENT_ID|
+-------+------------------+----------+---------+------+------------+---------+----------+-----------------+--------------+------------------+-----------------+
|  count|                50|        50|       50|    50|          50|       50|        50|               50|            50|                50|               50|
|   mean|            134.76|      NULL|     NULL|  NULL|        NULL|     NULL|      NULL|          6182.32|          NULL|114.83673469387755|             57.6|
| stddev|33.631593504213456|      NULL|     NULL|  NULL|        NULL|     NULL|      NULL|4586.181771631927|          NULL|20.591611296406914|25.11686968666962|
|    min|               100|      

In [13]:
# First three rows, returned to Python as a list of Row objects
df_pyspark.head(3)

[Row(EMPLOYEE_ID=198, FIRST_NAME='Donald', LAST_NAME='OConnell', EMAIL='DOCONNEL', PHONE_NUMBER='650.507.9833', HIRE_DATE='21-JUN-07', JOB_ID='SH_CLERK', SALARY=2600, COMMISSION_PCT=' - ', MANAGER_ID='124', DEPARTMENT_ID=50),
 Row(EMPLOYEE_ID=199, FIRST_NAME='Douglas', LAST_NAME='Grant', EMAIL='DGRANT', PHONE_NUMBER='650.507.9844', HIRE_DATE='13-JAN-08', JOB_ID='SH_CLERK', SALARY=2600, COMMISSION_PCT=' - ', MANAGER_ID='124', DEPARTMENT_ID=50),
 Row(EMPLOYEE_ID=200, FIRST_NAME='Jennifer', LAST_NAME='Whalen', EMAIL='JWHALEN', PHONE_NUMBER='515.123.4444', HIRE_DATE='17-SEP-03', JOB_ID='AD_ASST', SALARY=4400, COMMISSION_PCT=' - ', MANAGER_ID='101', DEPARTMENT_ID=10)]

In [14]:
# Print the first three rows as a formatted table
df_pyspark.show(3)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03| AD_ASST|  4400|            - |       101|           10|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
only showing top 3 rows



In [None]:
# Select a subset of columns by name and show the first five rows
df_pyspark.select(['FIRST_NAME','JOB_ID']).show(5)

+----------+--------+
|FIRST_NAME|  JOB_ID|
+----------+--------+
|    Donald|SH_CLERK|
|   Douglas|SH_CLERK|
|  Jennifer| AD_ASST|
|   Michael|  MK_MAN|
|       Pat|  MK_REP|
+----------+--------+
only showing top 5 rows



In [None]:
# Indexing by name yields a Column object (see the output), not the column's values
df_pyspark['FIRST_NAME']

Column<'FIRST_NAME'>

In [None]:
# List of (column name, Spark type name) pairs for every column
df_pyspark.dtypes

[('EMPLOYEE_ID', 'int'),
 ('FIRST_NAME', 'string'),
 ('LAST_NAME', 'string'),
 ('EMAIL', 'string'),
 ('PHONE_NUMBER', 'string'),
 ('HIRE_DATE', 'string'),
 ('JOB_ID', 'string'),
 ('SALARY', 'int'),
 ('COMMISSION_PCT', 'string'),
 ('MANAGER_ID', 'string'),
 ('DEPARTMENT_ID', 'int')]

In [None]:
### Adding a column to the DataFrame
# Modified_Salary is twice the salary; the extended frame replaces df_pyspark
df_pyspark = df_pyspark.withColumn(
    'Modified_Salary',
    df_pyspark['Salary'] * 2,
)
df_pyspark.select('Salary', 'Modified_Salary').show(5)

+------+---------------+
|Salary|Modified_Salary|
+------+---------------+
|  2600|           5200|
|  2600|           5200|
|  4400|           8800|
| 13000|          26000|
|  6000|          12000|
+------+---------------+
only showing top 5 rows



In [None]:
### Drop the columns
# Remove the Modified_Salary helper column; df_pyspark is reassigned to the result
df_pyspark=df_pyspark.drop('Modified_Salary')
df_pyspark.show(5)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03| AD_ASST|  4400|            - |       101|           10|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|  MK_MAN| 13000|            - |       100|           20|
|        202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|  MK_REP|  6000|            - |       201|           20|
+-----------+---

In [None]:
### Rename the columns
# The renamed frame is only displayed; df_pyspark itself is not reassigned
df_pyspark.withColumnRenamed('EMPLOYEE_ID','E_ID').show()

+----+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|E_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+----+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
| 198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|
| 199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|
| 200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
| 201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
| 202|       Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|
| 203|     Susan|   Mavris| SMAVRIS|515.123.7777|07-JUN-

In [None]:
from pyspark.sql import functions as f
# Add a new column with NaN values
# (the literal float('nan') is placed in every row of new_column)
df_with_nan = df_pyspark.withColumn("new_column", f.lit(float('nan')))
df_with_nan.show(1)

+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+----------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+----------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|       NaN|
+-----------+----------+---------+--------+------------+---------+--------+------+--------------+----------+-------------+----------+
only showing top 1 row



In [None]:
# Drop rows that contain a null/NaN value. Every row has NaN in new_column,
# so no rows remain (see the empty output).
df_with_nan.na.drop().show(5)

+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+----------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|EMAIL|PHONE_NUMBER|HIRE_DATE|JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|
+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+----------+
+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+----------+



In [None]:
### how="any": drop a row as soon as any of its columns is null/NaN
df_with_nan.na.drop(how="any").show()

+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+----------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|EMAIL|PHONE_NUMBER|HIRE_DATE|JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|
+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+----------+
+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+----------+



In [None]:
## subset: only the listed columns are checked for missing values
df_with_nan.na.drop(how="any",subset=['FIRST_NAME']).show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+----------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+----------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|       NaN|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|       NaN|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|       NaN|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|       NaN|
|        202|       Pat|      Fay|    PFAY|603.1

In [None]:
### Filling the Missing Value
# Replace missing values in new_column with 0 (the result is displayed, not stored)
df_with_nan.na.fill(0,['new_column']).show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+----------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+----------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|       0.0|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|       0.0|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|       0.0|
|        201|   Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|       0.0|
|        202|       Pat|      Fay|    PFAY|603.1

In [None]:
from pyspark.ml.feature import Imputer

# Replace the NaN placeholders with 0 and keep the filled frame for later cells
df_with_nan = df_with_nan.na.fill(0, ['new_column'])

# Median imputer that writes each result to a "<column>_imputed" companion column
imputer = Imputer(
    inputCols=['new_column', 'SALARY'],
    outputCols=[f"{c}_imputed" for c in ['new_column', 'SALARY']],
    strategy="median",
)

# Fit on the frame, then display it together with the imputed columns
imputer.fit(df_with_nan).transform(df_with_nan).show()

+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+----------+------------------+--------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|new_column_imputed|SALARY_imputed|
+-----------+----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+----------+------------------+--------------+
|        198|    Donald| OConnell|DOCONNEL|650.507.9833|21-JUN-07|  SH_CLERK|  2600|            - |       124|           50|       0.0|               0.0|          2600|
|        199|   Douglas|    Grant|  DGRANT|650.507.9844|13-JAN-08|  SH_CLERK|  2600|            - |       124|           50|       0.0|               0.0|          2600|
|        200|  Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|       0.0|               

# Filter Operations

In [None]:
### Salary of the people less than or equal to 3000
# The condition is passed as a SQL expression string
df_with_nan.filter("salary<=3000").show()

+-----------+----------+-----------+--------+------------+---------+--------+------+--------------+----------+-------------+----------+
|EMPLOYEE_ID|FIRST_NAME|  LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|new_column|
+-----------+----------+-----------+--------+------------+---------+--------+------+--------------+----------+-------------+----------+
|        198|    Donald|   OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|       0.0|
|        199|   Douglas|      Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|       0.0|
|        116|    Shelli|      Baida|  SBAIDA|515.127.4563|24-DEC-05|PU_CLERK|  2900|            - |       114|           30|       0.0|
|        117|     Sigal|     Tobias| STOBIAS|515.127.4564|24-JUL-05|PU_CLERK|  2800|            - |       114|           30|       0.0|
|        118|       Guy|     Himuro| GHIMURO|515

In [None]:
# Combine Column conditions with | (OR): salary at most 3000 or at least 20000.
# Each comparison is wrapped in its own parentheses because | binds tighter
# than <= and >= in Python.
df_pyspark.filter((df_pyspark['Salary']<=3000) |
                  (df_pyspark['Salary']>=20000)).show()

+-----------+----------+-----------+--------+------------+---------+--------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|  LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|  JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+-----------+--------+------------+---------+--------+------+--------------+----------+-------------+
|        198|    Donald|   OConnell|DOCONNEL|650.507.9833|21-JUN-07|SH_CLERK|  2600|            - |       124|           50|
|        199|   Douglas|      Grant|  DGRANT|650.507.9844|13-JAN-08|SH_CLERK|  2600|            - |       124|           50|
|        100|    Steven|       King|   SKING|515.123.4567|17-JUN-03| AD_PRES| 24000|            - |        - |           90|
|        116|    Shelli|      Baida|  SBAIDA|515.127.4563|24-DEC-05|PU_CLERK|  2900|            - |       114|           30|
|        117|     Sigal|     Tobias| STOBIAS|515.127.4564|24-JUL-05|PU_CLERK|  2800|            - |       114|           30|


In [None]:
# Not condition: ~ negates a Column condition, here keeping salaries above 3000
df_pyspark.filter(~(df_pyspark['Salary']<=3000)).show()

+-----------+-----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|EMPLOYEE_ID| FIRST_NAME|LAST_NAME|   EMAIL|PHONE_NUMBER|HIRE_DATE|    JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+-----------+---------+--------+------------+---------+----------+------+--------------+----------+-------------+
|        200|   Jennifer|   Whalen| JWHALEN|515.123.4444|17-SEP-03|   AD_ASST|  4400|            - |       101|           10|
|        201|    Michael|Hartstein|MHARTSTE|515.123.5555|17-FEB-04|    MK_MAN| 13000|            - |       100|           20|
|        202|        Pat|      Fay|    PFAY|603.123.6666|17-AUG-05|    MK_REP|  6000|            - |       201|           20|
|        203|      Susan|   Mavris| SMAVRIS|515.123.7777|07-JUN-02|    HR_REP|  6500|            - |       101|           40|
|        204|    Hermann|     Baer|   HBAER|515.123.8888|07-JUN-02|    PR_REP| 10000|            - |       101|       

# GroupBy

In [None]:
## Groupby
### Grouped by JOB_ID to sum every numeric column (the output includes sum(SALARY))
df_pyspark.groupBy('JOB_ID').sum().show()

+----------+----------------+-----------+------------------+
|    JOB_ID|sum(EMPLOYEE_ID)|sum(SALARY)|sum(DEPARTMENT_ID)|
+----------+----------------+-----------+------------------+
|FI_ACCOUNT|             555|      39600|               500|
|    MK_MAN|             201|      13000|                20|
|   IT_PROG|             525|      28800|               300|
|    FI_MGR|             108|      12008|               100|
|AC_ACCOUNT|             206|       8300|               110|
|    HR_REP|             203|       6500|                40|
|  PU_CLERK|             585|      13900|               150|
|    AC_MGR|             205|      12008|               110|
|    PR_REP|             204|      10000|                70|
|    ST_MAN|             610|      36400|               250|
|    MK_REP|             202|       6000|                20|
|    PU_MAN|             114|      11000|                30|
|  SH_CLERK|             397|       5200|               100|
|   AD_PRES|            

In [None]:
# Aggregate only the Salary column per JOB_ID, using a {column: function} dict
df_pyspark.groupBy('JOB_ID').agg({'Salary':'sum'}).show()

+----------+-----------+
|    JOB_ID|sum(Salary)|
+----------+-----------+
|FI_ACCOUNT|      39600|
|    MK_MAN|      13000|
|   IT_PROG|      28800|
|    FI_MGR|      12008|
|AC_ACCOUNT|       8300|
|    HR_REP|       6500|
|  PU_CLERK|      13900|
|    AC_MGR|      12008|
|    PR_REP|      10000|
|    ST_MAN|      36400|
|    MK_REP|       6000|
|    PU_MAN|      11000|
|  SH_CLERK|       5200|
|   AD_PRES|      24000|
|   AD_ASST|       4400|
|  ST_CLERK|      44000|
|     AD_VP|      34000|
+----------+-----------+



In [None]:
#Count missing values (NULL, plus NaN in float/double columns) for every column
from pyspark.sql.functions import col,isnan,when,count
df_Columns=df_pyspark.columns
# NaN only exists in float/double columns, so isnan() is applied to those
# alone; on other column types (e.g. dates) the check can fail analysis.
float_columns={name for name,dtype in df_pyspark.dtypes if dtype in ('float','double')}
df_pyspark.select([
    count(when((isnan(col(c)) | col(c).isNull()) if c in float_columns else col(c).isNull(), c)).alias(c)
    for c in df_Columns
]).show()

+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+
|EMPLOYEE_ID|FIRST_NAME|LAST_NAME|EMAIL|PHONE_NUMBER|HIRE_DATE|JOB_ID|SALARY|COMMISSION_PCT|MANAGER_ID|DEPARTMENT_ID|
+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+
|          0|         0|        0|    0|           0|        0|     0|     0|             0|         0|            0|
+-----------+----------+---------+-----+------------+---------+------+------+--------------+----------+-------------+



# DataFrame to RDD

In [16]:
# Sample DataFrame
data = [("Alice", 29), ("Bob", 34), ("Cathy", 28)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)
# Convert DataFrame to RDD (its elements are Row objects, see the printed output)
rdd = df.rdd
print(rdd.collect())
# Filter the RDD to include only people older than 30
filtered_rdd = rdd.filter(lambda row: row["Age"] > 30)

# Collect the results
result = filtered_rdd.collect()
print(result)
# Convert the filtered RDD back to a DataFrame, reusing the source schema.
# Without an explicit schema Spark infers it from the rows, which fails
# when the filter matches nothing.
filtered_df = filtered_rdd.toDF(df.schema)

# Show the DataFrame
filtered_df.show()

[Row(Name='Alice', Age=29), Row(Name='Bob', Age=34), Row(Name='Cathy', Age=28)]
[Row(Name='Bob', Age=34)]
+----+---+
|Name|Age|
+----+---+
| Bob| 34|
+----+---+



In [42]:
# Stop the Spark session; run this last, since cells using `spark` need it running
spark.stop()