####  We can get the values of a PySpark DataFrame column in the following ways:

    1. Using df.rdd.collect() or df.collect(), then looping over the returned list of pyspark.sql.types.Row elements.
    2. Using df.rdd.map(lambda x: x[3]).collect(). This returns a list of the values in their original data type (e.g. str or int) rather than Row objects.
    3. Using df.select().collect(). This is the same as the first approach; the only difference is that we extract specific columns.
    4. Using pandas, via toPandas().


In [7]:
from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session running on a single worker thread.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName('SparkByExamples.com')
    .getOrCreate()
)

In [11]:
# Sample rows; each tuple is (firstname, lastname, country, state).
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Column names, listed in the same order as the tuple fields above.
schemas = ["firstname", "lastname", "country", "state"]

df = spark.createDataFrame(data=data, schema=schemas)
df.show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



In [66]:
# DataFrame.rdd exposes the DataFrame's rows as an RDD.
rdd1 = df.rdd
# Show the RDD's description and its Python type, one per line.
print(rdd1, type(rdd1), sep="\n")

MapPartitionsRDD[38] at javaToPython at NativeMethodAccessorImpl.java:0
<class 'pyspark.rdd.RDD'>


In [67]:
# Materialise the RDD as a Python list of Row objects (displayed as the cell output).
rdd1.collect()

[Row(firstname='James', lastname='Smith', country='USA', state='CA'),
 Row(firstname='Michael', lastname='Rose', country='USA', state='NY'),
 Row(firstname='Robert', lastname='Williams', country='USA', state='CA'),
 Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]

In [64]:
### toDF() converts the RDD back into a DataFrame; the Row field names
### are carried over as the column names (see the table printed below).

df1 = df.rdd.toDF()
df1.show()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



1. To get data out of an RDD, use take() or collect().
2. Calling collect() on an RDD (or on a DataFrame) returns a Python list.

In [62]:
## collect() returns a plain Python list of Row objects, whether it is called
## on the DataFrame itself or on its underlying RDD.
# Collect the DataFrame once and reuse the result: every collect() call
# launches a Spark job, so repeating it just to inspect the type is wasteful.
collected_rows = df.collect()
print(collected_rows)
print(df.rdd.collect())
print(type(collected_rows))

[Row(firstname='James', lastname='Smith', country='USA', state='CA'), Row(firstname='Michael', lastname='Rose', country='USA', state='NY'), Row(firstname='Robert', lastname='Williams', country='USA', state='CA'), Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]
[Row(firstname='James', lastname='Smith', country='USA', state='CA'), Row(firstname='Michael', lastname='Rose', country='USA', state='NY'), Row(firstname='Robert', lastname='Williams', country='USA', state='CA'), Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]
<class 'list'>


In [63]:
## Read the individual column values out of each collected Row.
for row in df.collect():
    print(row)
    print(type(row))
    # A Row behaves like a tuple, so it can be unpacked field by field.
    firstname, lastname, country, state = row
    print(f"firstname: {firstname} ,lastname: {lastname} , country: {country}, state: {state}")

Row(firstname='James', lastname='Smith', country='USA', state='CA')
<class 'pyspark.sql.types.Row'>
firstname: James ,lastname: Smith , country: USA, state: CA
Row(firstname='Michael', lastname='Rose', country='USA', state='NY')
<class 'pyspark.sql.types.Row'>
firstname: Michael ,lastname: Rose , country: USA, state: NY
Row(firstname='Robert', lastname='Williams', country='USA', state='CA')
<class 'pyspark.sql.types.Row'>
firstname: Robert ,lastname: Williams , country: USA, state: CA
Row(firstname='Maria', lastname='Jones', country='USA', state='FL')
<class 'pyspark.sql.types.Row'>
firstname: Maria ,lastname: Jones , country: USA, state: FL


In [36]:
# Collect once and index into the resulting list; calling collect() for each
# lookup would re-run the Spark job every time.
all_rows = df.collect()
print(all_rows[0])
print(all_rows[1])
print(all_rows[2])
print(all_rows[3])
# Rows are indexable too: row 0, field 1 is the first row's lastname.
print(all_rows[0][1])


Row(firstname='James', lastname='Smith', country='USA', state='CA')
Row(firstname='Michael', lastname='Rose', country='USA', state='NY')
Row(firstname='Robert', lastname='Williams', country='USA', state='CA')
Row(firstname='Maria', lastname='Jones', country='USA', state='FL')
Smith


In [37]:
## Another way to read the data: walk every field of every collected Row.
## collect() is evaluated exactly once here. Calling it inside the loops
## would launch a new Spark job for every row and every field.
for row in df.collect():
    # Iterating a Row yields its field values in column order.
    for value in row:
        print(value)
    

James
Smith
USA
CA
Michael
Rose
USA
NY
Robert
Williams
USA
CA
Maria
Jones
USA
FL


In [58]:
## Map each Row to a single field on the RDD, then collect(): the result is a
## list of plain values (str here) instead of a list of Row objects.

# Pick the field by position...
states_list = df.rdd.map(lambda row: row[3]).collect()
print(states_list)

# ...or by column name; both produce the same list.
states_list = df.rdd.map(lambda row: row["state"]).collect()
print(states_list)

for state in states_list:
    print(state)

['CA', 'NY', 'CA', 'FL']
['CA', 'NY', 'CA', 'FL']
CA
NY
CA
FL


In [56]:
# Optional (left disabled): de-duplicate states_list while keeping the
# first-seen order of the values.
# from collections import OrderedDict
# res = list(OrderedDict.fromkeys(states_list))
# print(res)

In [57]:

## Another way to read a single column: select() it, then collect().
## Each element of the result is a one-field Row, so index it to get the bare value.
states3 = df.select("state").collect()
print(states3)
for row in states3:
    print(type(row))
    print(row[0])

[Row(state='CA'), Row(state='NY'), Row(state='CA'), Row(state='FL')]
<class 'pyspark.sql.types.Row'>
CA
<class 'pyspark.sql.types.Row'>
NY
<class 'pyspark.sql.types.Row'>
CA
<class 'pyspark.sql.types.Row'>
FL


So, by using a map operation on the RDD, we can convert Row objects into plain Python values (e.g. str or int).

In [60]:
# Selecting several columns yields Rows with one field per selected column,
# in the order the columns were requested.
states4 = df.select("state", "firstname").collect()
print(states4)

[Row(state='CA', firstname='James'), Row(state='NY', firstname='Michael'), Row(state='CA', firstname='Robert'), Row(state='FL', firstname='Maria')]


In [61]:
# Convert the two selected columns to a pandas DataFrame, then read each
# column out as an ordinary Python list.
pandDF = df.select("state", "firstname").toPandas()
print(pandDF['state'].tolist())
print(pandDF['firstname'].tolist())

['CA', 'NY', 'CA', 'FL']
['James', 'Michael', 'Robert', 'Maria']
