In [3]:
# Bootstrap: findspark.init() has to run before the pyspark import below.
# It is expected to locate the local Spark installation and make the
# `pyspark` package importable from this kernel (see findspark docs).
import findspark
findspark.init()

from pyspark.sql import SparkSession

In [4]:
from pyspark.sql import Row

# Create the session for this notebook, or reuse one that is already running
# in the kernel. "local[*]" is the master URL passed to the builder and
# "sample" is the application name.
spark = (
    SparkSession.builder
    .appName("sample")
    .master("local[*]")
    .getOrCreate()
)

# Handle to the underlying SparkContext; the RDD-based cells use it.
sc = spark.sparkContext

In [17]:
# Load a text file and convert each line to a Row.
import os
from pathlib import Path

# Locate the sample file relative to SPARK_HOME (findspark.init() is expected
# to export it) so this cell is not tied to one machine's install directory.
# The original absolute path stays as the fallback, so nothing changes when
# SPARK_HOME is unset or that installation does not ship the examples folder.
PEOPLE_TXT_FALLBACK = "C:/spark/spark-2.4.7-bin-hadoop2.7/examples/src/main/resources/people.txt"
people_txt = PEOPLE_TXT_FALLBACK
spark_home = os.environ.get("SPARK_HOME")
if spark_home:
    candidate = Path(spark_home, "examples", "src", "main", "resources", "people.txt")
    if candidate.is_file():
        # as_posix() yields forward slashes, the same form the literal above uses.
        people_txt = candidate.as_posix()

# lines: RDD of raw text lines, e.g. "Michael, 29"
lines = sc.textFile(people_txt)
# parts: each line split on "," -> [name, age-as-text]; the age text still
# carries the space that follows the comma.
parts = lines.map(lambda l: l.split(","))
# people: one Row per line; int() tolerates the surrounding whitespace.
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))

# Infer the schema, and register the DataFrame as a table.
schemaPeople = spark.createDataFrame(people)
schemaPeople.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
teenagers = spark.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are DataFrame objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
teenNames = teenagers.rdd.map(lambda p: "Name: " + p.name).collect()
for name in teenNames:
    print(name)
# Name: Justin

Name: Justin


In [11]:
# Inspect the raw text lines read from people.txt (one string per line).
lines.collect()

['Michael, 29', 'Andy, 30', 'Justin, 19']

In [12]:
# Inspect the lines after splitting on ","; the age field is still a string
# and keeps the space that followed the comma.
parts.collect()


[['Michael', ' 29'], ['Andy', ' 30'], ['Justin', ' 19']]

In [18]:
# Inspect the Row objects built from the split fields; age has been converted
# to int. The recorded output lists `age` before `name` even though the Row
# was constructed name-first.
people.collect()


[Row(age=29, name='Michael'),
 Row(age=30, name='Andy'),
 Row(age=19, name='Justin')]

In [25]:
# Show the concrete Python type of every intermediate object in the pipeline,
# in the order they were created (space-separated on one line).
print(*(type(obj) for obj in (lines, parts, people, schemaPeople, teenagers, teenNames)))

<class 'pyspark.rdd.RDD'> <class 'pyspark.rdd.PipelinedRDD'> <class 'pyspark.rdd.PipelinedRDD'> <class 'pyspark.sql.dataframe.DataFrame'> <class 'pyspark.sql.dataframe.DataFrame'> <class 'list'>


In [21]:
# List the tables visible to this session; the "people" temp view registered
# with createOrReplaceTempView shows up here with isTemporary = true.
spark.sql("SHOW TABLES").show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |   people|       true|
+--------+---------+-----------+



In [23]:
# Display the result of the SQL query: names of people with 13 <= age <= 19.
teenagers.show()

+------+
|  name|
+------+
|Justin|
+------+

