Spark initialization

In [1]:
from pyspark.sql import SparkSession

In [None]:
# Reuse the already-running SparkSession if there is one; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

Creating a DataFrame from an RDD

In [3]:
# Five sample records; each row is [language code, page title, date, view count].
page_view_rows = [
    ["en", "Statue_of_Liberty", "2022-01-01", 263],
    ["en", "Replicas_of_the_Statue_of_Liberty", "2022-01-01", 11],
    ["en", "Statue_of_Lucille_Ball", "2022-01-01", 6],
    ["en", "Statue_of_Liberty_National_Monument", "2022-01-01", 4],
    ["en", "Statue_of_Liberty_play", "2022-01-01", 3],
]

# Turn the in-memory list into an RDD through the session's SparkContext.
sample_page_views = spark.sparkContext.parallelize(page_view_rows)


In [4]:
# Materialize the RDD's elements as a Python list so the cell displays them
# (fine here because the sample only has five rows).
sample_page_views.collect()

[['en', 'Statue_of_Liberty', '2022-01-01', 263],
 ['en', 'Replicas_of_the_Statue_of_Liberty', '2022-01-01', 11],
 ['en', 'Statue_of_Lucille_Ball', '2022-01-01', 6],
 ['en', 'Statue_of_Liberty_National_Monument', '2022-01-01', 4],
 ['en', 'Statue_of_Liberty_play', '2022-01-01', 3]]

In [5]:
# Column names are assigned positionally to the four fields of each RDD row.
page_view_columns = ['Language_code', 'title', 'date', 'count']
sample_page_views_df = sample_page_views.toDF(page_view_columns)

In [6]:
# Preview the first 2 rows; long string values are abbreviated with "..." in
# the printed table.
sample_page_views_df.show(2)

+-------------+--------------------+----------+-----+
|Language_code|               title|      date|count|
+-------------+--------------------+----------+-----+
|           en|   Statue_of_Liberty|2022-01-01|  263|
|           en|Replicas_of_the_S...|2022-01-01|   11|
+-------------+--------------------+----------+-----+
only showing top 2 rows



In [7]:
# Print up to 5 rows; truncate=False keeps every value at full width.
sample_page_views_df.show(5, truncate=False)

+-------------+-----------------------------------+----------+-----+
|Language_code|title                              |date      |count|
+-------------+-----------------------------------+----------+-----+
|en           |Statue_of_Liberty                  |2022-01-01|263  |
|en           |Replicas_of_the_Statue_of_Liberty  |2022-01-01|11   |
|en           |Statue_of_Lucille_Ball             |2022-01-01|6    |
|en           |Statue_of_Liberty_National_Monument|2022-01-01|4    |
|en           |Statue_of_Liberty_play             |2022-01-01|3    |
+-------------+-----------------------------------+----------+-----+



Access the RDD underlying sample_page_views_df

In [11]:
# The .rdd attribute exposes the DataFrame's contents as an RDD; its elements
# are Row objects carrying the column names, not the plain lists we started with.
sample_page_views_rdd_restored = sample_page_views_df.rdd
sample_page_views_rdd_restored.collect()

[Row(Language_code='en', title='Statue_of_Liberty', date='2022-01-01', count=263),
 Row(Language_code='en', title='Replicas_of_the_Statue_of_Liberty', date='2022-01-01', count=11),
 Row(Language_code='en', title='Statue_of_Lucille_Ball', date='2022-01-01', count=6),
 Row(Language_code='en', title='Statue_of_Liberty_National_Monument', date='2022-01-01', count=4),
 Row(Language_code='en', title='Statue_of_Liberty_play', date='2022-01-01', count=3)]

In [12]:
# Confirm the Python type of the DataFrame object.
type(sample_page_views_df)

pyspark.sql.dataframe.DataFrame

In [13]:
# Confirm that the object obtained through .rdd is an RDD, not a DataFrame.
type(sample_page_views_rdd_restored)

pyspark.rdd.RDD

In [27]:
# Print the column names and the types Spark inferred from the Python values.
sample_page_views_df.printSchema()

root
 |-- Language_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- date: string (nullable = true)
 |-- count: long (nullable = true)



Create a DataFrame from an external source

In [15]:
# Load the CSV with default reader options (no header handling, no schema
# inference).
people_df = spark.read.format('csv').load('/user/root/people.csv')

In [16]:
# Preview 2 rows; with no header option the columns get the default names
# _c0, _c1, ...
people_df.show(2)

+---+---------------+-------+-------+------+--------------------+--------------------+----------+---------------+
|_c0|            _c1|    _c2|    _c3|   _c4|                 _c5|                 _c6|       _c7|            _c8|
+---+---------------+-------+-------+------+--------------------+--------------------+----------+---------------+
|  1|88F7B33d2bcf9f5| Shelby|Terrell|  Male|elijah57@example.net|001-084-906-7849x...|1945-10-26|Games developer|
|  2|f90cD3E76f1A9b9|Phillip|Summers|Female|bethany14@example...|   214.112.6044x4913|1910-03-24| Phytotherapist|
+---+---------------+-------+-------+------+--------------------+--------------------+----------+---------------+
only showing top 2 rows



In [23]:
# Descriptive names for the nine CSV columns, applied positionally.
people_columns = [
    'Index', 'User Id', 'First Name', 'Last Name', 'Sex',
    'Email', 'Phone', 'Date of birth', 'Job Title',
]
people_df = people_df.toDF(*people_columns)
people_df.show(2, truncate=False)

+-----+---------------+----------+---------+------+---------------------+----------------------+-------------+---------------+
|Index|User Id        |First Name|Last Name|Sex   |Email                |Phone                 |Date of birth|Job Title      |
+-----+---------------+----------+---------+------+---------------------+----------------------+-------------+---------------+
|1    |88F7B33d2bcf9f5|Shelby    |Terrell  |Male  |elijah57@example.net |001-084-906-7849x73518|1945-10-26   |Games developer|
|2    |f90cD3E76f1A9b9|Phillip   |Summers  |Female|bethany14@example.com|214.112.6044x4913     |1910-03-24   |Phytotherapist |
+-----+---------------+----------+---------+------+---------------------+----------------------+-------------+---------------+
only showing top 2 rows



In [25]:
# Without schema inference every column, including Index and Date of birth,
# is read as a string.
people_df.printSchema()

root
 |-- Index: string (nullable = true)
 |-- User Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Date of birth: string (nullable = true)
 |-- Job Title: string (nullable = true)



In [31]:
# Re-read the file with inferSchema=True so Spark derives column types from the
# data instead of treating everything as a string.
# Pass header=True as well when the first row of the file holds column names.
people_df = spark.read.csv('/user/root/people.csv',inferSchema=True)
people_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: timestamp (nullable = true)
 |-- _c8: string (nullable = true)



In [34]:
# Same read as the keyword form, expressed through the reader's option() call;
# the printed schema is identical.
people_df = spark.read.option('inferSchema',True).csv('/user/root/people.csv')
people_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: timestamp (nullable = true)
 |-- _c8: string (nullable = true)



In [36]:
# dtypes gives the same information as printSchema(), as a list of
# (column name, type name) tuples.
people_df.dtypes

[('_c0', 'int'),
 ('_c1', 'string'),
 ('_c2', 'string'),
 ('_c3', 'string'),
 ('_c4', 'string'),
 ('_c5', 'string'),
 ('_c6', 'string'),
 ('_c7', 'timestamp'),
 ('_c8', 'string')]

Transformations

In [37]:
# describe() evaluates to another DataFrame, so the cell only displays that
# DataFrame's repr (column names and types); call .show() on the result to
# print the actual summary rows.
people_df.describe()

DataFrame[summary: string, _c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c8: string]

In [39]:
# Apply descriptive names to the typed (inferSchema) DataFrame, positionally.
people_columns = [
    'Index', 'User Id', 'First Name', 'Last Name', 'Sex',
    'Email', 'Phone', 'Date of birth', 'Job Title',
]
people_df = people_df.toDF(*people_columns)
people_df.show(2)

+-----+---------------+----------+---------+------+--------------------+--------------------+-------------------+---------------+
|Index|        User Id|First Name|Last Name|   Sex|               Email|               Phone|      Date of birth|      Job Title|
+-----+---------------+----------+---------+------+--------------------+--------------------+-------------------+---------------+
|    1|88F7B33d2bcf9f5|    Shelby|  Terrell|  Male|elijah57@example.net|001-084-906-7849x...|1945-10-26 00:00:00|Games developer|
|    2|f90cD3E76f1A9b9|   Phillip|  Summers|Female|bethany14@example...|   214.112.6044x4913|1910-03-24 00:00:00| Phytotherapist|
+-----+---------------+----------+---------+------+--------------------+--------------------+-------------------+---------------+
only showing top 2 rows



Create a DF from another DF

In [41]:
# Indexing a DataFrame with several column names produces a new DataFrame that
# contains only those columns.
people_name_df = people_df['First Name','Last Name']
# show() with no arguments prints the first 20 rows.
people_name_df.show()

+----------+---------+
|First Name|Last Name|
+----------+---------+
|    Shelby|  Terrell|
|   Phillip|  Summers|
|  Kristine|   Travis|
|   Yesenia| Martinez|
|      Lori|     Todd|
|      Erin|      Day|
| Katherine|     Buck|
|   Ricardo|   Hinton|
|      Dave|  Farrell|
|    Isaiah|    Downs|
|    Sheila|     Ross|
|     Stacy|   Newton|
|     Mandy|    Blake|
|   Bridget|     Nash|
|   Crystal|   Farmer|
|    Thomas|   Knight|
|   Maurice|   Rangel|
|     Frank|  Meadows|
|     Alvin|     Paul|
|     Jared| Mitchell|
+----------+---------+
only showing top 20 rows



In [43]:
# drop() returns a new DataFrame without the named column; people_df itself
# is left unchanged.
without_index = people_df.drop('Index')
without_index.show(2)

+---------------+----------+---------+------+--------------------+--------------------+-------------------+---------------+
|        User Id|First Name|Last Name|   Sex|               Email|               Phone|      Date of birth|      Job Title|
+---------------+----------+---------+------+--------------------+--------------------+-------------------+---------------+
|88F7B33d2bcf9f5|    Shelby|  Terrell|  Male|elijah57@example.net|001-084-906-7849x...|1945-10-26 00:00:00|Games developer|
|f90cD3E76f1A9b9|   Phillip|  Summers|Female|bethany14@example...|   214.112.6044x4913|1910-03-24 00:00:00| Phytotherapist|
+---------------+----------+---------+------+--------------------+--------------------+-------------------+---------------+
only showing top 2 rows



withColumnRenamed

In [44]:
# Rename the 'User Id' column to 'Id'; all other columns are kept as they are.
without_index = without_index.withColumnRenamed('User Id','Id')
without_index.show(5)


+---------------+----------+---------+------+--------------------+--------------------+-------------------+------------------+
|             Id|First Name|Last Name|   Sex|               Email|               Phone|      Date of birth|         Job Title|
+---------------+----------+---------+------+--------------------+--------------------+-------------------+------------------+
|88F7B33d2bcf9f5|    Shelby|  Terrell|  Male|elijah57@example.net|001-084-906-7849x...|1945-10-26 00:00:00|   Games developer|
|f90cD3E76f1A9b9|   Phillip|  Summers|Female|bethany14@example...|   214.112.6044x4913|1910-03-24 00:00:00|    Phytotherapist|
|DbeAb8CcdfeFC2c|  Kristine|   Travis|  Male|bthompson@example...|        277.609.7938|1992-07-02 00:00:00|         Homeopath|
|A31Bee3c201ef58|   Yesenia| Martinez|  Male|kaitlinkaiser@exa...|        584.094.6111|2017-08-03 00:00:00| Market researcher|
|1bA7A3dc874da3c|      Lori|     Todd|  Male|buchananmanuel@ex...|   689-207-3558x7233|1938-12-01 00:00:00|Vete

filter

In [49]:
# Build the row predicate first, then keep only the rows that satisfy it.
is_male = without_index['Sex'] == 'Male'
male = without_index.filter(is_male)
male.count()

47

In [50]:
# Build the row predicate first, then keep only the rows that satisfy it.
is_female = without_index['Sex'] == 'Female'
female = without_index.filter(is_female)
female.count()

53

select

In [51]:
# Project just the two name columns from the filtered rows and print the
# first 20.
female.select('First Name','Last Name').show()

+----------+---------+
|First Name|Last Name|
+----------+---------+
|   Phillip|  Summers|
| Katherine|     Buck|
|    Sheila|     Ross|
|   Bridget|     Nash|
|    Thomas|   Knight|
|     Jared| Mitchell|
|Jacqueline|   Norton|
|   Colleen| Hatfield|
|    Janice|   Rhodes|
|    Alfred|   Mcneil|
|  Brittney|     Vega|
|    Isaiah|  Camacho|
|    Bonnie|  Andrews|
|   Brandon|  Schmidt|
|   Jackson|   Sparks|
|      Gene|     Rich|
|   Cynthia|  Wiggins|
|     Tanya| Mckinney|
|   Matthew|    Stone|
|      Kirk|    Walsh|
+----------+---------+
only showing top 20 rows



groupBy

In [55]:
# Count the rows per distinct value of 'Sex'; the result has one row per group
# and a 'count' column.
without_index.groupBy('Sex').count().show()

+------+-----+
|   Sex|count|
+------+-----+
|Female|   53|
|  Male|   47|
+------+-----+



In [59]:
# Same per-group counts as before, now sorted from smallest to largest count.
sex_counts = without_index.groupBy('Sex').count()
sex_counts.orderBy('count', ascending=True).show()

+------+-----+
|   Sex|count|
+------+-----+
|  Male|   47|
|Female|   53|
+------+-----+



Before a DataFrame can be queried with SQL in Spark, it must be registered in the SparkSession’s catalog. 
The following code registers the DataFrame as a local temporary view in memory. 
As long as the current SparkSession is active, we can use SparkSession.sql() to query it.

createOrReplaceTempView

In [60]:
# Register people_df under the view name 'people' so SQL text can refer to it;
# an existing view with the same name is replaced.
people_df.createOrReplaceTempView('people')

In [62]:
# Run SQL against the 'people' temporary view; spark.sql() hands back a
# DataFrame, which is then displayed like any other.
query = 'select * from people'
people_sql_df = spark.sql(query)
people_sql_df.show(2, truncate=False)

+-----+---------------+----------+---------+------+---------------------+----------------------+-------------------+---------------+
|Index|User Id        |First Name|Last Name|Sex   |Email                |Phone                 |Date of birth      |Job Title      |
+-----+---------------+----------+---------+------+---------------------+----------------------+-------------------+---------------+
|1    |88F7B33d2bcf9f5|Shelby    |Terrell  |Male  |elijah57@example.net |001-084-906-7849x73518|1945-10-26 00:00:00|Games developer|
|2    |f90cD3E76f1A9b9|Phillip   |Summers  |Female|bethany14@example.com|214.112.6044x4913     |1910-03-24 00:00:00|Phytotherapist |
+-----+---------------+----------+---------+------+---------------------+----------------------+-------------------+---------------+
only showing top 2 rows



withColumn and inbuilt functions

In [78]:
from pyspark.sql import functions as f

# Convert the timestamp column to a plain date and store it back under the
# same column name.
dob_as_date = f.to_date(f.col('Date of birth'))
cleaned_df = people_df.withColumn('Date of birth', dob_as_date)

In [79]:
# Check that 'Date of birth' is now typed as date rather than timestamp.
cleaned_df.printSchema()

root
 |-- Index: integer (nullable = true)
 |-- User Id: string (nullable = true)
 |-- First Name: string (nullable = true)
 |-- Last Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Email: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Date of birth: date (nullable = true)
 |-- Job Title: string (nullable = true)



In [80]:
# Spot-check one row: the date no longer carries a 00:00:00 time component.
cleaned_df.show(1)

+-----+---------------+----------+---------+----+--------------------+--------------------+-------------+---------------+
|Index|        User Id|First Name|Last Name| Sex|               Email|               Phone|Date of birth|      Job Title|
+-----+---------------+----------+---------+----+--------------------+--------------------+-------------+---------------+
|    1|88F7B33d2bcf9f5|    Shelby|  Terrell|Male|elijah57@example.net|001-084-906-7849x...|   1945-10-26|Games developer|
+-----+---------------+----------+---------+----+--------------------+--------------------+-------------+---------------+
only showing top 1 row



In [82]:
# Join first and last name with a single space and append the result as a new
# 'Full Name' column.
full_name = f.concat_ws(' ', f.col('First Name'), f.col('Last Name'))
cleaned_df = cleaned_df.withColumn('Full Name', full_name)
cleaned_df.show(1)

+-----+---------------+----------+---------+----+--------------------+--------------------+-------------+---------------+--------------+
|Index|        User Id|First Name|Last Name| Sex|               Email|               Phone|Date of birth|      Job Title|     Full Name|
+-----+---------------+----------+---------+----+--------------------+--------------------+-------------+---------------+--------------+
|    1|88F7B33d2bcf9f5|    Shelby|  Terrell|Male|elijah57@example.net|001-084-906-7849x...|   1945-10-26|Games developer|Shelby Terrell|
+-----+---------------+----------+---------+----+--------------------+--------------------+-------------+---------------+--------------+
only showing top 1 row



In [84]:
# Keep only the columns needed from here on, in presentation order.
# The source column is spelled 'User Id'. Select it by its exact name and alias
# it to 'User ID' explicitly, instead of relying on Spark's default
# case-insensitive column resolution, which stops working when
# spark.sql.caseSensitive is enabled. The resulting column names are unchanged.
cleaned_df = cleaned_df.select(
    f.col('User Id').alias('User ID'),
    'Full Name', 'Sex', 'Email', 'Phone', 'Date of birth', 'Job Title',
)
cleaned_df.show(1)

+---------------+--------------+----+--------------------+--------------------+-------------+---------------+
|        User ID|     Full Name| Sex|               Email|               Phone|Date of birth|      Job Title|
+---------------+--------------+----+--------------------+--------------------+-------------+---------------+
|88F7B33d2bcf9f5|Shelby Terrell|Male|elijah57@example.net|001-084-906-7849x...|   1945-10-26|Games developer|
+---------------+--------------+----+--------------------+--------------------+-------------+---------------+
only showing top 1 row



In [85]:
# Row count after the column clean-up (only columns were changed, not rows).
cleaned_df.count()

100

Read and write

In [86]:
# Save as CSV under /output/, replacing whatever is already stored there.
cleaned_df.write.mode("overwrite").csv('/output/')

In [87]:
# Read the CSV output back with default reader options.
recover = spark.read.format('csv').load('/output/')

In [88]:
# Inspect what came back: the columns are named _c0, _c1, ... again.
recover.show(2)

+---------------+---------------+------+--------------------+--------------------+----------+---------------+
|            _c0|            _c1|   _c2|                 _c3|                 _c4|       _c5|            _c6|
+---------------+---------------+------+--------------------+--------------------+----------+---------------+
|88F7B33d2bcf9f5| Shelby Terrell|  Male|elijah57@example.net|001-084-906-7849x...|1945-10-26|Games developer|
|f90cD3E76f1A9b9|Phillip Summers|Female|bethany14@example...|   214.112.6044x4913|1910-03-24| Phytotherapist|
+---------------+---------------+------+--------------------+--------------------+----------+---------------+
only showing top 2 rows



It looks like this file didn’t retain information about column headers or datatypes. 
Unfortunately, a CSV has no way to store datatypes, and the header row was lost because we did not pass header=True when writing. 
Each time we read it, we’ll need to tell Spark exactly how it must be processed.


Parquet format

Luckily, there is a file format called “Parquet” that’s specially designed for big data and solves this problem among many others. 
Parquet offers efficient data compression, is faster to analyze than CSV, and preserves information about a dataset’s schema. 
Let’s try saving and re-reading this file to and from Parquet instead.

In [89]:
# Expected to fail: several column names still contain a space (e.g. "User ID"),
# which this Spark version rejects when writing Parquet. See the
# AnalysisException below.
cleaned_df.write.parquet('/output/',mode='overwrite')

AnalysisException: 'Attribute name "User ID" contains invalid character(s) among " ,;{}()\\n\\t=". Please use alias to rename it.;'

In [90]:
# Look at the current column names to see which ones contain spaces.
cleaned_df.show(1)

+---------------+--------------+----+--------------------+--------------------+-------------+---------------+
|        User ID|     Full Name| Sex|               Email|               Phone|Date of birth|      Job Title|
+---------------+--------------+----+--------------------+--------------------+-------------+---------------+
|88F7B33d2bcf9f5|Shelby Terrell|Male|elijah57@example.net|001-084-906-7849x...|   1945-10-26|Games developer|
+---------------+--------------+----+--------------------+--------------------+-------------+---------------+
only showing top 1 row



In [91]:
# Give the first two space-containing columns names without spaces.
cleaned_df = (
    cleaned_df
    .withColumnRenamed('User ID', 'ID')
    .withColumnRenamed('Full Name', 'Full_name')
)

In [92]:
# Give the remaining two space-containing columns names without spaces.
cleaned_df = (
    cleaned_df
    .withColumnRenamed('Date of birth', 'DOB')
    .withColumnRenamed('Job Title', 'Job_title')
)

In [93]:
# Retry the Parquet write now that the renamed columns contain no spaces;
# existing data under /output/ is replaced.
cleaned_df.write.mode('overwrite').parquet('/output/')