In [4]:
from pyspark.sql import SparkSession

In [5]:
# Build (or reuse) a Spark session that runs locally using all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .getOrCreate()
)

In [6]:
# Load the users CSV, treating its first line as the column names.
# No schema is supplied or inferred here, so Spark reads every column as a string.
data = spark.read.option('header', True).csv('data/users.csv')
data.show()

+-----+---+------+----------+
| name|age|gender|       dob|
+-----+---+------+----------+
|  jon| 45|  male|1997-09-18|
|sarah| 32|female|2001-12-04|
| jane| 65|female|1968-05-30|
|  jim| 70|  male|      null|
|  joe| 22|  male|2004-10-11|
|simon| 52|  male|2005-04-04|
+-----+---+------+----------+



In [8]:
# Insert or replace column: withColumn(col_name, col_data)
# NOTE: the DataFrame returned by withColumn is not assigned to anything here,
# so `data` itself is unchanged -- the table printed by show() has no
# `is_person` column. Assign the result (data = data.withColumn(...)) to keep it.

data.withColumn('is_person', data.name)
data.show()

+-----+---+------+----------+
| name|age|gender|       dob|
+-----+---+------+----------+
|  jon| 45|  male|1997-09-18|
|sarah| 32|female|2001-12-04|
| jane| 65|female|1968-05-30|
|  jim| 70|  male|      null|
|  joe| 22|  male|2004-10-11|
|simon| 52|  male|2005-04-04|
+-----+---+------+----------+



In [9]:
# Insert or replace column: withColumn(col_name, col_data)
# Rebind `data` to the returned DataFrame so the added column is kept.

data = data.withColumn(colName='is_person', col=data['name'])
data.show()

+-----+---+------+----------+---------+
| name|age|gender|       dob|is_person|
+-----+---+------+----------+---------+
|  jon| 45|  male|1997-09-18|      jon|
|sarah| 32|female|2001-12-04|    sarah|
| jane| 65|female|1968-05-30|     jane|
|  jim| 70|  male|      null|      jim|
|  joe| 22|  male|2004-10-11|      joe|
|simon| 52|  male|2005-04-04|    simon|
+-----+---+------+----------+---------+



In [10]:
# Rename a column: withColumnRenamed(old_col_name, new_col_name)
# Here the copied `is_person` column becomes `name_2`.

data = data.withColumnRenamed(existing='is_person', new='name_2')
data.show()

+-----+---+------+----------+------+
| name|age|gender|       dob|name_2|
+-----+---+------+----------+------+
|  jon| 45|  male|1997-09-18|   jon|
|sarah| 32|female|2001-12-04| sarah|
| jane| 65|female|1968-05-30|  jane|
|  jim| 70|  male|      null|   jim|
|  joe| 22|  male|2004-10-11|   joe|
|simon| 52|  male|2005-04-04| simon|
+-----+---+------+----------+------+



In [11]:
# Delete a column: drop(col_name)
# Removes the temporary `name_2` column again.

data = (
    data
    .drop('name_2')
)
data.show()

+-----+---+------+----------+
| name|age|gender|       dob|
+-----+---+------+----------+
|  jon| 45|  male|1997-09-18|
|sarah| 32|female|2001-12-04|
| jane| 65|female|1968-05-30|
|  jim| 70|  male|      null|
|  joe| 22|  male|2004-10-11|
|simon| 52|  male|2005-04-04|
+-----+---+------+----------+



In [12]:
# Remove rows with missing values
# dropna() is the DataFrame-level alias of na.drop(); with default arguments it
# discards every row that contains at least one null.

data = data.dropna()
data.show()

+-----+---+------+----------+
| name|age|gender|       dob|
+-----+---+------+----------+
|  jon| 45|  male|1997-09-18|
|sarah| 32|female|2001-12-04|
| jane| 65|female|1968-05-30|
|  joe| 22|  male|2004-10-11|
|simon| 52|  male|2005-04-04|
+-----+---+------+----------+



In [13]:
# Replace values with new values
# With no column subset given, the old value is replaced wherever it occurs.

data = data.na.replace('2004-10-11', '1990-01-01')
data.show()

+-----+---+------+----------+
| name|age|gender|       dob|
+-----+---+------+----------+
|  jon| 45|  male|1997-09-18|
|sarah| 32|female|2001-12-04|
| jane| 65|female|1968-05-30|
|  joe| 22|  male|1990-01-01|
|simon| 52|  male|2005-04-04|
+-----+---+------+----------+



In [None]:
# Select single column
# The opening parenthesis must sit on the same line as `data.select`; on its own
# line Python ends the statement there and then evaluates `('name').show()`,
# which raises AttributeError.

data.select(
    'name'
).show()

In [None]:
# Select multiple columns
# Keep the opening parenthesis on the `data.select` line so the list is passed
# as the call argument instead of being evaluated as a separate expression.

data.select(
    ['name', 'age']
).show()

In [None]:
# Filter with single condition
# `dob` holds ISO-formatted (YYYY-MM-DD) strings, so comparing against a string
# literal orders the rows chronologically.

from pyspark.sql.functions import col, lit

# The parenthesis must open on the `data.filter` line; otherwise the condition
# is evaluated on its own and never passed to filter().
data.filter(
    col('dob') >= lit('2000-01-01')
).show()

In [None]:
# Filter with AND conditions
# Each condition is wrapped in parentheses because `&` binds tighter than the
# comparison operators. The call parenthesis opens on the `data.filter` line so
# the combined condition is actually passed to filter().

data.filter(
    (col('dob') >= lit('2000-01-01')) &
    (col('age') <= 50)
).show()

In [None]:
# Filter with OR conditions
# Each condition is wrapped in parentheses because `|` binds tighter than the
# comparison operators. The call parenthesis opens on the `data.filter` line so
# the combined condition is actually passed to filter().

data.filter(
    (col('dob') >= lit('2000-01-01')) |
    (col('age') <= 50)
).show()

In [None]:
# Filter for between two values
# between(lower, upper) includes both bounds. The call parenthesis opens on the
# `data.filter` line so the condition is passed to filter() rather than being
# evaluated as a stand-alone expression.

data.filter(
    data.age.between(20, 35)
).show()

In [None]:
# Apply values conditionally (CASE WHEN)

from pyspark.sql.functions import when

# The call parenthesis opens on the `data.select` line; on a separate line the
# arguments would form a plain tuple, and `.show()` on it raises AttributeError.
# The derived column is aliased so the output header is a readable name rather
# than the generated CASE WHEN expression text.
data.select(
    'name',
    'gender',
    when(data.age <= 40, 'young').otherwise('old').alias('age_group')
).show()

In [None]:
# Matching strings with a regular expression (rlike)
# Builds a boolean column that flags names matching the pattern, then shows the
# distinct (name, flag) pairs. No rows are filtered out.

starts_with_j = data['name'].rlike('^[j]').alias('Names starting with J')
data.select('name', starts_with_j).distinct().show()

In [None]:
# Group by
# The CSV was loaded without schema inference, so `age` is a string column.
# mean() only averages numeric columns, so cast `age` to an integer first;
# otherwise the result would contain no average at all.

data.select(
    'gender',
    data.age.cast('int').alias('age')
).groupBy('gender').mean().show()

In [None]:
# Save data to csv
# Spark writes a directory named 'new_data.csv' containing part files.
# mode='overwrite' lets this cell be re-run without failing because the output
# path already exists; header=True keeps the column names in the written files.

data.write.csv('new_data.csv', mode='overwrite', header=True)