In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session named 'SparkAppExample', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('SparkAppExample')
    .getOrCreate()
)

In [3]:
# Leave the session object as the cell's last expression so the notebook displays it.
spark

In [4]:
# Read the CSV with no reader options. As the output of the next cell shows,
# the columns get generated names (_c0 ... _c4) and the file's header line is
# treated as an ordinary data row.
df = spark.read.csv("data/data.csv")

In [5]:
# Print the DataFrame as a text table to inspect what the default read produced.
df.show()

+---+--------+----------+------------+------+
|_c0|     _c1|       _c2|         _c3|   _c4|
+---+--------+----------+------------+------+
| id|    name|birth_date|  occupation|gender|
|  1|    John|  05/23/85|      Doctor|     M|
|  2|    Mary|  09/12/92|    Engineer|     F|
|  3|     Joe|  02/08/78|      Lawyer|     M|
|  4|    Anna|  07/31/89|  Programmer|     F|
|  5|    Paul|  11/17/96|     Student|     M|
|  6|  Renata|  04/02/81|       Nurse|     F|
|  7|    Luke|  08/19/93|   Architect|     M|
|  8|Fernanda|  12/27/75|Psychologist|     F|
|  9|     Gus|  03/14/90|Veterinarian|     M|
| 10|   Julie|  06/21/82|  Programmer|     F|
| 11| Raphael|  10/09/95|     Student|     M|
| 12|   Carol|  01/25/87|  Journalist|     F|
| 13| Matthew|  05/07/79|Entrepreneur|     M|
| 14|   Bruna|  08/01/94|    Designer|     F|
| 15|   Fabio|  12/18/86|  Pharmacist|     M|
| 16| Camille|  02/06/80|     Teacher|     F|
| 17|  Andrew|  06/22/91|    Engineer|     M|
| 18| Maryann|  11/13/83|Psychiatr

In [6]:
# Reload the file, this time using its first line as the column names and
# letting Spark infer each column's type from the data.
df = spark.read.csv("data/data.csv", header=True, inferSchema=True)
df.show()

+---+--------+----------+------------+------+
| id|    name|birth_date|  occupation|gender|
+---+--------+----------+------------+------+
|  1|    John|  05/23/85|      Doctor|     M|
|  2|    Mary|  09/12/92|    Engineer|     F|
|  3|     Joe|  02/08/78|      Lawyer|     M|
|  4|    Anna|  07/31/89|  Programmer|     F|
|  5|    Paul|  11/17/96|     Student|     M|
|  6|  Renata|  04/02/81|       Nurse|     F|
|  7|    Luke|  08/19/93|   Architect|     M|
|  8|Fernanda|  12/27/75|Psychologist|     F|
|  9|     Gus|  03/14/90|Veterinarian|     M|
| 10|   Julie|  06/21/82|  Programmer|     F|
| 11| Raphael|  10/09/95|     Student|     M|
| 12|   Carol|  01/25/87|  Journalist|     F|
| 13| Matthew|  05/07/79|Entrepreneur|     M|
| 14|   Bruna|  08/01/94|    Designer|     F|
| 15|   Fabio|  12/18/86|  Pharmacist|     M|
| 16| Camille|  02/06/80|     Teacher|     F|
| 17|  Andrew|  06/22/91|    Engineer|     M|
| 18| Maryann|  11/13/83|Psychiatrist|     F|
| 19|    Pete|  03/30/98|     Stud

In [7]:
# Print each column's name, type and nullability.
# Without inferSchema every CSV column is read as a string; because the read
# above already passes inferSchema=True, `id` is reported as integer below
# (birth_date is still a string).
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- birth_date: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- gender: string (nullable = true)



In [8]:
# Keep the list of column names in a variable, then display it.
columns = df.columns
columns

['id', 'name', 'birth_date', 'occupation', 'gender']

In [9]:
# Fetch the first 3 rows as a list of Row objects.
df.head(3)

[Row(id=1, name='John', birth_date='05/23/85', occupation='Doctor', gender='M'),
 Row(id=2, name='Mary', birth_date='09/12/92', occupation='Engineer', gender='F'),
 Row(id=3, name='Joe', birth_date='02/08/78', occupation='Lawyer', gender='M')]

In [10]:
# Show the DataFrame with all of its columns (nothing is selected here;
# use df.select('name') to pick out a single column).
df.show()

+---+--------+----------+------------+------+
| id|    name|birth_date|  occupation|gender|
+---+--------+----------+------------+------+
|  1|    John|  05/23/85|      Doctor|     M|
|  2|    Mary|  09/12/92|    Engineer|     F|
|  3|     Joe|  02/08/78|      Lawyer|     M|
|  4|    Anna|  07/31/89|  Programmer|     F|
|  5|    Paul|  11/17/96|     Student|     M|
|  6|  Renata|  04/02/81|       Nurse|     F|
|  7|    Luke|  08/19/93|   Architect|     M|
|  8|Fernanda|  12/27/75|Psychologist|     F|
|  9|     Gus|  03/14/90|Veterinarian|     M|
| 10|   Julie|  06/21/82|  Programmer|     F|
| 11| Raphael|  10/09/95|     Student|     M|
| 12|   Carol|  01/25/87|  Journalist|     F|
| 13| Matthew|  05/07/79|Entrepreneur|     M|
| 14|   Bruna|  08/01/94|    Designer|     F|
| 15|   Fabio|  12/18/86|  Pharmacist|     M|
| 16| Camille|  02/06/80|     Teacher|     F|
| 17|  Andrew|  06/22/91|    Engineer|     M|
| 18| Maryann|  11/13/83|Psychiatrist|     F|
| 19|    Pete|  03/30/98|     Stud

In [11]:
# Project only the name and occupation columns into a new DataFrame and print it.
multi_columns = df.select('name', 'occupation')
multi_columns.show()

+--------+------------+
|    name|  occupation|
+--------+------------+
|    John|      Doctor|
|    Mary|    Engineer|
|     Joe|      Lawyer|
|    Anna|  Programmer|
|    Paul|     Student|
|  Renata|       Nurse|
|    Luke|   Architect|
|Fernanda|Psychologist|
|     Gus|Veterinarian|
|   Julie|  Programmer|
| Raphael|     Student|
|   Carol|  Journalist|
| Matthew|Entrepreneur|
|   Bruna|    Designer|
|   Fabio|  Pharmacist|
| Camille|     Teacher|
|  Andrew|    Engineer|
| Maryann|Psychiatrist|
|    Pete|     Student|
|   Luana|      Doctor|
+--------+------------+



In [12]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# Note: for the string columns min/max are compared as text, so the birth_date
# min/max printed below are not the earliest/latest dates.
df.describe().show()

+-------+-----------------+------+----------+------------+------+
|summary|               id|  name|birth_date|  occupation|gender|
+-------+-----------------+------+----------+------------+------+
|  count|               20|    20|        20|          20|    20|
|   mean|             10.5|  NULL|      NULL|        NULL|  NULL|
| stddev|5.916079783099616|  NULL|      NULL|        NULL|  NULL|
|    min|                1|Andrew|  01/25/87|   Architect|     F|
|    max|               20|Renata|  12/27/75|Veterinarian|     M|
+-------+-----------------+------+----------+------------+------+



In [13]:
# Add an integer "Experience" column filled with pseudo-random values in [0, 100):
# rand() yields a double in [0.0, 1.0), which is scaled by 100 and truncated to int.
# A fixed seed is passed so the generated values are repeatable across kernel
# restarts (for the same input partitioning) instead of changing on every run.
from pyspark.sql.functions import rand
df = df.withColumn("Experience", (rand(seed=42) * 100).cast("int"))
df.show()

+---+--------+----------+------------+------+----------+
| id|    name|birth_date|  occupation|gender|Experience|
+---+--------+----------+------------+------+----------+
|  1|    John|  05/23/85|      Doctor|     M|        86|
|  2|    Mary|  09/12/92|    Engineer|     F|        72|
|  3|     Joe|  02/08/78|      Lawyer|     M|        53|
|  4|    Anna|  07/31/89|  Programmer|     F|        61|
|  5|    Paul|  11/17/96|     Student|     M|        17|
|  6|  Renata|  04/02/81|       Nurse|     F|        63|
|  7|    Luke|  08/19/93|   Architect|     M|        42|
|  8|Fernanda|  12/27/75|Psychologist|     F|        58|
|  9|     Gus|  03/14/90|Veterinarian|     M|        50|
| 10|   Julie|  06/21/82|  Programmer|     F|        64|
| 11| Raphael|  10/09/95|     Student|     M|         2|
| 12|   Carol|  01/25/87|  Journalist|     F|        59|
| 13| Matthew|  05/07/79|Entrepreneur|     M|        63|
| 14|   Bruna|  08/01/94|    Designer|     F|        51|
| 15|   Fabio|  12/18/86|  Phar

In [14]:
# NOTE(review): df has no column called "new Column" (see the header printed
# below), and DataFrame.drop ignores names that are not in the schema, so this
# cell leaves df unchanged. Presumably a leftover from an earlier draft —
# confirm which column, if any, was meant to be dropped.
df = df.drop("new Column")
df.show()

+---+--------+----------+------------+------+----------+
| id|    name|birth_date|  occupation|gender|Experience|
+---+--------+----------+------------+------+----------+
|  1|    John|  05/23/85|      Doctor|     M|        86|
|  2|    Mary|  09/12/92|    Engineer|     F|        72|
|  3|     Joe|  02/08/78|      Lawyer|     M|        53|
|  4|    Anna|  07/31/89|  Programmer|     F|        61|
|  5|    Paul|  11/17/96|     Student|     M|        17|
|  6|  Renata|  04/02/81|       Nurse|     F|        63|
|  7|    Luke|  08/19/93|   Architect|     M|        42|
|  8|Fernanda|  12/27/75|Psychologist|     F|        58|
|  9|     Gus|  03/14/90|Veterinarian|     M|        50|
| 10|   Julie|  06/21/82|  Programmer|     F|        64|
| 11| Raphael|  10/09/95|     Student|     M|         2|
| 12|   Carol|  01/25/87|  Journalist|     F|        59|
| 13| Matthew|  05/07/79|Entrepreneur|     M|        63|
| 14|   Bruna|  08/01/94|    Designer|     F|        51|
| 15|   Fabio|  12/18/86|  Phar

In [15]:
# Add 2 to every value of the Experience column, replacing the column under the same name.
# Note: df is rebuilt from itself, so each re-run of this cell adds another 2.
df = df.withColumn("Experience", df.Experience + 2)
df.show()

+---+--------+----------+------------+------+----------+
| id|    name|birth_date|  occupation|gender|Experience|
+---+--------+----------+------------+------+----------+
|  1|    John|  05/23/85|      Doctor|     M|        88|
|  2|    Mary|  09/12/92|    Engineer|     F|        74|
|  3|     Joe|  02/08/78|      Lawyer|     M|        55|
|  4|    Anna|  07/31/89|  Programmer|     F|        63|
|  5|    Paul|  11/17/96|     Student|     M|        19|
|  6|  Renata|  04/02/81|       Nurse|     F|        65|
|  7|    Luke|  08/19/93|   Architect|     M|        44|
|  8|Fernanda|  12/27/75|Psychologist|     F|        60|
|  9|     Gus|  03/14/90|Veterinarian|     M|        52|
| 10|   Julie|  06/21/82|  Programmer|     F|        66|
| 11| Raphael|  10/09/95|     Student|     M|         4|
| 12|   Carol|  01/25/87|  Journalist|     F|        61|
| 13| Matthew|  05/07/79|Entrepreneur|     M|        65|
| 14|   Bruna|  08/01/94|    Designer|     F|        53|
| 15|   Fabio|  12/18/86|  Phar

In [16]:
# To rename a single column, df.withColumnRenamed(old_name, new_name) is enough.
# To rename several at once, map each current column name to its replacement
# (names missing from the mapping are kept as they are) and rebuild the
# DataFrame with the resulting list of names.
columns_to_rename = {
    "name": "FirstName",
    "birth_date": "Birthday",
    "occupation": "Occupation",
    "gender": "Gender",
}

df = df.toDF(*(columns_to_rename.get(current, current) for current in df.columns))

df.show()

+---+---------+--------+------------+------+----------+
| id|FirstName|Birthday|  Occupation|Gender|Experience|
+---+---------+--------+------------+------+----------+
|  1|     John|05/23/85|      Doctor|     M|        88|
|  2|     Mary|09/12/92|    Engineer|     F|        74|
|  3|      Joe|02/08/78|      Lawyer|     M|        55|
|  4|     Anna|07/31/89|  Programmer|     F|        63|
|  5|     Paul|11/17/96|     Student|     M|        19|
|  6|   Renata|04/02/81|       Nurse|     F|        65|
|  7|     Luke|08/19/93|   Architect|     M|        44|
|  8| Fernanda|12/27/75|Psychologist|     F|        60|
|  9|      Gus|03/14/90|Veterinarian|     M|        52|
| 10|    Julie|06/21/82|  Programmer|     F|        66|
| 11|  Raphael|10/09/95|     Student|     M|         4|
| 12|    Carol|01/25/87|  Journalist|     F|        61|
| 13|  Matthew|05/07/79|Entrepreneur|     M|        65|
| 14|    Bruna|08/01/94|    Designer|     F|        53|
| 15|    Fabio|12/18/86|  Pharmacist|     M|    