In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 34 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 31.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25h  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845515 sha256=8cd1abc3a7a927ac1f22f2b43b4572096861c4b047cda4d90045faf10ec7e081
  Stored in directory: /root/.cache/pip/wheels/42/59/f5/79a5bf931714dcd201b26025347785f087370a10a3329a899c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully 

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark =SparkSession.builder.appName('SparkByExamples.com').getOrCreate()


In [5]:
data =[('James','','Smith','1991-04-01','M',3000),
('Michael','Rose','','2000-05-19','M',4000),
('Robert','','Williams','1978-09-05','M',4000),
('Maria','Anne','Jones','1967-12-01','F',4000)]

In [6]:
columns =["firstname","middlename","lastname","dob","gender","salary"] 

In [7]:
df=spark.createDataFrame(data,columns)

In [8]:
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
+---------+----------+--------+----------+------+------+



In [9]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



# 1.Change DataType using PySpark withColumn()

In [10]:
df.withColumn("salary",df["salary"].cast("Integer"))

DataFrame[firstname: string, middlename: string, lastname: string, dob: string, gender: string, salary: int]

In [11]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



# 2.	Update The Value of an Existing Column

In [12]:
df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
+---------+----------+--------+----------+------+------+



In [13]:
df1=df.withColumn("salary",df["salary"]*100)

In [14]:
df1.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|
|   Robert|          |Williams|1978-09-05|     M|400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|
+---------+----------+--------+----------+------+------+



# 3.	Create a Column from an Existing

In [15]:
df2=df1.withColumn("copiedcolumn",df1["salary"]*1)

In [16]:
df2.show()

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|copiedcolumn|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   Smith|1991-04-01|     M|300000|      300000|
|  Michael|      Rose|        |2000-05-19|     M|400000|      400000|
|   Robert|          |Williams|1978-09-05|     M|400000|      400000|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|      400000|
+---------+----------+--------+----------+------+------+------------+



# 4. Add a New Column using withColumn() and lit() function

In [17]:
from pyspark.sql.functions import lit

In [18]:
df3=df2.withColumn("country",lit("India"))

In [19]:
df3.show()

+---------+----------+--------+----------+------+------+------------+-------+
|firstname|middlename|lastname|       dob|gender|salary|copiedcolumn|country|
+---------+----------+--------+----------+------+------+------------+-------+
|    James|          |   Smith|1991-04-01|     M|300000|      300000|  India|
|  Michael|      Rose|        |2000-05-19|     M|400000|      400000|  India|
|   Robert|          |Williams|1978-09-05|     M|400000|      400000|  India|
|    Maria|      Anne|   Jones|1967-12-01|     F|400000|      400000|  India|
+---------+----------+--------+----------+------+------+------------+-------+



# 5. Rename Column Name

In [20]:
df3=df3.withColumnRenamed("dob","data of birth")

In [21]:
df3.show(truncate=True)

+---------+----------+--------+-------------+------+------+------------+-------+
|firstname|middlename|lastname|data of birth|gender|salary|copiedcolumn|country|
+---------+----------+--------+-------------+------+------+------------+-------+
|    James|          |   Smith|   1991-04-01|     M|300000|      300000|  India|
|  Michael|      Rose|        |   2000-05-19|     M|400000|      400000|  India|
|   Robert|          |Williams|   1978-09-05|     M|400000|      400000|  India|
|    Maria|      Anne|   Jones|   1967-12-01|     F|400000|      400000|  India|
+---------+----------+--------+-------------+------+------+------------+-------+



In [22]:
df3.show(truncate=False)

+---------+----------+--------+-------------+------+------+------------+-------+
|firstname|middlename|lastname|data of birth|gender|salary|copiedcolumn|country|
+---------+----------+--------+-------------+------+------+------------+-------+
|James    |          |Smith   |1991-04-01   |M     |300000|300000      |India  |
|Michael  |Rose      |        |2000-05-19   |M     |400000|400000      |India  |
|Robert   |          |Williams|1978-09-05   |M     |400000|400000      |India  |
|Maria    |Anne      |Jones   |1967-12-01   |F     |400000|400000      |India  |
+---------+----------+--------+-------------+------+------+------------+-------+



# 6. Drop Column from PySpark Data Frame

In [23]:
df4=df3.drop("copiedcolumn")

In [24]:
df4.show()

+---------+----------+--------+-------------+------+------+-------+
|firstname|middlename|lastname|data of birth|gender|salary|country|
+---------+----------+--------+-------------+------+------+-------+
|    James|          |   Smith|   1991-04-01|     M|300000|  India|
|  Michael|      Rose|        |   2000-05-19|     M|400000|  India|
|   Robert|          |Williams|   1978-09-05|     M|400000|  India|
|    Maria|      Anne|   Jones|   1967-12-01|     F|400000|  India|
+---------+----------+--------+-------------+------+------+-------+

