In [1]:
import pyspark

In [2]:
# Start a Spark session named "Practice", or reuse one that is already running.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Practice")
    .getOrCreate()
)
# Bare last expression so the notebook displays the session object.
spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/06/20 20:38:05 WARN Utils: Your hostname, xiaochen-MS-7C56, resolves to a loopback address: 127.0.1.1; using 192.168.0.160 instead (on interface enp42s0)
25/06/20 20:38:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/20 20:38:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/20 20:38:05 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/06/20 20:38:05 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Load test_file.csv into a DataFrame, treating the first row as the header.
# No schema is supplied and inference is not enabled, so the columns are
# read as strings.
csv_reader = spark.read.options(header=True, enforceSchema=True)
spark_df = csv_reader.csv('test_file.csv')
spark_df.show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|NULL|      NULL|NULL|      NULL|  NULL|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



In [4]:
# Column names of spark_df, returned as a Python list of strings.
spark_df.columns

['Name', 'Department', 'Age', 'experience', 'salary']

In [5]:
# Print each column's name, data type and nullability as a tree.
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- experience: string (nullable = true)
 |-- salary: string (nullable = true)



In [6]:
# Cast the numeric columns from string to integer.
from pyspark.sql.types import IntegerType

cols = ["Age", "experience", "salary"]

# A single withColumns() call applies every cast in one projection. Calling
# withColumn() once per column in a loop adds one projection per call and
# makes the query plan grow with the number of columns.
# The comprehension variable is not named `col`: that name would stay in the
# notebook namespace when used as a for-loop variable and would shadow
# pyspark.sql.functions.col if it were imported.
spark_df = spark_df.withColumns(
    {name: spark_df[name].cast(IntegerType()) for name in cols}
)
spark_df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
# Display the rows of spark_df as a text table.
spark_df.show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|NULL|      NULL|NULL|      NULL|  NULL|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



In [8]:
# Keep only a subset of the columns in a new DataFrame; spark_df is untouched.
sub_df = spark_df.select('Name', 'Department', 'Age')
sub_df.show()


+----+----------+----+
|Name|Department| Age|
+----+----------+----+
|  AA|        D1|  25|
|  BB|        D1|  30|
|  CC|        D2|  35|
|  DD|        D3|  22|
|  EE|        D2|NULL|
|NULL|      NULL|NULL|
|  FF|      NULL|  44|
|  ZZ|        D3|  23|
|  KK|        D4|NULL|
+----+----------+----+



In [9]:
# Size of the data, printed as "<row count> <column count>".
# count() runs a Spark job; the column count comes from the schema.
print(f"{spark_df.count()} {len(spark_df.columns)}")

9 5


In [10]:
# Data type of every column, as a list of (column name, type name) tuples.
spark_df.dtypes

[('Name', 'string'),
 ('Department', 'string'),
 ('Age', 'int'),
 ('experience', 'int'),
 ('salary', 'int')]

In [11]:
# Summary statistics per column: count, mean, stddev, min and max.
# describe() returns a new DataFrame, which show() then prints.
spark_df.describe().show()

25/06/20 20:38:09 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+-------+----+----------+------------------+------------------+------------------+
|summary|Name|Department|               Age|        experience|            salary|
+-------+----+----------+------------------+------------------+------------------+
|  count|   8|         7|                 6|                 8|                 8|
|   mean|NULL|      NULL|29.833333333333332|             2.375|             76.75|
| stddev|NULL|      NULL| 8.471520918150805|1.0606601717798212|20.513062040702607|
|    min|  AA|        D1|                22|                 1|                46|
|    max|  ZZ|        D4|                44|                 4|               110|
+-------+----+----------+------------------+------------------+------------------+



In [12]:
# Add a derived column holding each row's experience plus two.
# Rows whose experience is NULL get NULL in the new column.
spark_df = spark_df.withColumn(
    'experience_after_2_years',
    spark_df.experience + 2,
)
spark_df.show()

+----+----------+----+----------+------+------------------------+
|Name|Department| Age|experience|salary|experience_after_2_years|
+----+----------+----+----------+------+------------------------+
|  AA|        D1|  25|         1|    70|                       3|
|  BB|        D1|  30|         3|   100|                       5|
|  CC|        D2|  35|         4|   110|                       6|
|  DD|        D3|  22|         1|    60|                       3|
|  EE|        D2|NULL|         3|    75|                       5|
|NULL|      NULL|NULL|      NULL|  NULL|                    NULL|
|  FF|      NULL|  44|         3|    73|                       5|
|  ZZ|        D3|  23|         2|    46|                       4|
|  KK|        D4|NULL|         2|    80|                       4|
+----+----------+----+----------+------+------------------------+



In [13]:
# Drop the 'experience_after_2_years' column from spark_df.
# drop() returns a new DataFrame, so the result is assigned back to spark_df.
spark_df=spark_df.drop('experience_after_2_years')
spark_df.show()

+----+----------+----+----------+------+
|Name|Department| Age|experience|salary|
+----+----------+----+----------+------+
|  AA|        D1|  25|         1|    70|
|  BB|        D1|  30|         3|   100|
|  CC|        D2|  35|         4|   110|
|  DD|        D3|  22|         1|    60|
|  EE|        D2|NULL|         3|    75|
|NULL|      NULL|NULL|      NULL|  NULL|
|  FF|      NULL|  44|         3|    73|
|  ZZ|        D3|  23|         2|    46|
|  KK|        D4|NULL|         2|    80|
+----+----------+----+----------+------+



In [14]:
# Rename one column for display only: the result is shown but not assigned,
# so spark_df keeps its original column name.
spark_df.withColumnRenamed(existing='Name', new='Full Name').show()

+---------+----------+----+----------+------+
|Full Name|Department| Age|experience|salary|
+---------+----------+----+----------+------+
|       AA|        D1|  25|         1|    70|
|       BB|        D1|  30|         3|   100|
|       CC|        D2|  35|         4|   110|
|       DD|        D3|  22|         1|    60|
|       EE|        D2|NULL|         3|    75|
|     NULL|      NULL|NULL|      NULL|  NULL|
|       FF|      NULL|  44|         3|    73|
|       ZZ|        D3|  23|         2|    46|
|       KK|        D4|NULL|         2|    80|
+---------+----------+----+----------+------+



In [15]:
# Rename several columns in one call by passing an {old name: new name} dict.
# Chaining one withColumnRenamed() call per column is the alternative way to
# write the same rename.
new_df = spark_df.withColumnsRenamed(
    {
        'Name': 'Full Name',
        'Department': 'Sector',
    }
)
new_df.show()

+---------+------+----+----------+------+
|Full Name|Sector| Age|experience|salary|
+---------+------+----+----------+------+
|       AA|    D1|  25|         1|    70|
|       BB|    D1|  30|         3|   100|
|       CC|    D2|  35|         4|   110|
|       DD|    D3|  22|         1|    60|
|       EE|    D2|NULL|         3|    75|
|     NULL|  NULL|NULL|      NULL|  NULL|
|       FF|  NULL|  44|         3|    73|
|       ZZ|    D3|  23|         2|    46|
|       KK|    D4|NULL|         2|    80|
+---------+------+----+----------+------+



In [16]:
# Show spark_df with 'salary' renamed to 'wage'; the result is not assigned,
# so spark_df itself is unchanged.
spark_df.withColumnRenamed(existing='salary', new='wage').show()

+----+----------+----+----------+----+
|Name|Department| Age|experience|wage|
+----+----------+----+----------+----+
|  AA|        D1|  25|         1|  70|
|  BB|        D1|  30|         3| 100|
|  CC|        D2|  35|         4| 110|
|  DD|        D3|  22|         1|  60|
|  EE|        D2|NULL|         3|  75|
|NULL|      NULL|NULL|      NULL|NULL|
|  FF|      NULL|  44|         3|  73|
|  ZZ|        D3|  23|         2|  46|
|  KK|        D4|NULL|         2|  80|
+----+----------+----+----------+----+



In [17]:
# Stop the Spark session now that the notebook is finished with it.

spark.stop()