# How to drop rows in PySpark?
###  Drop rows with condition in pyspark are accomplished by dropping –  
###  Rows having null values, dropping duplicate rows and dropping rows by specific conditions in a where clause .etc
#####   Drop rows having NA or missing values 
#####   Drop rows with Null values using- where condition
#####   Drop duplicate rows in pyspark
#####   Drop Duplicate rows by keeping the first occurrence 
#####   Drop duplicate rows by keeping the last occurrence 
#####   Drop rows with conditions using where clause
#####  Drop duplicate rows by a specific column


In [4]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
conf = pyspark.SparkConf().setAppName("Drop Rows in Pyspark").setMaster("local")
from pyspark.sql import SQLContext
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sql_c = SQLContext(sc)

In [5]:
df = sql_c.read.csv('Concrete_Data.csv', header=True)

In [6]:
df.show(4)

+------+------+-------+------+----------------+-------+------+----+------+
|Cement| Blast|Fly Ash| Water|Superplasticizer|     CA|    FA| Age|   CMS|
+------+------+-------+------+----------------+-------+------+----+------+
|540.0 |  0.0 |   0.0 |162.0 |            2.5 |1040.0 |676.0 | 28 |79.99 |
|540.0 |  0.0 |   0.0 |162.0 |            2.5 |1055.0 |676.0 | 28 |61.89 |
|332.5 |142.5 |   0.0 |228.0 |            0.0 | 932.0 |594.0 |270 |40.27 |
|332.5 |142.5 |   0.0 |228.0 |            0.0 | 932.0 |594.0 |365 |41.05 |
+------+------+-------+------+----------------+-------+------+----+------+
only showing top 4 rows



In [7]:
### Drop rows with NA or missing values in pyspark
df=df.dropna()


In [9]:
# df.show()

In [11]:
# Drop rows with NA or missing values
# df.na.drop().show()

In [12]:
# drop duplicate rows
from pyspark.sql import Row
df = df.dropDuplicates()
df.show(5)


+------+-----+-------+------+----------------+-------+------+---+------+
|Cement|Blast|Fly Ash| Water|Superplasticizer|     CA|    FA|Age|   CMS|
+------+-----+-------+------+----------------+-------+------+---+------+
|194.7 | 0.0 | 100.5 |165.6 |            7.5 |1006.4 |905.9 |56 |33.96 |
|230.0 | 0.0 | 118.3 |195.5 |            4.6 |1029.4 |758.6 |28 |24.48 |
|213.8 |98.1 |  24.5 |181.7 |            6.7 |1066.0 |785.5 | 3 |13.18 |
|251.8 | 0.0 |  99.9 |146.1 |           12.4 |1006.0 |899.8 |56 |44.14 |
|252.3 | 0.0 |  98.8 |146.3 |           14.2 | 987.8 |889.0 |14 |42.29 |
+------+-----+-------+------+----------------+-------+------+---+------+
only showing top 5 rows



In [13]:
# Drop duplicate rows by a specific column
df.dropDuplicates((['Blast'])).show(5)

+------+------+-------+------+----------------+-------+------+---+------+
|Cement| Blast|Fly Ash| Water|Superplasticizer|     CA|    FA|Age|   CMS|
+------+------+-------+------+----------------+-------+------+---+------+
|286.3 |200.9 |   0.0 |144.7 |           11.2 |1004.6 |803.7 | 3 |24.40 |
|342.0 | 38.0 |   0.0 |228.0 |            0.0 | 932.0 |670.0 |90 |50.46 |
|272.8 |105.1 |  81.8 |209.7 |            9.0 | 904.0 |679.7 |28 |37.17 |
|148.5 |139.4 | 108.6 |192.7 |            6.1 | 892.4 |780.0 |28 |23.70 |
|446.0 | 24.0 |  79.0 |162.0 |           11.6 | 967.0 |712.0 | 3 |25.02 |
+------+------+-------+------+----------------+-------+------+---+------+
only showing top 5 rows



In [16]:
# Drop rows with conditions using where clause
df=df.where("Water == 162.0")

df.show(5)


+------+-----+-------+-----+----------------+---+---+---+---+
|Cement|Blast|Fly Ash|Water|Superplasticizer| CA| FA|Age|CMS|
+------+-----+-------+-----+----------------+---+---+---+---+
+------+-----+-------+-----+----------------+---+---+---+---+



In [None]:
# Rerefence: https://www.datasciencemadesimple.com/get-substring-of-the-column-in-pyspark/