In [1]:
from pyspark.sql import SparkSession
# Obtain a SparkSession for this notebook: getOrCreate() returns the existing
# session if there is one, otherwise it builds a new one with the app name 'miss'.
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

# Path of the CSV file that is read with spark.read.csv in the next cell.
file_path = '/FileStore/tables/ContainsNull_1_-020da.csv'

In [2]:
# Load the CSV into a DataFrame: the first line supplies the column names
# (header=True) and the column types are inferred from the data (inferSchema=True).
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

In [3]:
# Print the DataFrame's rows to the cell output (show() prints at most 20 rows by default).
df.show()

In [4]:
# Dropping missing data
# ---------------------
# Missing data is handled through the DataFrame's `.na` functions.
# Signature of the drop command:
#
#     df.na.drop(how='any', thresh=None, subset=None)
#
# * how: 'any' or 'all'
#     'any' - drop a row if it contains any nulls.
#     'all' - drop a row only if all of its values are null.
#
# * thresh: int, default None
#     If specified, drop rows that have fewer than `thresh` non-null values.
#     This overrides the `how` parameter.
#
# * subset: optional list of column names to consider.
#
# With the defaults used here, every row that contains at least one null is dropped.
(
    df.na
    .drop()
    .show()
)

In [5]:
# Keep only the rows that have at least 2 NON-null values.
# NOTE: when `thresh` is given it takes precedence and `how` is ignored, so only
# `thresh` is passed here; combining it with `how='all'` would suggest a
# condition that Spark never applies.
df.na.drop(thresh=2).show()

In [6]:
# Drop the rows whose Name column is null; nulls in other columns are not considered.
name_present = df.na.drop(subset=["Name"])
name_present.show()

In [7]:
# how='any': drop a row if it contains any nulls.
(
    df.na
    .drop(how='any')
    .show()
)

In [8]:
# how='all': drop a row only if every one of its values is null.
(
    df.na
    .drop(how='all')
    .show()
)

In [9]:
# Filling missing values
# ----------------------
# Nulls can also be replaced with new values. When nulls occur across columns of
# several data types, Spark matches the fill value to the column type: a string
# value such as the one below only replaces nulls in string columns.
filled_with_text = df.na.fill('NEW VALUES')
filled_with_text.show()

In [10]:
# A numeric fill value only replaces nulls in numeric columns.
(
    df.na
    .fill(0)
    .show()
)

In [11]:
# It is usually best to name the columns to fill explicitly via the `subset` parameter.
names_filled = df.na.fill('No Names', subset=['Name'])
names_filled.show()

In [12]:
# Print the built-in documentation of `fill` (its arguments are the fill value and `subset`).
help(df.na.fill)

In [13]:
# Fill several columns at once with a dict that maps column name -> replacement value.
# The column names are spelled exactly as in the rest of this notebook ("Name",
# "Sales") so the call does not rely on case-insensitive column resolution.
df.na.fill({"Name": "No Name", "Sales": 0}).show()

In [14]:
# A very common practice is to fill missing values with the mean of the column.
# Step 1: aggregate the mean of 'Sales'; collect() returns the result rows as a list.
from pyspark.sql.functions import mean

sales_mean_df = df.select(mean(df['Sales']))
mean_val = sales_mean_df.collect()

In [15]:
# Take the first column of the first collected row: the mean value itself.
first_row = mean_val[0]
mean_sales = first_row[0]

In [16]:
# Replace the nulls of the Sales column with the previously computed mean.
(
    df.na
    .fill(mean_sales, subset=["Sales"])
    .show()
)