In [None]:
#Working with Missing Data in Pandas

#Missing data occurs when no information is provided for one or more items or for a whole unit. Missing data is a very big problem in real-life scenarios. In pandas, missing values are also referred to as NA (Not Available) values. Many datasets simply arrive with missing data, either because the data exists but was not collected or because it never existed.

In [None]:
#In Pandas, missing data is represented by two values:

None: None is a Python singleton object that is often used for missing data in Python code.
NaN : NaN (an acronym for Not a Number) is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation.

#Pandas treats None and NaN as essentially interchangeable for indicating missing or null values. To facilitate this convention, there are several useful functions for detecting, removing, and replacing null values in a Pandas DataFrame:

isnull()
notnull()
dropna()
fillna()
replace()
interpolate()

In [1]:
# Checking for missing values using isnull()
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# isnull() returns a same-shaped boolean frame: True where a value is missing.
df.isnull()


Unnamed: 0,First Score,Second Score,Third Score
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [2]:
# Checking for non-missing values using notnull(): it returns a DataFrame of
# booleans that is True where a value is present and False where it is NaN.
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# notnull() is the element-wise inverse of isnull().
df.notnull()


Unnamed: 0,First Score,Second Score,Third Score
0,True,True,False
1,True,True,True
2,False,True,True
3,True,False,True


In [4]:
# Keep only the rows whose "Gender" value is present (not NaN)
import pandas as pd

# Read the CSV file into a DataFrame.
data = pd.read_csv("employees.csv")

# Boolean Series: True where "Gender" is NOT NaN, False where it is missing.
bool_series = data["Gender"].notnull()

# Boolean-mask the frame so only rows with a non-missing "Gender" are shown.
data[bool_series]
 


FileNotFoundError: [Errno 2] No such file or directory: 'employees.csv'

In [5]:
# Filling missing values with a constant using fillna()
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# fillna(0) returns a new frame with every NaN replaced by 0; `df` itself
# is left unchanged because the result is not assigned back.
df.fillna(0)


Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,0.0
1,90.0,45.0,40.0
2,0.0,56.0,80.0
3,95.0,0.0,98.0


In [6]:
# Filling null values with the previous ones (forward fill)
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# ffill() replaces each NaN with the closest preceding value in its column.
# It is the supported spelling of the deprecated fillna(method='pad').
# A NaN in the first row has no predecessor, so it stays NaN.
df.ffill()


Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,
1,90.0,45.0,40.0
2,90.0,56.0,80.0
3,95.0,56.0,98.0


In [7]:
# Filling null values with the next ones (backward fill)
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, 45, 56, np.nan],
    'Third Score': [np.nan, 40, 80, 98],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# bfill() replaces each NaN with the closest following value in its column.
# It is the supported spelling of the deprecated fillna(method='bfill').
# A NaN in the last row has no successor, so it stays NaN.
df.bfill()


Unnamed: 0,First Score,Second Score,Third Score
0,100.0,30.0,40.0
1,90.0,45.0,40.0
2,95.0,56.0,80.0
3,95.0,,98.0


In [10]:
# Working with null values in a CSV file: load the file and preview a slice
import pandas as pd

# Read the CSV file into a DataFrame.
data = pd.read_csv("employees.csv")

# Show the rows at positions 10 through 24 (the end of the slice, 25, is
# exclusive) to inspect the data.
data.iloc[10:25]




FileNotFoundError: [Errno 2] No such file or directory: 'employees.csv'

In [14]:
# Filling null values using the replace() method
import numpy as np
import pandas as pd

# Sample data: columns A and B contain NaN entries, column C has none.
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, np.nan, 5],
    'C': [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)

# Heading and frame on separate lines.
print("Original DataFrame:", df, sep="\n")

# Swap every NaN for 0 and rebind `df` to the result.
df = df.replace(np.nan, 0)

print("\nDataFrame after replacing null values:", df, sep="\n")




Original DataFrame:
     A    B  C
0  1.0  NaN  1
1  2.0  2.0  2
2  NaN  3.0  3
3  4.0  NaN  4
4  5.0  5.0  5

DataFrame after replacing null values:
     A    B  C
0  1.0  0.0  1
1  2.0  2.0  2
2  0.0  3.0  3
3  4.0  0.0  4
4  5.0  5.0  5


In [12]:
# Sample DataFrame with missing values, for the interpolate() (linear method) demo.
# NOTE(review): this cell only builds and displays the frame; no interpolate()
# call appears here. Something like
# df.interpolate(method='linear', limit_direction='forward') is presumably
# the intended follow-up; confirm.
import pandas as pd

# None entries mark the missing values in each column.
df = pd.DataFrame(
    data={
        "A": [12, 4, 5, None, 1],
        "B": [None, 2, 54, 3, None],
        "C": [20, 16, None, 3, 8],
        "D": [14, 3, None, None, 6],
    }
)

# Display the frame.
df


Unnamed: 0,A,B,C,D
0,12.0,,20.0,14.0
1,4.0,2.0,16.0,3.0
2,5.0,54.0,,
3,,3.0,3.0,
4,1.0,,8.0,6.0


In [15]:
# Dropping rows with at least 1 null value: build and show the sample data
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, 40, 80, 98],
    'Fourth Score': [np.nan, np.nan, np.nan, 65],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# Display the frame before any rows are dropped.
df


Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [16]:
# Now we drop rows with at least one NaN (null) value
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, 90, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, 40, 80, 98],
    'Fourth Score': [np.nan, np.nan, np.nan, 65],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# dropna() with no arguments returns a new frame without any row that
# contains a NaN; with this data only the row labelled 3 remains.
df.dropna()



Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
3,95.0,56.0,98,65.0


In [17]:
# Dropping rows if all values in that row are missing: build and show the sample data
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# The second entry of every list is NaN, so row 1 is entirely missing.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, np.nan, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, np.nan, 80, 98],
    'Fourth Score': [np.nan, np.nan, np.nan, 65],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# Display the frame before any rows are dropped.
df


Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,
1,,,,
2,,45.0,80.0,
3,95.0,56.0,98.0,65.0


In [18]:
# Now we drop the rows in which every value is missing (NaN)
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# The second entry of every list is NaN, so row 1 is entirely missing.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, np.nan, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, np.nan, 80, 98],
    'Fourth Score': [np.nan, np.nan, np.nan, 65],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# how='all' drops a row only when all of its values are NaN; rows that are
# only partially missing (0 and 2 here) are kept.
df.dropna(how='all')


Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,
2,,45.0,80.0,
3,95.0,56.0,98.0,65.0


In [19]:
# Dropping columns with at least 1 null value: build and show the sample data
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# 'Fourth Score' is the only column without a missing value.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, np.nan, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, np.nan, 80, 98],
    'Fourth Score': [60, 67, 68, 65],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# Display the frame before any columns are dropped.
df


Unnamed: 0,First Score,Second Score,Third Score,Fourth Score
0,100.0,30.0,52.0,60
1,,,,67
2,,45.0,80.0,68
3,95.0,56.0,98.0,65


In [20]:
# Now we drop the columns that have at least 1 missing value
import numpy as np
import pandas as pd

# Column name -> list of scores; np.nan marks a missing entry.
# 'Fourth Score' is the only column without a missing value.
# Named `scores` (not `dict`) so the built-in `dict` type is not shadowed
# for the rest of the kernel session.
scores = {
    'First Score': [100, np.nan, np.nan, 95],
    'Second Score': [30, np.nan, 45, 56],
    'Third Score': [52, np.nan, 80, 98],
    'Fourth Score': [60, 67, 68, 65],
}

# Build the DataFrame from the dictionary of lists.
df = pd.DataFrame(scores)

# axis=1 makes dropna() work on columns instead of rows: every column that
# contains a NaN is removed, leaving only 'Fourth Score' for this data.
df.dropna(axis=1)


Unnamed: 0,Fourth Score
0,60
1,67
2,68
3,65


In [21]:
# Dropping rows with at least 1 null value in a CSV file
import pandas as pd

# Read the CSV file into a DataFrame.
data = pd.read_csv("employees.csv")

# Build a second frame that omits every row containing at least one NA value;
# `data` itself is not modified.
new_data = data.dropna(how="any", axis="index")

# Display the filtered frame.
new_data


FileNotFoundError: [Errno 2] No such file or directory: 'employees.csv'