In [14]:
import pandas as pd
import numpy as np

In [15]:
# Build a small in-memory sample dataset (nothing is read from a CSV file here).
# None and np.nan mark the missing entries that the rest of the notebook works with.
data = {
    "Name": ["Amit", "Neha", "Ravi", "Pooja", None],
    "Age": [20, 21, np.nan, 22, 23],
    "Marks": [85, np.nan, 78, None, 90],
    "City": ["Delhi", "Mumbai", None, "Pune", "Bangalore"]
}
df = pd.DataFrame(data)

# Summary of the DataFrame: dtypes and non-null counts per column.
# info() prints its report directly, so it does not need to be the last line of the cell.
df.info()

# Preview the first few rows. A cell only renders its last expression, so head()
# is placed last; anywhere else in the cell its result would be discarded.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    4 non-null      object 
 1   Age     4 non-null      float64
 2   Marks   3 non-null      float64
 3   City    4 non-null      object 
dtypes: float64(2), object(2)
memory usage: 292.0+ bytes


# Finding Missing Data

In [None]:
# Detect missing values: isna() returns a DataFrame of the same shape, with True for
# missing values and False for non-missing values.
# isnull() is an alias for isna(). A cell only renders its last expression, so instead of
# calling both and discarding the first result, check that they agree and display one mask.
assert df.isnull().equals(df.isna())
df.isna()

Unnamed: 0,Name,Age,Marks,City
0,False,False,False,False
1,False,False,True,False
2,False,True,False,True
3,False,False,True,False
4,True,False,False,False


In [None]:
df.notnull() # Opposite of isnull()/isna(): True where a value is present, False where it is missing

Unnamed: 0,Name,Age,Marks,City
0,True,True,True,True
1,True,True,False,True
2,True,False,True,False
3,True,True,False,True
4,False,True,True,True


In [None]:
# 

# Removing Missing Data

In [5]:
# Removing rows with missing values
df.dropna() # Removes every row that contains at least one missing value; only fully populated rows remain

Unnamed: 0,Name,Age,Marks,City
0,Amit,20.0,85.0,Delhi


In [None]:
 # Removing rows based on a specific column
df.dropna(subset=['Age']) # Removes only the rows where 'Age' is missing; missing values in other columns are left in place

Unnamed: 0,Name,Age,Marks,City
0,Amit,20.0,85.0,Delhi
1,Neha,21.0,,Mumbai
3,Pooja,22.0,,Pune
4,,23.0,90.0,Bangalore


In [7]:
# Removing rows only if all values are missing
df.dropna(how='all') # Removes a row only when every value in it is missing; no row of df is entirely empty, so all five rows are kept

Unnamed: 0,Name,Age,Marks,City
0,Amit,20.0,85.0,Delhi
1,Neha,21.0,,Mumbai
2,Ravi,,78.0,
3,Pooja,22.0,,Pune
4,,23.0,90.0,Bangalore


In [8]:
# Removing rows if any value is missing
df.dropna(how='any') # Removes a row as soon as any one of its values is missing; only the fully populated row remains

Unnamed: 0,Name,Age,Marks,City
0,Amit,20.0,85.0,Delhi


In [9]:
# Drop columns (rather than rows) that contain missing values.
# Every column of df has at least one missing entry, so the result keeps
# only the index and no columns at all.
df.dropna(axis="columns")

0
1
2
3
4


In [10]:
# Drop columns that fall below a minimum number of non-missing values.
# With thresh=4 a column survives only if it holds at least 4 non-missing values,
# so 'Marks' (3 non-null values) is removed while the other columns stay.
df.dropna(axis="columns", thresh=4)

Unnamed: 0,Name,Age,City
0,Amit,20.0,Delhi
1,Neha,21.0,Mumbai
2,Ravi,,
3,Pooja,22.0,Pune
4,,23.0,Bangalore


In [11]:
# Removing missing data based on the percentage of values that are present.
# `thresh` is a count of non-missing values and is documented as an int, so turn the
# 50% fraction into a whole number of rows. Rounding up keeps the "at least 50%" meaning
# when the row count is odd (5 rows * 0.5 = 2.5 -> 3 non-missing values required).
threshold = int(np.ceil(len(df) * 0.5))
df.dropna(thresh=threshold, axis=1) # Keeps the columns that have at least 50% non-missing values

Unnamed: 0,Name,Age,Marks,City
0,Amit,20.0,85.0,Delhi
1,Neha,21.0,,Mumbai
2,Ravi,,78.0,
3,Pooja,22.0,,Pune
4,,23.0,90.0,Bangalore


In [None]:
# Removing missing data within a recent time window.
# df has no 'Date' column and dropna() accepts no `last` argument, so attach an
# illustrative date index (one row every 3 days) and build the filter explicitly.
ts_df = df.set_index(pd.date_range("2024-01-01", periods=len(df), freq="3D", name="Date"))

# Rows dated within the 7 days leading up to the most recent timestamp.
window_start = ts_df.index.max() - pd.Timedelta(days=7)
in_last_7_days = ts_df.index > window_start

# Drop only the recent rows that contain a missing value; older rows are kept
# unchanged even if they have gaps.
has_missing = ts_df.isna().any(axis=1)
ts_df[~(in_last_7_days & has_missing)]

In [13]:
# Drop missing data after applying a filter.
# Step 1: keep only the students older than 21 (rows with a missing 'Age' never match).
# Step 2: from that subset, discard the rows that have no 'Marks' value.
filtered_df = df.loc[df['Age'].gt(21)]
filtered_df.dropna(subset=['Marks'])

Unnamed: 0,Name,Age,Marks,City
4,,23.0,90.0,Bangalore
