# 1. What Are Missing Values in Data?
Missing values are entries in your dataset where no data is recorded for a certain variable or feature. They are often represented as:

- `NaN` (Not a Number) in NumPy/pandas
- `None` in Python
- Empty strings (`""`) or other placeholders such as `-`, `?`, or a sentinel number like `534`

# 2. Why Do Missing Values Occur?
Here are some common reasons:

📋 Data was not recorded: The user skipped a question in a survey.

❌ Error in data collection: Sensor failed to record data.

📤 Data was lost during transfer or scraping.

🧪 Not applicable: e.g., a "spouse's name" for someone who is single.




In [1]:
# import libraries
import pandas as pd
import numpy as np


In [2]:
# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the project's data directory so the notebook runs elsewhere.
dataset_path = r"D:\PandasForDataAnalysis\datasets\missing_values_dataset.csv"

# pd.read_csv never returns None: a missing file raises FileNotFoundError from
# inside the call. Catch that and re-raise with the notebook's friendlier
# message (plus the offending path) instead of testing the result for None.
try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"File Path Not Found. Please Check The File Path: {dataset_path}"
    ) from err

# Show the first 5 rows.
df.head()


Unnamed: 0,Name,City,Occupation,Salary,Age
0,Grace,Phoenix,,70000.0,25.0
1,David,New York,Lawyer,80000.0,45.0
2,,Los Angeles,Engineer,,50.0
3,Hannah,Houston,,70000.0,40.0
4,Eva,New York,Artist,70000.0,40.0


In [None]:
# Drop columns (example usage; replace the placeholder column names):
#   one column:        new_df = df.drop("column1", axis=1)
#   several columns:   new_df = df.drop(["Column1", "Column2"], axis=1)

In [6]:
# Count the missing values in each column.
# isnull() marks every missing cell as True (df.notnull() is the inverse);
# summing that boolean mask column-wise gives the per-column count.
# A bare `df.isnull()` in the middle of a cell is computed and then discarded
# (only the last expression of a cell is displayed), so it is not evaluated here.
sum_of_null = df.isnull().sum()
print(f"Sum of null values:\n {sum_of_null}")

Sum of null values:
 Name          5
City          8
Occupation    3
Salary        6
Age           4
dtype: int64


In [7]:
# Inspect the missing values of a single column ("Name").
name_missing_values = df["Name"].isnull()        # per-row boolean mask
sum_of_name_missing = name_missing_values.sum()  # each True counts as 1

# The banner text is stored once so the closing rule can match its width.
name_report_header = "******************* Missing Values of Name Column *******************"

print(name_report_header)
print(f"Is there missing values? \n {name_missing_values}\n")
print(f"Total missing values of name column: {sum_of_name_missing}\n")
print("*" * len(name_report_header))

******************* Missing Values of Name Column *******************
Is there missing values? 
 0     False
1     False
2      True
3     False
4     False
5     False
6     False
7     False
8     False
9      True
10     True
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29     True
30     True
31    False
32    False
33    False
34    False
35    False
36    False
37    False
38    False
39    False
Name: Name, dtype: bool

Total missing values of name column: 5

*********************************************************************


In [9]:
# Rows whose "Age" is missing: build the boolean mask first, then filter with it.
age_is_missing = df["Age"].isna()
age_nan_rows = df.loc[age_is_missing]
print(age_nan_rows)

      Name         City Occupation   Salary  Age
12     Eva  Los Angeles     Lawyer  60000.0  NaN
15  Hannah      Houston     Artist  50000.0  NaN
27    Ivan          NaN   Engineer  90000.0  NaN
37     Eva          NaN   Engineer  60000.0  NaN


In [11]:
# "Name" and "Age" of the rows whose "Salary" is missing, selected in a single
# .loc call (row mask + column list) instead of two chained [] lookups.
salary_is_missing = df["Salary"].isna()
salary_nan_rows = df.loc[salary_is_missing, ["Name", "Age"]]
print(salary_nan_rows)

     Name   Age
2     NaN  50.0
8   Grace  50.0
13  David  40.0
25   Jane  25.0
34  David  35.0
39  Grace  50.0


In [None]:
# dropna(): axis=0 (the default) drops rows, axis=1 drops columns,
#   e.g. df.dropna(axis=1) drops every column that contains a missing value.
# how="any" drops a row as soon as one of its values is missing.
# dropna() returns a new DataFrame, so df itself is left untouched.
df_copy = df.dropna(how="any")
df_copy.head(10)

Unnamed: 0,Name,City,Occupation,Salary,Age
1,David,New York,Lawyer,80000.0,45.0
4,Eva,New York,Artist,70000.0,40.0
5,Grace,Houston,Artist,50000.0,40.0
7,Charlie,Los Angeles,Artist,90000.0,40.0
11,Hannah,Phoenix,Lawyer,90000.0,30.0
14,Hannah,Houston,Engineer,70000.0,25.0
16,Charlie,Houston,Artist,60000.0,50.0
17,Frank,Phoenix,Engineer,60000.0,25.0
18,Eva,Chicago,Engineer,80000.0,25.0
20,Hannah,New York,Engineer,70000.0,35.0


In [None]:
# how="all" drops a row only when every one of its values is missing.
# dropna() returns a new DataFrame, so df itself is left untouched.
df_copy2 = df.dropna(how="all")
print(df_copy2)

       Name         City Occupation   Salary   Age
0     Grace      Phoenix        NaN  70000.0  25.0
1     David     New York     Lawyer  80000.0  45.0
2       NaN  Los Angeles   Engineer      NaN  50.0
3    Hannah      Houston        NaN  70000.0  40.0
4       Eva     New York     Artist  70000.0  40.0
5     Grace      Houston     Artist  50000.0  40.0
6      Jane          NaN     Doctor  70000.0  40.0
7   Charlie  Los Angeles     Artist  90000.0  40.0
8     Grace  Los Angeles     Doctor      NaN  50.0
9       NaN     New York   Engineer  70000.0  50.0
10      NaN  Los Angeles     Lawyer  50000.0  35.0
11   Hannah      Phoenix     Lawyer  90000.0  30.0
12      Eva  Los Angeles     Lawyer  60000.0   NaN
13    David      Houston     Artist      NaN  40.0
14   Hannah      Houston   Engineer  70000.0  25.0
15   Hannah      Houston     Artist  50000.0   NaN
16  Charlie      Houston     Artist  60000.0  50.0
17    Frank      Phoenix   Engineer  60000.0  25.0
18      Eva      Chicago   Engi

In [None]:
# subset= limits the missing-value check to the listed columns.

# Drop a row only if BOTH "Age" and "Salary" are missing; keep the first 20 rows.
age_salary_cols = ["Age", "Salary"]
subset_example1 = df.dropna(how="all", subset=age_salary_cols).head(20)
print(subset_example1, "\n")

print("*" * 75)

# Drop a row if EITHER "Name" or "Occupation" is missing.
name_occupation_cols = ["Name", "Occupation"]
subset_example2 = df.dropna(how="any", subset=name_occupation_cols)
print(subset_example2, "\n")


       Name         City Occupation   Salary   Age
0     Grace      Phoenix        NaN  70000.0  25.0
1     David     New York     Lawyer  80000.0  45.0
2       NaN  Los Angeles   Engineer      NaN  50.0
3    Hannah      Houston        NaN  70000.0  40.0
4       Eva     New York     Artist  70000.0  40.0
5     Grace      Houston     Artist  50000.0  40.0
6      Jane          NaN     Doctor  70000.0  40.0
7   Charlie  Los Angeles     Artist  90000.0  40.0
8     Grace  Los Angeles     Doctor      NaN  50.0
9       NaN     New York   Engineer  70000.0  50.0
10      NaN  Los Angeles     Lawyer  50000.0  35.0
11   Hannah      Phoenix     Lawyer  90000.0  30.0
12      Eva  Los Angeles     Lawyer  60000.0   NaN
13    David      Houston     Artist      NaN  40.0
14   Hannah      Houston   Engineer  70000.0  25.0
15   Hannah      Houston     Artist  50000.0   NaN
16  Charlie      Houston     Artist  60000.0  50.0
17    Frank      Phoenix   Engineer  60000.0  25.0
18      Eva      Chicago   Engi

In [None]:
# thresh=N keeps only the rows that have at least N non-missing values.
thresh_separator = "*" * 50

# Keep rows with at least 4 non-missing values.
thresh_example1 = df.dropna(thresh=4)
print(thresh_example1, "\n")
print(thresh_separator)

# Keep rows with at least 5 non-missing values.
# NOTE(review): `thres_example2` is spelled without the second "h", unlike
# `thresh_example1`; the name is kept in case other cells reference it.
thres_example2 = df.dropna(thresh=5)
print(thres_example2)
print(thresh_separator)