In [None]:
import numpy as np
import pandas as pd

In [None]:
# Handling missing data
#
# Toy employee records; each column has exactly one missing value (np.nan)
# so the cleaning methods below have something to act on.
employee = dict(
    emp_id=[101, 102, np.nan, 104],
    emp_name=["Siva", np.nan, "priya", "Sankari"],
    dept=[np.nan, "Java", "Java", "Data"],
    salary=[25000, 30000, np.nan, 40000],
)


In [None]:
# Build the frame from the column-oriented dict (one key per column).
df = pd.DataFrame(data=employee)
df

Unnamed: 0,emp_id,emp_name,dept,salary
0,101.0,Siva,,25000.0
1,102.0,,Java,30000.0
2,,priya,Java,
3,104.0,Sankari,Data,40000.0


In [None]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
emp_id,1
emp_name,1
dept,1
salary,1


In [None]:
# Replace missing values column by column with a fixed placeholder.
df = df.fillna(
    value={
        'emp_id': 0,
        'emp_name': 'Name missing',
        'dept': 'Dept missing',
        'salary': 0,
    }
)
df

Unnamed: 0,emp_id,emp_name,dept,salary
0,101.0,Siva,Dept missing,25000.0
1,102.0,Name missing,Java,30000.0
2,0.0,priya,Java,0.0
3,104.0,Sankari,Data,40000.0


In [None]:
# Dropping missing data
#
# `df` had every NaN replaced by the fillna cell above, so calling
# `df.dropna()` on it would drop nothing. Start again from the raw
# `employee` records so the effect of dropna is actually visible.
#
# dropna() defaults to axis=0 (drop every row that contains a NaN);
# pass axis=1 to drop the columns that contain a NaN instead.
pd.DataFrame(employee).dropna()

Unnamed: 0,emp_id,emp_name,dept,salary
0,101.0,Siva,Dept missing,25000.0
1,102.0,Name missing,Java,30000.0
2,0.0,priya,Java,0.0
3,104.0,Sankari,Data,40000.0


In [None]:
# Same raw employee records again (one np.nan per column), used as a fresh
# starting point for the forward-fill / backward-fill examples.
employee = dict(
    emp_id=[101, 102, np.nan, 104],
    emp_name=["Siva", np.nan, "priya", "Sankari"],
    dept=[np.nan, "Java", "Java", "Data"],
    salary=[25000, 30000, np.nan, 40000],
)

In [None]:
# Rebuild the frame from the raw records so the missing values are back.
df = pd.DataFrame(data=employee)
df

Unnamed: 0,emp_id,emp_name,dept,salary
0,101.0,Siva,,25000.0
1,102.0,,Java,30000.0
2,,priya,Java,
3,104.0,Sankari,Data,40000.0


In [None]:
# Forward fill: each NaN takes the last valid value above it in the same
# column. A NaN in the first row has nothing above it and stays NaN.
df = df.ffill(axis=0)
df

Unnamed: 0,emp_id,emp_name,dept,salary
0,101.0,Siva,,25000.0
1,102.0,Siva,Java,30000.0
2,102.0,priya,Java,30000.0
3,104.0,Sankari,Data,40000.0


In [None]:
# Backward fill: each remaining NaN takes the next valid value below it in
# the same column.
df = df.bfill(axis=0)
df

Unnamed: 0,emp_id,emp_name,dept,salary
0,101.0,Siva,Java,25000.0
1,102.0,Siva,Java,30000.0
2,102.0,priya,Java,30000.0
3,104.0,Sankari,Data,40000.0


In [None]:
# Toy daily sales records; Region, Unit_sold and Unit_price each have one
# missing value (np.nan), Date is complete.
sales = dict(
    Date=["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04"],
    Region=["East", "West", np.nan, "North"],
    Unit_sold=[10, np.nan, 15, 12],
    Unit_price=[300, 250, 400, np.nan],
)

In [None]:
# From here on `df` holds the sales records, not the employee records.
df = pd.DataFrame(data=sales)
df

Unnamed: 0,Date,Region,Unit_sold,Unit_price
0,2024-01-01,East,10.0,300.0
1,2024-01-02,West,,250.0
2,2024-01-03,,15.0,400.0
3,2024-01-04,North,12.0,


In [None]:
# Keep only the rows where exactly 12 units were sold (boolean-mask selection).
df.loc[df["Unit_sold"] == 12]

Unnamed: 0,Date,Region,Unit_sold,Unit_price
3,2024-01-04,North,12.0,


In [None]:
# Count the missing values in each column of the sales frame
# (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
Date,0
Region,1
Unit_sold,1
Unit_price,1


In [None]:
# Number of rows that repeat an earlier row in full; the first occurrence
# of each row is not counted (keep="first" is the pandas default).
df.duplicated(keep="first").sum()

np.int64(0)

In [None]:
# Fill missing Unit_price values with the column mean.
# Series.mean() skips NaN, so the mean is taken over the known prices only.
df = df.assign(
    Unit_price=df["Unit_price"].fillna(df["Unit_price"].mean())
)
df

Unnamed: 0,Date,Region,Unit_sold,Unit_price
0,2024-01-01,East,10.0,300.0
1,2024-01-02,West,,250.0
2,2024-01-03,,15.0,400.0
3,2024-01-04,North,12.0,316.666667


In [None]:
# Drop rows where Region is missing.
#
# Rebind the result instead of passing inplace=True: the in-place form has no
# performance benefit, returns None (so it cannot be chained), and mutates a
# frame that an earlier cell already displayed.
# The surviving rows keep their original index labels.
df = df.dropna(subset=["Region"])
df

Unnamed: 0,Date,Region,Unit_sold,Unit_price
0,2024-01-01,East,10.0,300.0
1,2024-01-02,West,,250.0
3,2024-01-04,North,12.0,316.666667


In [None]:
# Keep only the rows with more than 10 units sold. A NaN Unit_sold compares
# as False, so such rows are excluded.
df.loc[df["Unit_sold"] > 10]

Unnamed: 0,Date,Region,Unit_sold,Unit_price
3,2024-01-04,North,12.0,316.666667
