In [None]:
# Working with Missing Data in Pandas
# Missing data occurs when no information is provided for one or more
# items, or for a whole unit. Missing data is a very common problem in real-life scenarios.
# In pandas, missing data is also referred to as NA (Not Available) values.
# Many datasets simply arrive with missing data,
# either because the data exists but was not collected, or because it never existed.
# For example, some surveyed users may choose not to share their income, and
# others may choose not to share their address; this is how values go missing in many datasets.
 
# Dataset : https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv

In [1]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the UFO sightings dataset and preview the first rows.
# The URL must not carry leading whitespace: depending on the Python version,
# ' https://...' may not be recognised as a URL and would then be treated as a
# local file path.
ufo = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [2]:
# (rows, columns) of the loaded frame.
ufo.shape

(18241, 5)

In [3]:
# Per-column dtype and non-null count; columns with fewer non-null entries
# than rows contain missing values.
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18241 entries, 0 to 18240
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   City             18215 non-null  object
 1   Colors Reported  2882 non-null   object
 2   Shape Reported   15597 non-null  object
 3   State            18241 non-null  object
 4   Time             18241 non-null  object
dtypes: object(5)
memory usage: 712.7+ KB


In [7]:
# Count the missing values in each column.
ufo.isna().sum()

City                  26
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [9]:
# Remove missing values: dropna() drops every row that has at least one NA.

ufo.dropna().shape
# Only 2486 of the 18241 rows remain (15755 rows are removed), which means we
# lose most of the data. This is not convenient.

(2486, 5)

In [11]:
# How do we handle these missing values without dropping rows?
# fillna() replaces NA/null values with a value we choose.
#
# Assign the result back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and does not update `ufo` when pandas Copy-on-Write is active.
ufo['City'] = ufo['City'].fillna(value='CityNotMentioned')


In [12]:
# Re-count the missing values per column.
ufo.isna().sum()

City                   0
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [13]:
# Replace missing colours with a placeholder. The result is assigned back
# because chained fillna(inplace=True) on a column selection is deprecated and
# does not update `ufo` when pandas Copy-on-Write is active.
ufo['Colors Reported'] = ufo['Colors Reported'].fillna(value='ColorsNotMentioned')


In [14]:
# Re-count the missing values per column.
ufo.isna().sum()

City                  0
Colors Reported       0
Shape Reported     2644
State                 0
Time                  0
dtype: int64

In [15]:
# Replace missing shapes with a placeholder. The result is assigned back
# because chained fillna(inplace=True) on a column selection is deprecated and
# does not update `ufo` when pandas Copy-on-Write is active.
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='ShapeNotMentioned')


In [16]:
# Re-count the missing values per column.
ufo.isna().sum()

City               0
Colors Reported    0
Shape Reported     0
State              0
Time               0
dtype: int64

In [21]:
# Inspect the first 50 values of the 'Shape Reported' column.
ufo['Shape Reported'].iloc[:50]

0              TRIANGLE
1                 OTHER
2                  OVAL
3                  DISK
4                 LIGHT
5                  DISK
6                CIRCLE
7                  DISK
8                 CIGAR
9              CYLINDER
10                LIGHT
11             FIREBALL
12               SPHERE
13                 OVAL
14                 DISK
15               CIRCLE
16    ShapeNotMentioned
17    ShapeNotMentioned
18            RECTANGLE
19                OTHER
20                 DISK
21    ShapeNotMentioned
22                LIGHT
23            RECTANGLE
24                 DISK
25                CIGAR
26                 DISK
27                 DISK
28                 DISK
29                 DISK
30               SPHERE
31             CYLINDER
32                CIGAR
33                 DISK
34             FIREBALL
35                 DISK
36            FORMATION
37               SPHERE
38                 DISK
39                 OVAL
40               SPHERE
41              

In [22]:
# Small sample dataset with gaps in 'Marks' and 'Percentage':
# https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv

data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv'
)
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [23]:
# Count the missing values in each column of the sample data.
data.isna().sum()

Id            0
Name          0
Marks         2
Percentage    4
dtype: int64

In [31]:
# Forward fill: each missing mark takes the last valid value above it.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back rather than using a chained inplace=True call, which does
# not update `data` when pandas Copy-on-Write is active.
data['Marks'] = data['Marks'].ffill()

In [32]:
# Backward fill: each missing percentage takes the next valid value below it.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and the result
# is assigned back rather than using a chained inplace=True call, which does
# not update `data` when pandas Copy-on-Write is active.
data['Percentage'] = data['Percentage'].bfill()

In [34]:
# Display the current contents of `data`.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,67.0
2,3,Alex,23.0,67.0
3,4,Alex,12.0,66.0
4,5,Alex,12.0,66.0
5,6,Alex,54.0,66.0
6,7,Alex,65.0,66.0


In [35]:
# Next technique: fill missing data with the column's mean value.
# Reload the data first so that we start again from the original gaps.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv'
)
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [37]:
# Mean of the non-missing percentages, rounded to one decimal place.
mean_val = data['Percentage'].mean()
mean_val = round(mean_val, 1)
print(mean_val)
# Assign the filled column back: chained fillna(inplace=True) on a column
# selection is deprecated and does not update `data` under Copy-on-Write.
data['Percentage'] = data['Percentage'].fillna(value=mean_val)

70.3


In [38]:
# Display the current contents of `data`.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,70.3
2,3,Alex,,67.0
3,4,Alex,12.0,70.3
4,5,Alex,,70.3
5,6,Alex,54.0,70.3
6,7,Alex,65.0,66.0


In [39]:
# interpolate(): reload the sample data so the original gaps are present again.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv'
)
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# interpolate() -- explanatory note, written below as a bare triple-quoted string.
'''
Python Pandas interpolate() method is used to fill NaN values in the DataFrame
or Series using various interpolation techniques to fill the missing values
rather than hard-coding the value.
Interpolation in Python is a technique used to estimate unknown data points
between two known data points.
'''

In [40]:
# Linearly interpolate the gaps in the numeric columns only and return a new
# frame (`data` itself is left unchanged). Calling DataFrame.interpolate() on a
# frame that still holds an object column such as 'Name' is deprecated, so the
# numeric columns are selected first and written back over the originals.
data.assign(**data.select_dtypes(include='number').interpolate())

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,72.5
2,3,Alex,17.5,67.0
3,4,Alex,12.0,66.75
4,5,Alex,33.0,66.5
5,6,Alex,54.0,66.25
6,7,Alex,65.0,66.0


## Duplicate Records

In [41]:
# Dealing with duplicate data
# Build a small DataFrame in which the ('John', 45) record appears twice.

data = dict(
    StudentName=['John', 'Smith', 'Alex', 'Bob', 'John', 'Ali'],
    Score=[45, 65, 76, 44, 45, 39],
)
df = pd.DataFrame(data)
df

Unnamed: 0,StudentName,Score
0,John,45
1,Smith,65
2,Alex,76
3,Bob,44
4,John,45
5,Ali,39


In [43]:
# Flag duplicated rows: the first occurrence is False, later repeats are True.
df.duplicated(keep='first')

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool

In [44]:
# Number of rows that repeat an earlier row.
df.duplicated(keep='first').sum()

1

In [45]:
# Show only the rows flagged as duplicates.
df.loc[df.duplicated()]

Unnamed: 0,StudentName,Score
4,John,45


In [46]:
# Show every row for the student named 'John'.
df.loc[df['StudentName'].eq('John')]

Unnamed: 0,StudentName,Score
0,John,45
4,John,45


In [47]:
# Drop repeated rows in place, keeping the first occurrence of each record.
df.drop_duplicates(keep='first', inplace=True)
df

Unnamed: 0,StudentName,Score
0,John,45
1,Smith,65
2,Alex,76
3,Bob,44
5,Ali,39


In [49]:
# Recreate the original frame, duplicate row included, for the keep='last' demo.
data = dict(
    StudentName=['John', 'Smith', 'Alex', 'Bob', 'John', 'Ali'],
    Score=[45, 65, 76, 44, 45, 39],
)
df = pd.DataFrame(data)

In [50]:
# With keep='last', the final occurrence is treated as the original and the
# earlier repeats are flagged True.
df.duplicated(subset=None, keep='last')

0     True
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [51]:
# Drop repeated rows in place, keeping the last occurrence of each record.
df.drop_duplicates(keep='last', inplace=True)
df

Unnamed: 0,StudentName,Score
1,Smith,65
2,Alex,76
3,Bob,44
4,John,45
5,Ali,39
