In [1]:
import pandas as pd
import numpy as np

In [3]:
# Toy grade book, one row per student. Frank's StudyHours entry is
# deliberately np.nan so the cells below have a missing value to find and handle.
data = {
    "StudentID": [101, 102, 103, 104, 105, 106, 107],
    "Name": ["Alice", "Bob", "Charlie", "David", "Emily", "Frank", "Grace"],
    "Subject": ["Math", "Science", "Math", "English", "Science", "Math", "English"],
    "StudyHours": [2.5, 4.0, 1.5, 3.0, 5.5, np.nan, 2.0],
    "Grade": [85, 92, 78, 88, 95, 65, 81],
}

# Each dict key becomes a column; the default RangeIndex (0..6) labels the rows.
grades_df = pd.DataFrame(data=data)
print(grades_df)

   StudentID     Name  Subject  StudyHours  Grade
0        101    Alice     Math         2.5     85
1        102      Bob  Science         4.0     92
2        103  Charlie     Math         1.5     78
3        104    David  English         3.0     88
4        105    Emily  Science         5.5     95
5        106    Frank     Math         NaN     65
6        107    Grace  English         2.0     81


1. Identifying the missing data

In [4]:
# Preview the first five rows (five is also head()'s default).
grades_df.head(n=5)

Unnamed: 0,StudentID,Name,Subject,StudyHours,Grade
0,101,Alice,Math,2.5,85
1,102,Bob,Science,4.0,92
2,103,Charlie,Math,1.5,78
3,104,David,English,3.0,88
4,105,Emily,Science,5.5,95


In [5]:
# info() writes its summary to stdout itself and returns None, so it is called
# directly: wrapping it in print() only appends a stray "None" line to the output.
# The "Non-Null Count" column is the quick way to spot columns with missing values.
grades_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   StudentID   7 non-null      int64  
 1   Name        7 non-null      object 
 2   Subject     7 non-null      object 
 3   StudyHours  6 non-null      float64
 4   Grade       7 non-null      int64  
dtypes: float64(1), int64(2), object(2)
memory usage: 412.0+ bytes
None


In [7]:
# Boolean mask with the same shape as grades_df: True marks a missing cell.
print(grades_df.isna())

   StudentID   Name  Subject  StudyHours  Grade
0      False  False    False       False  False
1      False  False    False       False  False
2      False  False    False       False  False
3      False  False    False       False  False
4      False  False    False       False  False
5      False  False    False        True  False
6      False  False    False       False  False


In [8]:
# Summing the boolean mask column-wise counts the missing cells per column.
print(grades_df.isna().sum())

StudentID     0
Name          0
Subject       0
StudyHours    1
Grade         0
dtype: int64


2. Handling missing values

Strategy 1: Dropping the Missing Data

In [9]:
# Option A: Reassignment (Recommended)
# dropna() returns a new frame without the rows that contain any missing value;
# grades_df itself is left untouched, so later cells still see the NaN.
grades_dropped_df = grades_df.dropna(axis=0, how="any")
print(grades_dropped_df)

   StudentID     Name  Subject  StudyHours  Grade
0        101    Alice     Math         2.5     85
1        102      Bob  Science         4.0     92
2        103  Charlie     Math         1.5     78
3        104    David  English         3.0     88
4        105    Emily  Science         5.5     95
6        107    Grace  English         2.0     81


In [None]:
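# Option B: Using inplace (use with caution: this mutates grades_df itself,
# so the row with the missing value would be gone for the cells below that copy grades_df)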
# grades_df.dropna(inplace=True)

Strategy 2: Filling the Missing Data

In [12]:
# Work on an independent (deep) copy so that filling values never alters grades_df.
grades_df_copy = grades_df.copy(deep=True)
print(grades_df_copy)

   StudentID     Name  Subject  StudyHours  Grade
0        101    Alice     Math         2.5     85
1        102      Bob  Science         4.0     92
2        103  Charlie     Math         1.5     78
3        104    David  English         3.0     88
4        105    Emily  Science         5.5     95
5        106    Frank     Math         NaN     65
6        107    Grace  English         2.0     81


a. Filling the Missing Data with a calculated value

In [13]:
# Mean of the recorded study hours; NaN entries are skipped, so the missing
# value does not affect the average.
study_hours_mean = grades_df["StudyHours"].mean(skipna=True)

In [14]:
# Replace the missing StudyHours entries in the copy with the column mean.
# fillna() returns a new Series, so it is assigned back to the column.
grades_df_copy["StudyHours"] = grades_df_copy["StudyHours"].fillna(value=study_hours_mean)

In [15]:
# Show the copy after the fill: the StudyHours cell that was NaN should now hold the mean.
print(grades_df_copy)

   StudentID     Name  Subject  StudyHours  Grade
0        101    Alice     Math    2.500000     85
1        102      Bob  Science    4.000000     92
2        103  Charlie     Math    1.500000     78
3        104    David  English    3.000000     88
4        105    Emily  Science    5.500000     95
5        106    Frank     Math    3.083333     65
6        107    Grace  English    2.000000     81


b. Filling the Missing Data with a fixed value

In [16]:
# Fresh deep copy of the original frame (NaN still present) for the fixed-value fill.
grades_df_copy1 = grades_df.copy(deep=True)

In [17]:
# Replace the missing StudyHours entries with the constant 1; the column is
# float, so the filled cell is stored as 1.0.
grades_df_copy1["StudyHours"] = grades_df_copy1["StudyHours"].fillna(value=1)
print(grades_df_copy1)

   StudentID     Name  Subject  StudyHours  Grade
0        101    Alice     Math         2.5     85
1        102      Bob  Science         4.0     92
2        103  Charlie     Math         1.5     78
3        104    David  English         3.0     88
4        105    Emily  Science         5.5     95
5        106    Frank     Math         1.0     65
6        107    Grace  English         2.0     81
