# Data Cleaning


## 1. Check for missing values


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the student records from the CSV that sits next to this notebook.
# NOTE: in the recorded output, row 2's Score holds a Thai text placeholder
# (meaning "absent from exam") rather than a blank, so the Score column is
# read as `object` and that row is not reported as missing by the checks below.
df = pd.read_csv(filepath_or_buffer="./mystudents.csv")
df

Unnamed: 0,Student ID,Name,Age,Score,Grade,Province
0,5010110405,Jane,31,53.75,D,Songkla
1,5110110580,Joe,32,37.65,E,Bangkok
2,6310110042,Dan,19,ขาดสอบ,,Chiangmai
3,6310110073,Tida,18,65,C+,Bangkok
4,6310110076,Jane,21,62,C,Yala
5,6310110092,Smile,20,60.5,C,Chiangrai
6,6310110107,Cole,18,,,Phuket
7,6310110109,Tommy,19,51.75,D,Bangkok
8,6310110145,Jerry,19,64.75,C,Nan
9,6310110147,Anna,18,28.25,E,Krabi


In [3]:
# Element-wise presence mask: True where a value exists, False where it is missing.
df.notna()

Unnamed: 0,Student ID,Name,Age,Score,Grade,Province
0,True,True,True,True,True,True
1,True,True,True,True,True,True
2,True,True,True,True,False,True
3,True,True,True,True,True,True
4,True,True,True,True,True,True
5,True,True,True,True,True,True
6,True,True,True,False,False,True
7,True,True,True,True,True,True
8,True,True,True,True,True,True
9,True,True,True,True,True,True


In [4]:
# Boolean mask over the Score column: True marks a missing score.
# Row 2 shows False in the recorded output because its Score is a text
# placeholder, not an empty value.
df["Score"].isna()

0     False
1     False
2     False
3     False
4     False
5     False
6      True
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
Name: Score, dtype: bool

In [5]:
# Number of missing Score values (each True in the mask counts as 1).
df["Score"].isna().sum()

1

In [6]:
# Print a per-column summary of dtypes and non-null counts.
# In the recorded output the frame has 20 rows; Score is `object` with
# 19 non-null values and Grade has 18 non-null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Student ID  20 non-null     int64 
 1   Name        20 non-null     object
 2   Age         20 non-null     int64 
 3   Score       19 non-null     object
 4   Grade       18 non-null     object
 5   Province    20 non-null     object
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


## 2. Fill missing values


In [7]:
# Tiny demo frame for the fill examples: Sam's mark is deliberately missing.
student_dict = dict(
    name=["Joe", "Sam", "Harry"],
    age=[20, 21, 19],
    marks=[85.10, np.nan, 91.54],
)
df = pd.DataFrame(data=student_dict)
df

Unnamed: 0,name,age,marks
0,Joe,20,85.1
1,Sam,21,
2,Harry,19,91.54


In [8]:
print("Replace NaN with 0")
# fillna returns a new frame here; `df` itself keeps its NaN.
df1 = df.fillna(value=0)
df1

Replace NaN with 0


Unnamed: 0,name,age,marks
0,Joe,20,85.1
1,Sam,21,0.0
2,Harry,19,91.54


In [9]:
# Same fill as the previous cell, but applied to `df` itself via inplace=True
# instead of producing a new frame. The following cell rebuilds the demo frame
# from scratch so the missing mark is available again.
df.fillna(value=0, inplace=True)
df

Unnamed: 0,name,age,marks
0,Joe,20,85.1
1,Sam,21,0.0
2,Harry,19,91.54


In [10]:
# Recreate the demo frame so Sam's mark is NaN again for the next examples.
student_dict = dict(
    name=["Joe", "Sam", "Harry"],
    age=[20, 21, 19],
    marks=[85.10, np.nan, 91.54],
)
df = pd.DataFrame(data=student_dict)
df

Unnamed: 0,name,age,marks
0,Joe,20,85.1
1,Sam,21,
2,Harry,19,91.54


In [11]:
# Per-column fill values, keyed by column name.
# "total mark" is not a column of `df`, and no such column appears in the
# recorded result; "age" has no entry here.
change_dict = {
    "name": "John Doe",
    "marks": -100,
    "total mark": 0,
}
df1 = df.fillna(change_dict)
df1

Unnamed: 0,name,age,marks
0,Joe,20,85.1
1,Sam,21,-100.0
2,Harry,19,91.54


In [12]:
# Ask for row label 3, which `df` does not have: the recorded output shows the
# new row filled with NaN in every column and `age` displayed as floats.
df1 = df.reindex(index=[0, 1, 2, 3])
df1

Unnamed: 0,name,age,marks
0,Joe,20.0,85.1
1,Sam,21.0,
2,Harry,19.0,91.54
3,,,


In [13]:
# Fill the reindexed frame column by column. Only "name" and "marks" have
# replacement values, so in the recorded output the new row's `age` stays NaN.
change_dict = {
    "name": "John Doe",
    "marks": -100,
    "total mark": 0,
}
df1 = df1.fillna(change_dict)
df1

Unnamed: 0,name,age,marks
0,Joe,20.0,85.1
1,Sam,21.0,-100.0
2,Harry,19.0,91.54
3,John Doe,,-100.0
