# Cleaning Data in Pandas DataFrames

## 1. Import libraries and dependencies

In [126]:
# Import the pandas and pathlib libraries
import pandas as pd
import numpy as np
from pathlib import Path

## 2. Create a Path to the File Using Pathlib

In [127]:
# Use the Pathlib libary to set the path to the CSV
data1 = Path(r'C:\Users\TribThapa\Desktop\Thapa\ResearchFellow\Courses\FinTech_Bootcamp_MonashUni2021\monu-mel-virt-fin-pt-05-2021-u-c\Activities\Week 4\2\03-data-cleaning-01\Resources\people_cleansed.csv')
data2 = Path(r'C:\Users\TribThapa\Desktop\Thapa\ResearchFellow\Courses\FinTech_Bootcamp_MonashUni2021\monu-mel-virt-fin-pt-05-2021-u-c\Activities\Week 4\2\03-data-cleaning-01\Resources\people_reordered.csv')

## 3. Read the CSV into a Pandas DataFrame

In [155]:
# Use the file path to read the CSV into a DataFrame and display a few rows
data1_df = pd.read_csv(data1)
data1_df.head()

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4,Unnamed,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47


In [129]:
data2_df = pd.read_csv(data2)
data2_df

Unnamed: 0.1,Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,3,4.0,,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47
...,...,...,...,...,...,...,...,...,...,...
995,995,996.0,Crumpton,Meta,Female,ECAM - Institut Supérieur Industriel,Registered Nurse,57060.0,mcrumptonrn@qq.com,52
996,996,997.0,Gilford,Gunar,Male,Smolny University,Marketing Manager,76109.0,ggilfordro@yandex.ru,32
997,997,998.0,Gurling,Lucretia,Female,Institut Teknologi Telkom,Software Engineer III,92115.0,lgurlingrp@de.vu,48
998,998,999.0,Yang,Andrew,Male,Rutgers University School of Business,Curriculum Engineer,60000.0,ayang@codedrills.com,53


## 4. View Column Data Types

In [130]:
# Use the `dtypes` attribute to list the column data types
data1_df.dtypes

Person_ID       int64
Last_Name      object
First_Name     object
Gender         object
University     object
Occupation     object
Salary        float64
Email          object
Age             int64
dtype: object

In [131]:
data2_df.dtypes

Unnamed: 0      int64
Person_ID     float64
Last_Name      object
First_Name     object
Gender         object
University     object
Occupation     object
Salary        float64
Email          object
Age             int64
dtype: object

## 5. Drop Extraneous Columns

In [132]:
# Use the `drop` function to drop specific columns
data2_drop_df = data2_df.drop(columns=('Unnamed: 0'))
data2_drop_df

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4.0,,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47
...,...,...,...,...,...,...,...,...,...
995,996.0,Crumpton,Meta,Female,ECAM - Institut Supérieur Industriel,Registered Nurse,57060.0,mcrumptonrn@qq.com,52
996,997.0,Gilford,Gunar,Male,Smolny University,Marketing Manager,76109.0,ggilfordro@yandex.ru,32
997,998.0,Gurling,Lucretia,Female,Institut Teknologi Telkom,Software Engineer III,92115.0,lgurlingrp@de.vu,48
998,999.0,Yang,Andrew,Male,Rutgers University School of Business,Curriculum Engineer,60000.0,ayang@codedrills.com,53


---

## 6. Identify Data Quality Issues

### 1. Identify the Number of Rows

In [133]:
# Use the `count` function to view count of non-null values for each column. Count = counts True-s and False
data1_df.notnull().count()

Person_ID     973
Last_Name     973
First_Name    973
Gender        973
University    973
Occupation    973
Salary        973
Email         973
Age           973
dtype: int64

In [134]:
data1_df.notnull().sum()

Person_ID     973
Last_Name     973
First_Name    973
Gender        973
University    973
Occupation    973
Salary        973
Email         973
Age           973
dtype: int64

In [135]:
data1_df.isnull().sum()

Person_ID     0
Last_Name     0
First_Name    0
Gender        0
University    0
Occupation    0
Salary        0
Email         0
Age           0
dtype: int64

### 2. Identify Frequency Counts of the `first_name` Column

In [136]:
# Identifying frequency counts of the `first_name` column
data1_df['First_Name'].value_counts(10)

Unnamed     0.006166
Ailbert     0.003083
Israel      0.003083
Sasha       0.002055
Joey        0.002055
              ...   
Tom         0.001028
Jordan      0.001028
Sadie       0.001028
Osbourne    0.001028
Bordie      0.001028
Name: First_Name, Length: 895, dtype: float64

### 3. Identify Null Values

In [137]:
# Checking for null
data1_df.isna().sum()

Person_ID     0
Last_Name     0
First_Name    0
Gender        0
University    0
Occupation    0
Salary        0
Email         0
Age           0
dtype: int64

### 4. Determine the Number of Nulls

In [138]:
# Determining number of nulls
data1_df.isnull().sum()

Person_ID     0
Last_Name     0
First_Name    0
Gender        0
University    0
Occupation    0
Salary        0
Email         0
Age           0
dtype: int64

### 5. Determining the Percentage of Nulls for each Column

In [139]:
# Determining percentage of nulls
(data1_df.isna().sum()) / (len(data1_df))* 100

Person_ID     0.0
Last_Name     0.0
First_Name    0.0
Gender        0.0
University    0.0
Occupation    0.0
Salary        0.0
Email         0.0
Age           0.0
dtype: float64

### 6. Check for Duplicate Rows

In [140]:
# Use the `duplicated` function to determine the existance of duplicate rows: True or False
data1_df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
968    False
969    False
970    False
971    False
972     True
Length: 973, dtype: bool

### 7. Check for Duplicate `first_name` and `last_name Values

In [141]:
# Use the `duplicated` function in conjunction with a list of columns to determine the existence of duplicate rows based on the selected columns
data1_df[data1_df.duplicated(['First_Name', 'Last_Name'])]

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
972,999,Yang,Andrew,Male,Rutgers University School of Business,Curriculum Engineer,60000.0,ayang@codedrills.com,53


In [142]:
data2_drop_df[data2_drop_df.duplicated(['First_Name', 'Last_Name'])]

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
999,999.0,Yang,Andrew,Male,Rutgers University School of Business,Curriculum Engineer,60000.0,ayang@codedrills.com,53


---

## 7. Resolve Data Quality Issues

### 1. Fill First_Name and Last_Name Null Values with Default Value "Unnamed"

In [143]:
# Cleanse nulls from DataFrame by filling na
data2_fill = data2_drop_df.fillna('Unknown')
data2_fill.shape

(1000, 9)

In [144]:
data2_fill.loc[data2_fill['Last_Name'] == 'Unknown']

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
3,4.0,Unknown,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
110,111.0,Unknown,Marie,Female,Imperial College School of Medicine,Technical Writer,87405.0,mdewi32@desdev.cn,41
236,237.0,Unknown,Kennan,Male,Northeast Normal University,Nurse Practicioner,54396.0,ktolworth6k@kickstarter.com,34
267,268.0,Unknown,Granthem,Male,Les Roches International School of Hotel Manag...,Budget/Accounting Analyst II,94363.0,gbewsey7f@stanford.edu,38
481,482.0,Unknown,Mikey,Male,Madhya Pradesh Bhoj (Open) University,Structural Analysis Engineer,51799.0,mhandsdd@adobe.com,60
498,499.0,Unknown,Dita,Female,Columbus State University,Systems Administrator I,104632.0,dpettmandu@fc2.com,22
545,546.0,Unknown,Kizzie,Female,Iceland University of Education,Sales Associate,98394.0,kmcellenf5@chicagotribune.com,55
852,853.0,Unknown,Ailbert,Male,Hampshire College,Help Desk Operator,106064.0,acamiliono@seattletimes.com,55
952,953.0,Unknown,Perren,Male,Marymount Manhattan College,Geologist IV,111209.0,pmaidensqg@usnews.com,25
989,990.0,Unknown,Urson,Male,Wilberforce University,Librarian,79603.0,uenrdigorh@gmpg.org,48


### 2. Drop Remaining Records with Nulls from DataFrame

In [145]:
# Use the `dropna` function to drop whole records that have at least one null value
data2_fill.dropna()

Unnamed: 0,Person_ID,Last_Name,First_Name,Gender,University,Occupation,Salary,Email,Age
0,1.0,Lenormand,Keriann,Female,Aurora University,Nurse Practicioner,58135.0,klenormand0@businessinsider.com,27
1,2.0,Rupke,Huntley,Male,Osaka University of Economics,Project Manager,96053.0,hrupke1@reuters.com,22
2,3.0,Dalgarnowch,Gorden,Male,Ludong University,Environmental Tech,59196.0,gdalgarnowch2@microsoft.com,40
3,4.0,Unknown,Cullie,Male,Université des Sciences et de la Technologie d...,Legal Assistant,88493.0,cputten3@nymag.com,62
4,5.0,Strangman,Ariel,Female,Boise State University,Project Manager,89073.0,astrangman4@bravesites.com,47
...,...,...,...,...,...,...,...,...,...
995,996.0,Crumpton,Meta,Female,ECAM - Institut Supérieur Industriel,Registered Nurse,57060.0,mcrumptonrn@qq.com,52
996,997.0,Gilford,Gunar,Male,Smolny University,Marketing Manager,76109.0,ggilfordro@yandex.ru,32
997,998.0,Gurling,Lucretia,Female,Institut Teknologi Telkom,Software Engineer III,92115.0,lgurlingrp@de.vu,48
998,999.0,Yang,Andrew,Male,Rutgers University School of Business,Curriculum Engineer,60000.0,ayang@codedrills.com,53


### 3. Check Null Counts for Each Column (Again)

In [146]:
# Use the `isnull` function in conjunction with the `sum` function to count the number of null values for each column
data2_fill.isnull().sum()

Person_ID     0
Last_Name     0
First_Name    0
Gender        0
University    0
Occupation    0
Salary        0
Email         0
Age           0
dtype: int64

### 4. Cleanse data by Dropping Duplicates

In [147]:
# Use the `drop_duplicates` function with the `subset` parameter to drop duplicates based on a selection of columns
data2_fill_drop = data2_fill.drop_duplicates(subset='Last_Name')
data2_fill_drop.shape

(975, 9)

### 5. Convert Columns to Different DataTypes

In [148]:
# Use the `as_type` function to convert `Person_ID` from `float` to `int`
data1_df.dtypes

Person_ID       int64
Last_Name      object
First_Name     object
Gender         object
University     object
Occupation     object
Salary        float64
Email          object
Age             int64
dtype: object

In [149]:
df_new = data1_df['Person_ID'].astype('float')

In [152]:
data1_df.dtypes

Person_ID       int64
Last_Name      object
First_Name     object
Gender         object
University     object
Occupation     object
Salary        float64
Email          object
Age             int64
dtype: object

## 8. Save Cleansed Data to New CSV

In [154]:
# Save modified DataFrame to the Resources folder. Use the `index` parameter set to `False` to exclude saving the index.
data1_df.to_csv('Task3.csv')