# Cleaning Data in Pandas DataFrames

## 1. Import libraries and dependencies

In [None]:
# Import the pandas and pathlib libraries
import pandas as pd
from pathlib import Path

## 2. Create a Path to the File Using Pathlib

In [None]:
# Use the pathlib library to build a `Path` object that points to the source CSV.
# The path is relative, so it is resolved against the current working directory.
csv_path = Path("../Resources/people_reordered.csv")

## 3. Read the CSV into a Pandas DataFrame

In [None]:
# Load the CSV located at `csv_path` into a DataFrame, then preview its first five rows
people_df = pd.read_csv(filepath_or_buffer=csv_path)
people_df.head(n=5)

## 4. View Column Data Types

In [None]:
# Use the `dtypes` attribute to list the data type pandas assigned to each column
people_df.dtypes

## 5. Drop Extraneous Columns

In [None]:
# Remove the leftover 'Unnamed: 0' column from the DataFrame itself (in place),
# then preview the first five rows to confirm it is gone
people_df.drop(labels=['Unnamed: 0'], axis='columns', inplace=True)
people_df.head(n=5)

---

## 6. Identify Data Quality Issues

### 1. Identify the Number of Rows

In [None]:
# Use the `count` function to view the count of non-null values for each column.
# A column whose count is lower than the others contains missing values.
people_df.count()

### 2. Identify Frequency Counts of the `First_Name` Column

In [None]:
# Use the `value_counts` function to list how often each distinct value
# appears in the `First_Name` column
people_df['First_Name'].value_counts()

### 3. Identify Null Values

In [None]:
# Use the `isnull` function to flag every cell as null (True) or not null (False)
people_df.isnull()

### 4. Determine the Number of Nulls

In [None]:
# Chain the `isnull` and `sum` functions to count the null values in each column
people_df.isnull().sum()

### 5. Determining the Percentage of Nulls for each Column

In [None]:
# Per-column share of nulls: null count divided by the number of rows,
# scaled to a percentage
people_df.isnull().sum().div(people_df.shape[0]).mul(100)

### 6. Check for Duplicate Rows

In [None]:
# Use the `duplicated` function to determine the existence of duplicate rows.
# Each row is flagged True if it repeats an earlier row, otherwise False.
people_df.duplicated()

### 7. Check for Duplicate `First_Name` and `Last_Name` Values

In [None]:
# Flag rows whose `First_Name`/`Last_Name` combination repeats an earlier row,
# comparing only those two columns via the `subset` parameter
people_df.duplicated(subset=['First_Name', 'Last_Name'])

---

## 7. Resolve Data Quality Issues

### 1. Fill First_Name and Last_Name Null Values with Default Value "Unnamed"

In [None]:
# Replace missing first and last names with the placeholder "Unnamed",
# handling both columns in a single `fillna` call keyed by column name
people_df.fillna(value={'First_Name': "Unnamed", 'Last_Name': "Unnamed"}, inplace=True)
people_df

### 2. Drop Remaining Records with Nulls from DataFrame

In [None]:
# Discard, in place, every row (axis 0) that still contains any null value
people_df.dropna(axis=0, how='any', inplace=True)
people_df

### 3. Check Null Counts for Each Column (Again)

In [None]:
# Use the `isnull` function in conjunction with the `sum` function to count
# the number of null values for each column. After the null handling above,
# every count is expected to be zero.
people_df.isnull().sum()

### 4. Cleanse Data by Dropping Duplicates

In [None]:
# Use the `drop_duplicates` function with the `subset` parameter to
# drop duplicates based on a selection of columns; the first occurrence of
# each `Last_Name`/`First_Name` combination is kept.
# `inplace=True` applies the change to `people_df` itself. Without it the
# de-duplicated copy is only displayed and then discarded, so the later cells
# (and the saved CSV) would still contain the duplicate rows.
people_df.drop_duplicates(subset=['Last_Name', 'First_Name'], inplace=True)
people_df

### 5. Convert Columns to Different DataTypes

In [None]:
# Use the `astype` function with a column-to-dtype mapping to convert
# `Person_ID` from `float` to `int`, leaving every other column unchanged
people_df = people_df.astype({'Person_ID': 'int'})
people_df

## 8. Save Cleansed Data to New CSV

In [None]:
# Write the cleansed DataFrame to the Resources folder.
# `index=False` keeps the row index out of the output file.
people_df.to_csv(path_or_buf=Path("../Resources/people_cleansed.csv"), index=False)