# Pandas - Cleaning Data

In [41]:

# Data cleaning means fixing bad data in your data set.
# Bad data could be:
#  - Empty cells (no value or null value)
#  - Data in wrong format (incorrect date format)
#  - Wrong data (SRD 500)
#  - Duplicates

In [1]:
# Import packages
import pandas as pd

In [10]:
# Cleaning the Data
# Wrong data: values that carry characters which do not belong there.

# Sample records laid out row by row as (Name, Age, City). The Age entries
# mix digits with stray symbols/words and one City ends with a '!'.
df_data = pd.DataFrame(
    data=[
        ['John', '$30', 'New York!'],
        ['Jennifer', '25 Years', 'Los Angeles'],
        ['Drake', 'test 35', 'Chicago'],
        ['Gill', '40', 'Houston'],
    ],
    columns=['Name', 'Age', 'City'],
)

In [11]:
# Display the sample data as created (nothing has been cleaned yet)
df_data

Unnamed: 0,Name,Age,City
0,John,$30,New York!
1,Jennifer,25 Years,Los Angeles
2,Drake,test 35,Chicago
3,Gill,40,Houston


In [12]:
# Import package
import re

def keep_only_numbers(input_string):
    """Return ``input_string`` with every non-digit character removed.

    The remaining digits keep their original order and are joined together,
    so '25 Years' becomes '25' and a string without any digit becomes ''.
    """
    non_digit = re.compile(r'\D')
    return non_digit.sub('', input_string)

# Strip non-digit characters from every 'Age' entry that is text.
# Only str values are passed to keep_only_numbers; anything else (Python or
# NumPy integers, None/NaN) is left untouched. The earlier row-by-row loop
# guarded with `isinstance(value, int)`, which lets NumPy integers and missing
# values through to re.sub and raises TypeError -- e.g. when this cell is
# re-run after 'Age' has already been converted to an integer dtype.
df_data['Age'] = df_data['Age'].map(
    lambda value: keep_only_numbers(value) if isinstance(value, str) else value
)

# Display the data
df_data

Unnamed: 0,Name,Age,City
0,John,30,New York!
1,Jennifer,25,Los Angeles
2,Drake,35,Chicago
3,Gill,40,Houston


In [13]:
# Extract the first run of digits from each 'Age' value and convert it to
# integer. Casting to str first keeps this cell re-runnable: once 'Age'
# already holds integers, calling `.str` on it directly raises AttributeError.
# A value that contains no digit at all still fails on the integer cast.
df_data['Age'] = (
    df_data['Age']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Display the data
df_data

Unnamed: 0,Name,Age,City
0,John,30,New York!
1,Jennifer,25,Los Angeles
2,Drake,35,Chicago
3,Gill,40,Houston


In [None]:
# Data in wrong format (incorrect date format)

# Sample data: the 'date' strings use three different layouts (dashed,
# compact digits, slash-separated); the amounts are stored as strings.
df_data2 = pd.DataFrame(dict(
    date=['2014-01-24', '2024-01-25', '20240224', '01/05/2024'],
    Amount=['30', '250', '350', '500'],
))

df_data2

Unnamed: 0,date,Amount
0,2014-01-24,30
1,2024-01-25,250
2,20240224,350
3,01/05/2024,500


In [47]:
# Parse the date strings into datetimes. format='mixed' has pandas work out
# the layout of each element separately; the slash-separated '01/05/2024'
# comes out as 2024-01-05, i.e. it is read month-first.
df_data2['date'] = df_data2['date'].pipe(pd.to_datetime, format='mixed')
df_data2

Unnamed: 0,date,Amount
0,2014-01-24,30
1,2024-01-25,250
2,2024-02-24,350
3,2024-01-05,500


In [48]:
# Empty cells (no value or null value)

# Sample data in which the last row has no 'Amount' (None).
df_data3 = pd.DataFrame.from_dict({
    'date': ['2014-01-24', '2024-01-25', '20240224', '01/05/2024'],
    'Amount': ['30', '250', '350', None],
})

df_data3

Unnamed: 0,date,Amount
0,2014-01-24,30.0
1,2024-01-25,250.0
2,20240224,350.0
3,01/05/2024,


In [49]:
# Flag every cell that holds a missing value (True = missing)
df_data3.isna()

Unnamed: 0,date,Amount
0,False,False
1,False,False
2,False,False
3,False,True


In [50]:
# Count the missing values in each column
df_data3.isna().sum()

date      0
Amount    1
dtype: int64