Data Inconsistency/ Anomalies

In [1]:
import pandas as pd

In [21]:
data = {
    'date': ['2021-12-01', '01-12-2022', '2022/12/01', '12-01-2021'],
    'country': ['USA', 'U.S.A.', 'America', 'United States'],
    'name': ['Aammar', 'Amaar', 'Hamza', 'Hazma'],
    'sales_2020': [100, 200, None, 200],
    'sales_2021': [None, 150, 300, 150]
}
# make pandas dataframe
df = pd.DataFrame(data)

In [8]:
# Show data
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,01-12-2022,U.S.A.,Amaar,200.0,150.0
2,2022/12/01,America,Hamza,,300.0
3,12-01-2021,United States,Hazma,200.0,150.0


* Correct the dataformate

In [9]:
# standardize the dataformate
df['date'] = pd.to_datetime(df['date'], errors= 'coerce')
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,,U.S.A.,Amaar,200.0,150.0
2,,America,Hamza,,300.0
3,,United States,Hazma,200.0,150.0


In [10]:
# Impute the 'NaN' values in date with forward fill methord
df['date'] = df['date'].ffill()
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,Aammar,100.0,
1,2021-12-01,U.S.A.,Amaar,200.0,150.0
2,2021-12-01,America,Hamza,,300.0
3,2021-12-01,United States,Hazma,200.0,150.0


* Correct the country name


In [11]:
# Harmonize the country name
country_mapping = {'U.S.A.': 'United States', 'USA': 'United States', 'America': 'United States'}
df['country'] = df['country'].replace(country_mapping)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Aammar,100.0,
1,2021-12-01,United States,Amaar,200.0,150.0
2,2021-12-01,United States,Hamza,,300.0
3,2021-12-01,United States,Hazma,200.0,150.0


* Correct the name 


In [13]:
# Harmonize the name
name_mapping = {'Aammar': 'Umar Ajmal', 'Amaar': 'Umar Ajmal', 'Hamza' : 'Umar Ajmal', 'Hazma' : 'Umar Ajmal'}
df['name'] = df['name'].replace(name_mapping)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,Umar Ajmal,100.0,
1,2021-12-01,United States,Umar Ajmal,200.0,150.0
2,2021-12-01,United States,Umar Ajmal,,300.0
3,2021-12-01,United States,Umar Ajmal,200.0,150.0


* Correct Contradictory data

In [40]:
# Resolving Contradictory data
# For demonstration, let's assume sale's_2021 should always be higher than sales_2020
# we'll remove rows where condictions are not met.
df = df.drop(df [df['sales_2021'] < df['sales_2020']].index)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
