# Data Inconsistency

In [33]:
import pandas as pd
import datetime as dt

In [54]:
# Deliberately messy sample records: mixed date layouts, several spellings of
# the same country, a misspelled name, and gaps in the sales figures.
data = dict(
    date=['2021-12-01', '01-12-2022', '2022/12/01', '12-01-2021'],
    country=['USA', 'U.S.A.', 'America', 'United States'],
    name=['John Doe', 'Jonh Doe', 'Jane Doe', 'Jane Doe'],
    sales_2020=[100, 200, None, 200],
    sales_2021=[None, 150, 300, 150],
)

In [55]:
# Load the sample records into a DataFrame and preview the first rows.
df = pd.DataFrame(data=data)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,01-12-2022,U.S.A.,Jonh Doe,200.0,150.0
2,2022/12/01,America,Jane Doe,,300.0
3,12-01-2021,United States,Jane Doe,200.0,150.0


In [36]:
# Naive attempt at standardizing the date format.
# The raw column mixes several layouts, and with errors='coerce' every value
# that pd.to_datetime cannot parse silently becomes NaT (the blank dates in
# the output below).
# The attempt is only previewed through assign(): df["date"] keeps its raw
# strings, so the cells that follow still have real dates to work with when
# the notebook is run top to bottom.
naive_dates = pd.to_datetime(df["date"], errors='coerce').dt.strftime('%Y-%m-%d')
df.assign(date=naive_dates).head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,,U.S.A.,Jonh Doe,200.0,150.0
2,,America,Jane Doe,,300.0
3,,United States,Jane Doe,200.0,150.0


## Fill in the dates in a consistent manner

In [56]:
from dateutil import parser


def normalize_date(date, dayfirst=False):
    """Parse a date string written in an arbitrary layout and return it as YYYY-MM-DD.

    Parameters
    ----------
    date : str
        Date text such as '2021-12-01', '2022/12/01' or '01-12-2022'.
    dayfirst : bool, default False
        Passed straight to ``dateutil.parser.parse``. It decides how an
        ambiguous all-numeric date like '01-12-2022' is read: month first
        (January 12th) when False, day first (December 1st) when True.
        The default reproduces the parser's own default behavior.

    Returns
    -------
    str or None
        The date formatted as 'YYYY-MM-DD', or None when the value cannot be
        parsed (a message describing the failure is printed in that case).
    """
    try:
        # parser.parse returns a datetime, which can be formatted directly.
        return parser.parse(date, dayfirst=dayfirst).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError) as e:
        # ValueError: text is not a recognizable date; TypeError: input is not
        # a string (e.g. None/NaN); OverflowError: date outside supported range.
        print(f"Error parsing date: {date} - {e}")
        return None
    
# Run every raw date string through normalize_date and store the result
# back into the date column.
normalized_dates = df['date'].apply(normalize_date)
df['date'] = normalized_dates
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,2022-01-12,U.S.A.,Jonh Doe,200.0,150.0
2,2022-12-01,America,Jane Doe,,300.0
3,2021-12-01,United States,Jane Doe,200.0,150.0


In [57]:
# Harmonize country names: every alternative spelling collapses to 'USA'.
country_mapping = {
    alias: 'USA'
    for alias in ('U.S.A.', 'America', 'United States')
}

harmonized_country = df["country"].replace(country_mapping)
df["country"] = harmonized_country
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,2022-01-12,USA,Jonh Doe,200.0,150.0
2,2022-12-01,USA,Jane Doe,,300.0
3,2021-12-01,USA,Jane Doe,200.0,150.0


In [58]:
# Fix the known misspelling in the name column (wrong spelling -> right spelling).
name_fixes = {'Jonh Doe': "John Doe"}
df["name"] = df["name"].replace(name_fixes)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,2022-01-12,USA,John Doe,200.0,150.0
2,2022-12-01,USA,Jane Doe,,300.0
3,2021-12-01,USA,Jane Doe,200.0,150.0


### Remove duplicates based on the `name` column

In [59]:
# Keep only the first row seen for each name; later rows that repeat a name
# are discarded, whatever their other columns contain.
df = df.drop_duplicates(subset=["name"], keep="first")
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
2,2022-12-01,USA,Jane Doe,,300.0


In [60]:
# Resolving contradictory data
# For demonstration, assume sales_2021 should always be higher than sales_2020
# and discard every row that breaks this rule.
# A comparison involving a missing value evaluates to False, so rows with a
# missing sales figure are not flagged and are kept (as the output shows).
# The rows are filtered with the boolean mask itself rather than dropped by
# index label, so a repeated index label cannot remove a valid row as well.
not_increasing = df['sales_2021'] <= df['sales_2020']
df = df[~not_increasing]
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
2,2022-12-01,USA,Jane Doe,,300.0
