# Inconsistent Data Handling

## Import the Data and Libraries

In [1]:
# Import the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [41]:
# Small hand-made sample that deliberately contains the inconsistencies cleaned
# up below: mixed date formats, several spellings of one country, inconsistent
# name casing, a fully repeated row, and missing sales figures (None).
data = {
    'date': ['2021-12-01', '01-12-2022', 'dec-22-2022', '2021/12/12', '2021-12-01'],
    'country': ['USA', 'UK', 'United States of America', 'UK', 'USA'],
    'name': ['ABC', 'xYZ', 'Abc', 'PQS', 'ABC'],
    'age': [21, 22, 23, 24, 21],
    'city': ['delhi', 'chandigarh', 'noida', 'gurugram', 'delhi'],
    'sales_2023': [100, None, 300, 500, 100],
    'sales_2024': [200, 300, None, 300, 200],
}

# Each dictionary key becomes a column; each list supplies that column's rows.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,USA,ABC,21,delhi,100.0,200.0
1,01-12-2022,UK,xYZ,22,chandigarh,,300.0
2,dec-22-2022,United States of America,Abc,23,noida,300.0,
3,2021/12/12,UK,PQS,24,gurugram,500.0,300.0
4,2021-12-01,USA,ABC,21,delhi,100.0,200.0


## Standardizing the Date Format

In [42]:
# Bring every entry of the 'date' column to a single 'YYYY-MM-DD' string format.
# Step 1 (to_datetime): format='mixed' parses each entry on its own, and
#   errors='coerce' turns anything unparseable into NaT instead of raising.
# Step 2 (strftime): render the parsed timestamps back as 'YYYY-MM-DD' strings.
# NOTE: an ambiguous entry such as '01-12-2022' is read month-first here
# (it becomes 2022-01-12); pass dayfirst=True if the data is day-first.
df['date'] = (
    pd.to_datetime(df['date'], format='mixed', errors='coerce')
    .dt.strftime('%Y-%m-%d')
)

df.head()

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,USA,ABC,21,delhi,100.0,200.0
1,2022-01-12,UK,xYZ,22,chandigarh,,300.0
2,2022-12-22,United States of America,Abc,23,noida,300.0,
3,2021-12-12,UK,PQS,24,gurugram,500.0,300.0
4,2021-12-01,USA,ABC,21,delhi,100.0,200.0


## Harmonize the Country Names

In [43]:
# Preview the first rows to see the current spellings in the 'country' column.
df.head(n=5)

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,USA,ABC,21,delhi,100.0,200.0
1,2022-01-12,UK,xYZ,22,chandigarh,,300.0
2,2022-12-22,United States of America,Abc,23,noida,300.0,
3,2021-12-12,UK,PQS,24,gurugram,500.0,300.0
4,2021-12-01,USA,ABC,21,delhi,100.0,200.0


In [44]:
# List the distinct spellings present in the 'country' column so that
# variants of the same country can be spotted.
country_column = df['country']
unique_countries = country_column.unique()
print(unique_countries)

['USA' 'UK' 'United States of America']


In [45]:
# 'USA' and 'United States of America' are two spellings of the same country.
# Map every observed spelling onto one canonical name per country.
country_mapping = {
    'USA': 'United States',
    'United States of America': 'United States',
    'UK': 'United Kingdom',
}

# Apply the mapping to the column; values that are not keys of the mapping
# are left unchanged by replace().
df['country'] = df['country'].replace(to_replace=country_mapping)
df.head()

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,United States,ABC,21,delhi,100.0,200.0
1,2022-01-12,United Kingdom,xYZ,22,chandigarh,,300.0
2,2022-12-22,United States,Abc,23,noida,300.0,
3,2021-12-12,United Kingdom,PQS,24,gurugram,500.0,300.0
4,2021-12-01,United States,ABC,21,delhi,100.0,200.0


## Typographical Mistakes

In [46]:
# Normalize inconsistent casing in 'name' (e.g. 'xYZ', 'ABC') with the built-in
# str.title(): the first letter of each word is upper-cased, the rest lower-cased.
name_column = df['name']
df['name'] = name_column.str.title()
df

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,United States,Abc,21,delhi,100.0,200.0
1,2022-01-12,United Kingdom,Xyz,22,chandigarh,,300.0
2,2022-12-22,United States,Abc,23,noida,300.0,
3,2021-12-12,United Kingdom,Pqs,24,gurugram,500.0,300.0
4,2021-12-01,United States,Abc,21,delhi,100.0,200.0


## Removing the Duplicates

In [47]:
# Preview the cleaned rows; after the steps above, rows 0 and 4 are identical.
df.head(n=5)

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,United States,Abc,21,delhi,100.0,200.0
1,2022-01-12,United Kingdom,Xyz,22,chandigarh,,300.0
2,2022-12-22,United States,Abc,23,noida,300.0,
3,2021-12-12,United Kingdom,Pqs,24,gurugram,500.0,300.0
4,2021-12-01,United States,Abc,21,delhi,100.0,200.0


In [48]:
# Remove rows that are exact copies of an earlier row (every column equal),
# keeping the first occurrence of each.
# To de-duplicate on a single column instead, use
# df.drop_duplicates(subset='name').
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]
print(df)

         date         country name  age        city  sales_2023  sales_2024
0  2021-12-01   United States  Abc   21       delhi       100.0       200.0
1  2022-01-12  United Kingdom  Xyz   22  chandigarh         NaN       300.0
2  2022-12-22   United States  Abc   23       noida       300.0         NaN
3  2021-12-12  United Kingdom  Pqs   24    gurugram       500.0       300.0


## Contradictory Data

In [51]:
# Drop the rows whose 2023 sales exceed their 2024 sales (treated in this
# section as contradictory records).
# The rows are filtered with a boolean mask rather than
# df.drop(<matching index labels>): dropping by label would also remove
# non-matching rows that happen to share an index label with a matching row,
# whereas the mask removes exactly the rows that satisfy the condition.
# A comparison involving NaN evaluates to False, so rows with a missing sales
# figure are kept.
is_contradictory = df['sales_2023'] > df['sales_2024']
df = df[~is_contradictory]
df

Unnamed: 0,date,country,name,age,city,sales_2023,sales_2024
0,2021-12-01,United States,Abc,21,delhi,100.0,200.0
1,2022-01-12,United Kingdom,Xyz,22,chandigarh,,300.0
2,2022-12-22,United States,Abc,23,noida,300.0,
