In [67]:
#Importing libraries
import pandas as pd
import numpy as np
import re
import os

In [69]:
# Importing datasets 
dataset_path = r"C:\Users\ayesh\Desktop\DWH Project"

files = {
    "Album": "Album.csv",
    "Album_Errors": "Album_with_errors.csv",
    "Artist": "Artist.csv",
    "Artist_Errors": "Artist_with_errors.csv",
    "Customer": "Customer.csv",
    "Customer_Errors": "Customer_with_errors.csv",
    "Employee": "Employee.csv",
    "Genre": "Genre.csv",
    "Invoice": "Invoice.csv",
    "InvoiceLine": "InvoiceLine.csv",
    "MediaType": "MediaType.csv",
    "Playlist": "Playlist.csv",
    "PlaylistTrack": "PlaylistTrack.csv",
    "Track": "Track.csv"
}

# Load all datasets
dataframes = {}
for name, filename in files.items():
    file_path = os.path.join(dataset_path, filename)
    if os.path.isfile(file_path):
        try:
            df = pd.read_csv(file_path)
            dataframes[name] = df
            print(f"✅ {name} loaded successfully. Shape: {df.shape}")
        except Exception as e:
            print(f"❌ Error loading {name}: {e}")
    else:
        print(f"❌ File not found: {file_path}")

print("\n✅ All datasets loaded successfully.")


✅ Album loaded successfully. Shape: (347, 3)
✅ Album_Errors loaded successfully. Shape: (352, 3)
✅ Artist loaded successfully. Shape: (275, 2)
✅ Artist_Errors loaded successfully. Shape: (279, 2)
✅ Customer loaded successfully. Shape: (59, 13)
✅ Customer_Errors loaded successfully. Shape: (59, 13)
✅ Employee loaded successfully. Shape: (8, 15)
✅ Genre loaded successfully. Shape: (25, 2)
✅ Invoice loaded successfully. Shape: (412, 9)
✅ InvoiceLine loaded successfully. Shape: (2240, 5)
✅ MediaType loaded successfully. Shape: (5, 2)
✅ Playlist loaded successfully. Shape: (18, 2)
✅ PlaylistTrack loaded successfully. Shape: (8715, 2)
✅ Track loaded successfully. Shape: (3503, 9)

✅ All datasets loaded successfully.


In [75]:
# Expected schema 
expected_types = {
    'Album': {'AlbumId': 'Int64', 'Title': 'object', 'ArtistId': 'Int64'},
    'Album_Errors': {'AlbumId': 'Int64', 'Title': 'object', 'ArtistId': 'Int64'},
    'Artist': {'ArtistId': 'Int64', 'Name': 'object'},
    'Artist_Errors': {'ArtistId': 'Int64', 'Name': 'object'},
    'Customer': {
        'CustomerId': 'Int64', 'FirstName': 'object', 'LastName': 'object', 'Company': 'object',
        'Address': 'object', 'City': 'object', 'State': 'object', 'Country': 'object',
        'PostalCode': 'object', 'Phone': 'object', 'Fax': 'object', 'Email': 'object', 'SupportRepId': 'Int64'
    },
    'Customer_Errors': {
        'CustomerId': 'Int64', 'FirstName': 'object', 'LastName': 'object', 'Company': 'object',
        'Address': 'object', 'City': 'object', 'State': 'object', 'Country': 'object',
        'PostalCode': 'object', 'Phone': 'object', 'Fax': 'object', 'Email': 'object', 'SupportRepId': 'Int64'
    },
    'Employee': {
        'EmployeeId': 'Int64', 'LastName': 'object', 'FirstName': 'object', 'Title': 'object',
        'ReportsTo': 'Int64', 'BirthDate': 'datetime64[ns]', 'HireDate': 'datetime64[ns]',
        'Address': 'object', 'City': 'object', 'State': 'object', 'Country': 'object',
        'PostalCode': 'object', 'Phone': 'object', 'Fax': 'object', 'Email': 'object'
    },
    'Genre': {'GenreId': 'Int64', 'Name': 'object'},
    'Invoice': {
        'InvoiceId': 'Int64', 'CustomerId': 'Int64', 'InvoiceDate': 'datetime64[ns]',
        'BillingAddress': 'object', 'BillingCity': 'object', 'BillingState': 'object',
        'BillingCountry': 'object', 'BillingPostalCode': 'object', 'Total': 'float64'
    },
    'InvoiceLine': {
        'InvoiceLineId': 'Int64', 'InvoiceId': 'Int64', 'TrackId': 'Int64', 'UnitPrice': 'float64', 'Quantity': 'Int64'
    },
    'MediaType': {'MediaTypeId': 'Int64', 'Name': 'object'},
    'Playlist': {'PlaylistId': 'Int64', 'Name': 'object'},
    'PlaylistTrack': {'PlaylistId': 'Int64', 'TrackId': 'Int64'},
    'Track': {
        'TrackId': 'Int64', 'Name': 'object', 'AlbumId': 'Int64', 'MediaTypeId': 'Int64',
        'GenreId': 'Int64', 'Composer': 'object', 'Milliseconds': 'Int64', 'Bytes': 'Int64', 'UnitPrice': 'float64'
    }
}

print("=== Starting Data Type Audit ===")
for ds_name, df in dataframes.items():
    print(f"\nChecking dataset: {ds_name}")
    if ds_name not in expected_types:
        print("  No schema info available. Skipping.")
        continue
    schema = expected_types[ds_name]
    for col, expected_dtype in schema.items():
        if col not in df.columns:
            print(f"  ❌ Missing expected column: {col}")
            continue
        actual_dtype = str(df[col].dtype)
        if expected_dtype == 'datetime64[ns]':
            # Check convertibility to datetime
            try:
                pd.to_datetime(df[col], errors='raise')
            except:
                print(f"  ⚠️ Column '{col}' NOT convertible to datetime.")
        elif actual_dtype != expected_dtype:
            print(f"  ⚠️ Column '{col}' type mismatch: expected {expected_dtype}, found {actual_dtype}.")
print("\n=== Data Type Audit Completed ===")


=== Starting Data Type Audit ===

Checking dataset: Album

Checking dataset: Album_Errors

Checking dataset: Artist

Checking dataset: Artist_Errors

Checking dataset: Customer

Checking dataset: Customer_Errors

Checking dataset: Employee

Checking dataset: Genre

Checking dataset: Invoice

Checking dataset: InvoiceLine

Checking dataset: MediaType

Checking dataset: Playlist

Checking dataset: PlaylistTrack

Checking dataset: Track

=== Data Type Audit Completed ===


In [77]:
def fix_data_types(df, schema):
    for col, dtype in schema.items():
        if col not in df.columns:
            continue
        try:
            if dtype == 'Int64':
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
            elif dtype == 'float64':
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
            elif dtype == 'datetime64[ns]':
                df[col] = pd.to_datetime(df[col], errors='coerce')
            else:
                df[col] = df[col].astype(dtype)
        except Exception as e:
            print(f"Failed to convert {col} in {df.name if hasattr(df, 'name') else 'DataFrame'}: {e}")
    return df

for ds_name, df in dataframes.items():
    if ds_name in expected_types:
        print(f"Fixing data types for {ds_name}...")
        dataframes[ds_name] = fix_data_types(df, expected_types[ds_name])
print("Data types fixed where needed.")


Fixing data types for Album...
Fixing data types for Album_Errors...
Fixing data types for Artist...
Fixing data types for Artist_Errors...
Fixing data types for Customer...
Fixing data types for Customer_Errors...
Fixing data types for Employee...
Fixing data types for Genre...
Fixing data types for Invoice...
Fixing data types for InvoiceLine...
Fixing data types for MediaType...
Fixing data types for Playlist...
Fixing data types for PlaylistTrack...
Fixing data types for Track...
Data types fixed where needed.


In [79]:
#Missing values

print("=== Missing Values Audit ===")
for ds_name, df in dataframes.items():
    print(f"\nDataset: {ds_name}")
    missing_counts = df.isnull().sum()
    if missing_counts.sum() == 0:
        print("  No missing values detected.")
    else:
        print(missing_counts[missing_counts > 0])
print("\n=== Missing Values Audit Completed ===")


=== Missing Values Audit ===

Dataset: Album
  No missing values detected.

Dataset: Album_Errors
AlbumId     5
Title       1
ArtistId    4
dtype: int64

Dataset: Artist
  No missing values detected.

Dataset: Artist_Errors
ArtistId    1
Name        5
dtype: int64

Dataset: Customer
Company       49
State         29
PostalCode     4
Phone          1
Fax           47
dtype: int64

Dataset: Customer_Errors
Company       49
State         29
PostalCode     4
Phone          1
Fax           47
dtype: int64

Dataset: Employee
ReportsTo    1
dtype: int64

Dataset: Genre
  No missing values detected.

Dataset: Invoice
BillingState         202
BillingPostalCode     28
dtype: int64

Dataset: InvoiceLine
  No missing values detected.

Dataset: MediaType
  No missing values detected.

Dataset: Playlist
  No missing values detected.

Dataset: PlaylistTrack
  No missing values detected.

Dataset: Track
Composer    978
dtype: int64

=== Missing Values Audit Completed ===


In [81]:
#Fix missing values where critical (PK/FK columns), fill others

def fix_missing_values(df):
    # Drop rows where any PK or FK columns are missing
    pk_fk_cols = [col for col in df.columns if 'Id' in col or 'Key' in col]
    df.dropna(subset=pk_fk_cols, inplace=True)
    # For non-key columns, fill missing strings with 'Unknown' and numbers with 0
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('Unknown')
        elif pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(0)
        else:
            # For datetime or other types, leave NaNs for now
            pass
    return df

for ds_name, df in dataframes.items():
    print(f"Fixing missing values in {ds_name}...")
    dataframes[ds_name] = fix_missing_values(df)
print("Missing values fixed.")


Fixing missing values in Album...
Fixing missing values in Album_Errors...
Fixing missing values in Artist...
Fixing missing values in Artist_Errors...
Fixing missing values in Customer...
Fixing missing values in Customer_Errors...
Fixing missing values in Employee...
Fixing missing values in Genre...
Fixing missing values in Invoice...
Fixing missing values in InvoiceLine...
Fixing missing values in MediaType...
Fixing missing values in Playlist...
Fixing missing values in PlaylistTrack...
Fixing missing values in Track...
Missing values fixed.


In [83]:
#Duplicates
primary_key_cols = {
    "Album": "AlbumId",
    "Album_Errors": "AlbumId",
    "Artist": "ArtistId",
    "Artist_Errors": "ArtistId",
    "Customer": "CustomerId",
    "Customer_Errors": "CustomerId",
    "Employee": "EmployeeId",
    "Genre": "GenreId",
    "Invoice": "InvoiceId",
    "InvoiceLine": "InvoiceLineId",
    "MediaType": "MediaTypeId",
    "Playlist": "PlaylistId",
    "PlaylistTrack": None,  # composite key, no single PK
    "Track": "TrackId"
}

print("=== Duplicate Primary Key Audit ===")
for ds_name, pk_col in primary_key_cols.items():
    if pk_col is None:
        print(f"{ds_name}: Skipping composite or no single PK.")
        continue
    df = dataframes[ds_name]
    dup_count = df[pk_col].duplicated().sum()
    print(f"{ds_name} - Duplicates in {pk_col}: {dup_count}")
print("=== Duplicate Audit Completed ===")


=== Duplicate Primary Key Audit ===
Album - Duplicates in AlbumId: 0
Album_Errors - Duplicates in AlbumId: 5
Artist - Duplicates in ArtistId: 0
Artist_Errors - Duplicates in ArtistId: 5
Customer - Duplicates in CustomerId: 0
Customer_Errors - Duplicates in CustomerId: 0
Employee - Duplicates in EmployeeId: 0
Genre - Duplicates in GenreId: 0
Invoice - Duplicates in InvoiceId: 0
InvoiceLine - Duplicates in InvoiceLineId: 0
MediaType - Duplicates in MediaTypeId: 0
Playlist - Duplicates in PlaylistId: 0
PlaylistTrack: Skipping composite or no single PK.
Track - Duplicates in TrackId: 0
=== Duplicate Audit Completed ===


In [85]:
# Fix duplicates

for ds_name, pk_col in primary_key_cols.items():
    if pk_col is None:
        continue
    df = dataframes[ds_name]
    print(f"Removing duplicates in {ds_name} based on {pk_col}...")
    dataframes[ds_name] = df.drop_duplicates(subset=[pk_col], keep='first')
print("Duplicate primary keys removed.")


Removing duplicates in Album based on AlbumId...
Removing duplicates in Album_Errors based on AlbumId...
Removing duplicates in Artist based on ArtistId...
Removing duplicates in Artist_Errors based on ArtistId...
Removing duplicates in Customer based on CustomerId...
Removing duplicates in Customer_Errors based on CustomerId...
Removing duplicates in Employee based on EmployeeId...
Removing duplicates in Genre based on GenreId...
Removing duplicates in Invoice based on InvoiceId...
Removing duplicates in InvoiceLine based on InvoiceLineId...
Removing duplicates in MediaType based on MediaTypeId...
Removing duplicates in Playlist based on PlaylistId...
Removing duplicates in Track based on TrackId...
Duplicate primary keys removed.


In [87]:
#Audit foreign key integrity

fk_relations = {
    "Album": ("ArtistId", "Artist", "ArtistId"),
    "Track": ("AlbumId", "Album", "AlbumId"),
    "Invoice": ("CustomerId", "Customer", "CustomerId"),
    "InvoiceLine": ("InvoiceId", "Invoice", "InvoiceId"),
    "InvoiceLine": ("TrackId", "Track", "TrackId"),
    "Track": ("MediaTypeId", "MediaType", "MediaTypeId"),
    "Track": ("GenreId", "Genre", "GenreId"),
    "PlaylistTrack": ("PlaylistId", "Playlist", "PlaylistId"),
    "PlaylistTrack": ("TrackId", "Track", "TrackId")
}

print("=== Foreign Key Integrity Audit ===")
for child, (fk_col, parent, pk_col) in fk_relations.items():
    child_df = dataframes[child]
    parent_df = dataframes[parent]
    invalid_fk = child_df[~child_df[fk_col].isin(parent_df[pk_col])]
    print(f"{child} referencing {parent} on {fk_col}: {len(invalid_fk)} invalid rows")
print("=== Foreign Key Audit Completed ===")


=== Foreign Key Integrity Audit ===
Album referencing Artist on ArtistId: 0 invalid rows
Track referencing Genre on GenreId: 0 invalid rows
Invoice referencing Customer on CustomerId: 0 invalid rows
InvoiceLine referencing Track on TrackId: 0 invalid rows
PlaylistTrack referencing Track on TrackId: 0 invalid rows
=== Foreign Key Audit Completed ===


In [89]:
#Fixing foreign key integrity by removing invalid row

for child, (fk_col, parent, pk_col) in fk_relations.items():
    child_df = dataframes[child]
    parent_df = dataframes[parent]
    print(f"Removing invalid foreign keys in {child} referencing {parent} on {fk_col}...")
    dataframes[child] = child_df[child_df[fk_col].isin(parent_df[pk_col])]
print("Foreign key integrity fixed.")



Removing invalid foreign keys in Album referencing Artist on ArtistId...
Removing invalid foreign keys in Track referencing Genre on GenreId...
Removing invalid foreign keys in Invoice referencing Customer on CustomerId...
Removing invalid foreign keys in InvoiceLine referencing Track on TrackId...
Removing invalid foreign keys in PlaylistTrack referencing Track on TrackId...
Foreign key integrity fixed.


In [91]:
#Date Format Check

date_columns = {
    "Employee": ["BirthDate", "HireDate"],
    "Invoice": ["InvoiceDate"]
}

print("=== Date Format Audit ===")
for ds_name, cols in date_columns.items():
    df = dataframes[ds_name]
    for col in cols:
        invalid_dates = pd.to_datetime(df[col], errors='coerce').isna()
        count_invalid = invalid_dates.sum()
        print(f"{ds_name} - {col} invalid dates count: {count_invalid}")
print("=== Date Audit Completed ===")


=== Date Format Audit ===
Employee - BirthDate invalid dates count: 0
Employee - HireDate invalid dates count: 0
Invoice - InvoiceDate invalid dates count: 0
=== Date Audit Completed ===


In [93]:
#Basic outlier check for key numeric columns

for ds_name, cols in date_columns.items():
    df = dataframes[ds_name]
    for col in cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
        df = df.dropna(subset=[col])
    dataframes[ds_name] = df
print("Date columns fixed.")


Date columns fixed.


In [97]:
#Basic outlier check for key numeric columns

numeric_checks = {
    "InvoiceLine": ["UnitPrice", "Quantity"],
    "Track": ["Milliseconds", "Bytes", "UnitPrice"],
    "Invoice": ["Total"]
}

print("=== Numeric Outlier Audit (Basic) ===")
for ds_name, cols in numeric_checks.items():
    df = dataframes[ds_name]
    for col in cols:
        if col in df.columns:
            invalid_neg = (df[col] < 0).sum()
            print(f"{ds_name} - {col} negative values count: {invalid_neg}")
print("=== Numeric Outlier Audit Completed ===")


=== Numeric Outlier Audit (Basic) ===
InvoiceLine - UnitPrice negative values count: 0
InvoiceLine - Quantity negative values count: 0
Track - Milliseconds negative values count: 0
Track - Bytes negative values count: 0
Track - UnitPrice negative values count: 0
Invoice - Total negative values count: 0
=== Numeric Outlier Audit Completed ===


In [99]:
#Fix by removing negative numeric values

for ds_name, cols in numeric_checks.items():
    df = dataframes[ds_name]
    for col in cols:
        if col in df.columns:
            df = df[df[col] >= 0]
    dataframes[ds_name] = df
print("Negative numeric values removed.")


Negative numeric values removed.


In [101]:
#Checking email format validity (Emails, Phones, Postal Codes)

import re

def check_email_format(df, col='Email'):
    if col not in df.columns:
        return 0
    invalid_emails = df[~df[col].astype(str).str.match(r'^[\w\.-]+@[\w\.-]+\.\w{2,4}$', na=False)]
    return len(invalid_emails)

print("=== Email Format Audit ===")
for ds_name in ['Customer', 'Customer_Errors', 'Employee']:
    df = dataframes[ds_name]
    invalid_count = check_email_format(df)
    print(f"{ds_name}: {invalid_count} invalid emails")
print("=== Email Audit Completed ===")



=== Email Format Audit ===
Customer: 0 invalid emails
Customer_Errors: 0 invalid emails
Employee: 0 invalid emails
=== Email Audit Completed ===


In [103]:
#Fixing emails by replacing invalids with NaN then fillna

def fix_email_format(df, col='Email'):
    if col not in df.columns:
        return df
    df.loc[~df[col].astype(str).str.match(r'^[\w\.-]+@[\w\.-]+\.\w{2,4}$', na=False), col] = np.nan
    df[col].fillna('Unknown', inplace=True)
    return df

for ds_name in ['Customer', 'Customer_Errors', 'Employee']:
    df = dataframes[ds_name]
    dataframes[ds_name] = fix_email_format(df)
print("Emails fixed.")


Emails fixed.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave

In [107]:
#Simple cleaning (remove non-numeric for phones, trim, fill unknown)

def clean_phone_postal(df):
    phone_cols = ['Phone', 'Fax']
    postal_cols = ['PostalCode', 'BillingPostalCode']
    for col in phone_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(r'\D+', '', regex=True).replace({'': 'Unknown'})
    for col in postal_cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.strip().replace({'': 'Unknown'})
    return df

for ds_name in ['Customer', 'Employee', 'Invoice']:
    df = dataframes.get(ds_name)
    if df is not None:
        print(f"Cleaning phones and postals in {ds_name}...")
        dataframes[ds_name] = clean_phone_postal(df)

print("Phones and postal codes cleaned.")


Cleaning phones and postals in Customer...
Cleaning phones and postals in Employee...
Cleaning phones and postals in Invoice...
Phones and postal codes cleaned.


In [111]:
#Company Name Standardization

import string

# Company Name Standardization
def standardize_company(df):
    if 'Company' in df.columns:
        df['Company'] = df['Company'].astype(str).str.lower().str.strip()
        df['Company'] = df['Company'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    return df

for ds_name in ['Customer', 'Customer_Errors']:
    df = dataframes.get(ds_name)
    if df is not None:
        print(f"Standardizing company names in {ds_name}...")
        dataframes[ds_name] = standardize_company(df)

print("Company names standardized.")



Standardizing company names in Customer...
Standardizing company names in Customer_Errors...
Company names standardized.


In [113]:
#State Standardization

state_map = {
    'California': 'CA',
    'New York': 'NY',
    'Texas': 'TX',
    # Add full mapping as needed
}

def standardize_state(df):
    if 'State' in df.columns:
        df['State'] = df['State'].map(state_map).fillna(df['State'])
    return df

for ds_name in ['Customer', 'Employee', 'Invoice']:
    df = dataframes.get(ds_name)
    if df is not None:
        print(f"Standardizing states in {ds_name}...")
        dataframes[ds_name] = standardize_state(df)

print("States standardized.")


Standardizing states in Customer...
Standardizing states in Employee...
Standardizing states in Invoice...
States standardized.


In [115]:
#Remove Special Characters From Text

def remove_special_chars(df, cols):
    for col in cols:
        if col in df.columns:
            df[col] = df[col].astype(str).str.replace(r'[^\x00-\x7F]+', '', regex=True)
    return df

for name, df in dataframes.items():
    dataframes[name] = remove_special_chars(df, text_columns)

print("Special characters removed from text columns.")


Special characters removed from text columns.
