<img src="StandardDataQualityDimensions.png" alt="StandardDataQualityDimensions.png" style="width: 300px; float: left;"/>

In [1]:
import pandas as pd
import numpy as np
from datetime import time, datetime

# Transactions

**Accuracy**	
* product_first_sold_date

**Completeness**	
* online_order
* brand
* product_line
* product_class
* product_size
* standard_cost
* product_first_sold_date

**Consistency**	
* list_price (no dollar symbol)
* standard_cost (dollar symbol)
* customer_id has values that do not exist in Customer:customer_id

**Currency**	

**Relevancy**	
* standard_cost (purpose of this monetary value)
* list_price (purpose of this monetary value)

**Validity**	
* product_first_sold_date (stored as a float value, not as a date)

**Duplicates**	


In [4]:
# Importing datafile into dataframe.
# parse_dates converts transaction_date to datetime on load, so the min/max
# check further down compares real dates rather than strings.
df_trans = pd.read_csv('Transactions.csv', parse_dates=['transaction_date'])

In [5]:
# Displaying dataframe summary (row count, non-null count and dtype per column);
# columns with fewer non-null entries than rows have missing values
df_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  object        
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  object        
 12  product_first_sold_date  19803 n

In [6]:
# Preview the first rows to see how the values are formatted
df_trans.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,False,Approved,Solex,Standard,medium,medium,71.49,$53.62,41245.0
1,2,3,3120,2017-05-21,True,Approved,Trek Bicycles,Standard,medium,large,2091.47,$388.92,41701.0
2,3,37,402,2017-10-16,False,Approved,OHM Cycles,Standard,low,medium,1793.43,$248.82,36361.0
3,4,88,3135,2017-08-31,False,Approved,Norco Bicycles,Standard,medium,medium,1198.46,$381.10,36145.0
4,5,78,787,2017-10-01,True,Approved,Giant Bicycles,Standard,medium,large,1765.3,$709.48,42226.0


In [7]:
# Date range covered by the transactions, shown as (latest, earliest)
df_trans['transaction_date'].max(), df_trans['transaction_date'].min()

(Timestamp('2017-12-30 00:00:00'), Timestamp('2017-01-01 00:00:00'))

In [8]:
# Convert standard_cost to float datatype

# The raw values are strings such as '$1,759.85': strip the dollar sign and the
# thousands separator, then convert.
# regex=False treats '$' and ',' as literal characters; '$' is otherwise the regex
# end-of-string anchor, so the result no longer depends on the pandas version's
# default for `regex`.
# The dtype guard keeps the cell re-runnable: once the column is numeric it has no
# .str accessor and a second run would otherwise raise.
if not pd.api.types.is_numeric_dtype(df_trans['standard_cost']):
    df_trans['standard_cost'] = (
        df_trans['standard_cost']
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

In [9]:
# Re-display the summary to confirm the dtype of standard_cost after the conversion
df_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  object        
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [10]:
# Are the transaction ids exactly 1..N, in row order?
trans_ids = df_trans['transaction_id'].to_list()
compare_trans_ids = range(1, len(df_trans) + 1)

list(compare_trans_ids) == trans_ids

True

In [11]:
# Checking for all possible order_status values
# (looking for misspelled or unexpected categories)
df_trans['order_status'].unique()

array(['Approved', 'Cancelled'], dtype=object)

In [12]:
# Checking for all possible product_class values
# (nan in the result marks rows where product_class is missing)
df_trans['product_class'].unique()

array(['medium', 'low', 'high', nan], dtype=object)

In [13]:
# Checking for all possible online_order values
# (booleans plus nan for missing entries, which is why the column dtype is object)
df_trans['online_order'].unique()

array([False, True, nan], dtype=object)

In [14]:
# Checking for all possible product_size values
# (nan in the result marks rows where product_size is missing)
df_trans['product_size'].unique()

array(['medium', 'large', 'small', nan], dtype=object)

In [15]:
# Determine the range for list_price, shown as (min, max)
df_trans['list_price'].min(), df_trans['list_price'].max()

(12.01, 2091.47)

In [16]:
# Determine the range for standard_cost, shown as (min, max).
# A numeric comparison relies on standard_cost having been converted to float earlier.
df_trans['standard_cost'].min(), df_trans['standard_cost'].max()

(7.21, 1759.85)

In [17]:
# Checking for all possible product_line values
# (nan in the result marks rows where product_line is missing)
df_trans['product_line'].unique()

array(['Standard', 'Road', 'Mountain', 'Touring', nan], dtype=object)

In [18]:
# Checking for all possible brand values
# (nan in the result marks rows where brand is missing)
df_trans['brand'].unique()

array(['Solex', 'Trek Bicycles', 'OHM Cycles', 'Norco Bicycles',
       'Giant Bicycles', 'WeareA2B', nan], dtype=object)

In [19]:
# Checking for duplicate rows
# (duplicated() flags rows that repeat an earlier row in every column;
# an empty result means there are no fully duplicated rows)

df_trans[df_trans.duplicated()]

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date


In [22]:
# Checking for duplicated transaction_ids
# (rows whose transaction_id already appeared in an earlier row)

df_trans[df_trans['transaction_id'].duplicated()]        

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date


# CustomerAddress


**Accuracy**	
* address

**Completeness**	

**Consistency**	
* state (some values are abbreviated)

**Currency**	

**Relevancy**	
* property_valuation

**Validity**	

**Duplicates**	


In [23]:
# Importing datafile into dataframe (column names come from the CSV header row)

df_cust_addr = pd.read_csv('CustomerAddress.csv')

In [24]:
# Displaying dataframe summary (row count, non-null count and dtype per column)

df_cust_addr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 140.7+ KB


In [25]:
# Preview the first rows to see how the values are formatted
df_cust_addr.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,New South Wales,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9


In [26]:
# Verify if data is for one country (a single unique value means yes)

df_cust_addr['country'].unique()

array(['Australia'], dtype=object)

In [27]:
# Determine if customer_ids are in sequential order (exactly 1..N in row order)
customer_ids = df_cust_addr['customer_id'].to_list()
compare_cust_ids = range(1, df_cust_addr.shape[0]+1, 1)

# Compare this table's own ids. The transaction ids belong to a different table
# (with a different row count), so comparing those would always be False and
# would say nothing about customer_id.
customer_ids == list(compare_cust_ids)

False

In [30]:
# Checking for all possible state values
# (shows whether full state names and abbreviations are mixed)
df_cust_addr['state'].unique()

array(['New South Wales', 'QLD', 'VIC', 'NSW', 'Victoria'], dtype=object)

In [28]:
# Look for duplicate rows
# (duplicated() flags rows that repeat an earlier row in every column)

df_cust_addr[df_cust_addr.duplicated()]

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation


In [29]:
# Checking for duplicate customer_id values
# (rows whose customer_id already appeared in an earlier row)

df_cust_addr[df_cust_addr['customer_id'].duplicated()]

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation


# Customer Demographic


**Accuracy**	
* default                              

**Completeness**	
* last_name
* DOB
* job_title
* job_industry_category
* default
* tenure

**Consistency**	
* gender
* deceased_indicator (uses N/Y)
* owns_car (uses No / Yes)

**Currency**	

**Relevancy**	
* default
* tenure

**Validity**	

**Duplicates**	



In [None]:
# NOTE(review): an earlier approach generated placeholder column names (col0..col12)
# because the data file did not contain column names. That code is no longer used:
# read_csv below takes the names from the file's first row, and later cells access
# columns such as 'gender' and 'DOB' by name - confirm the CSV has that header row.

# Importing datafile into dataframe
df_cust_demo = pd.read_csv('CustomerDemographic.csv') 

In [None]:
# Displaying dataframe summary (row count, non-null count and dtype per column)

df_cust_demo.info()

In [None]:
# Checking for all possible gender values (expecting Male and Female;
# any other spellings or abbreviations are a consistency issue)

df_cust_demo['gender'].unique()

In [None]:
# Checking for any unreasonable values in past_3_years_bike_related_purchases

df_cust_demo['past_3_years_bike_related_purchases'].unique()

In [None]:
# Checking for any unreasonable values in job_title

df_cust_demo['job_title'].unique()

In [None]:
# Checking for all possible deceased_indicator values

df_cust_demo['deceased_indicator'].unique()

In [None]:
# Checking for all possible owns_car values

df_cust_demo['owns_car'].unique()

In [None]:
# Checking how often each tenure value occurs
# (value_counts() leaves out missing values by default)

df_cust_demo['tenure'].value_counts()

In [None]:
# Preview the first rows to see how the values are formatted
df_cust_demo.head()

In [None]:
# Displaying the dataframe summary again for reference (non-null counts and dtypes)
df_cust_demo.info()

In [None]:
# Checking for duplicated customer_ids
# (returns only the customer_id values that repeat an earlier one)

df_cust_demo['customer_id'][df_cust_demo['customer_id'].duplicated()]

In [None]:
# The DOB column contains more than one date format, so a custom function extracts
# the birth year and derives the age from it. One customer comes out implausibly old
# (177 years when this was written); that outlier has to be discarded or verified
# with the company providing the data.

def convertYear(dob):
    """Return an approximate age in years for a date-of-birth value.

    The age is the current calendar year minus the birth year. Month and day are
    ignored, so the result can be one year too high for someone whose birthday
    has not yet occurred this year.

    Parameters
    ----------
    dob : str, date-like object, or missing value
        * A string containing '/' is read as year-last (e.g. day/month/year).
        * Any other string is read as year-first, '-'-separated (e.g. year-month-day).
        * An object with an integer ``year`` attribute (``datetime``, ``date``,
          pandas ``Timestamp``) contributes that year directly.
        * Anything else (NaN, None, NaT, ...) is returned unchanged so missing
          values stay missing.

    Returns
    -------
    int
        The age, when a birth year could be determined; otherwise ``dob`` itself.

    Raises
    ------
    ValueError
        If the year component selected from a string is not an integer.
    """
    currentYear = datetime.now().year

    if isinstance(dob, str):
        if '/' in dob:
            # Slash-separated dates carry the year in the last component.
            yearText = dob.split('/')[-1]
        else:
            # Dash-separated (or separator-free) values carry the year first.
            yearText = dob.split('-')[0]
        return currentYear - int(yearText)

    # Already-parsed dates expose the year directly. NaT reports a float NaN year,
    # so it fails the int check and falls through as a missing value.
    birthYear = getattr(dob, 'year', None)
    if isinstance(birthYear, int):
        return currentYear - birthYear

    return dob

# Derive each customer's age from their DOB by applying convertYear element-wise
df_cust_demo['age'] = df_cust_demo['DOB'].apply(convertYear)

In [None]:
# Preview the first rows, now including the derived age column
df_cust_demo.head()

In [None]:
# Query for max and min age (derived from DOB), shown as (max, min).
# The high value is unreasonable, so add this to the Validity column of the Data Quality Framework table

df_cust_demo['age'].max(), df_cust_demo['age'].min()

In [None]:
# Listing the customers whose derived age is over 100
# (implausible values that need to be verified or discarded)

df_cust_demo.query('age > 100')

In [None]:
# Checking for duplicated rows
# (duplicated() flags rows that repeat an earlier row in every column)

df_cust_demo[df_cust_demo.duplicated()]

In [None]:
# Checking for duplicated customer_ids
# (shows the full rows whose customer_id repeats an earlier one)

df_cust_demo[df_cust_demo['customer_id'].duplicated()]

# NewCustomerList

**Accuracy**	

**Completeness**	
* last_name
* DOB
* job_title
* job_industry_category

**Consistency**	

**Currency**	

**Relevancy**	
* job_title
* job_industry_category
* wealth_segment                       
* deceased_indicator
* tenure
* property_valuation
* Rank (with respect to what?)
* Value (Is this home value?) 

**Validity**	

**Duplicates**	



In [None]:
# Importing datafile into dataframe (column names come from the CSV header row)
df_new_cust_list = pd.read_csv('NewCustomerList.csv') 

In [None]:
# In original Excel datafile, there are five hidden columns (17, 18, 19, 20, 21)
# Are these values important?
# TODO: confirm with the data provider what these columns represent.

# Displaying dataframe summary (row count, non-null count and dtype per column)
df_new_cust_list.info()

In [None]:
# Preview the first rows to see how the values are formatted
df_new_cust_list.head()

In [None]:
# Determine age based on year: the current year minus the birth year, where the
# birth year is read from the last four characters of the DOB string.
# Rows without a DOB keep their missing value.

current_year = datetime.now().year
df_new_cust_list['age'] = (
    df_new_cust_list['DOB']
    .apply(lambda dob: current_year - int(dob[-4:]) if not pd.isnull(dob) else dob)
    # Nullable integer dtype: whole-number ages alongside missing values
    .astype('Int64')
)

In [None]:
# Checking for max and min values in the age column, shown as (max, min)

df_new_cust_list['age'].max(), df_new_cust_list['age'].min() 

In [None]:
# Preview the first rows, now including the derived age column
df_new_cust_list.head()

In [None]:
# Re-display the summary to confirm the age column and its dtype
df_new_cust_list.info()

In [None]:
# Checking for duplicated rows
# (duplicated() flags rows that repeat an earlier row in every column)

df_new_cust_list[df_new_cust_list.duplicated()]