# Company Inc - Sales Data Analysis

## Data Quality

<br>

### This notebook was developed to check a few data quality rules, such as duplicate values in the primary key, or constant values in a specific column



### 0. Libraries import

In [0]:
from pyspark.sql import functions as f #PySpark functions, such as column, maximum, and others

### 1. Function created to check if columns have duplicated values

In [0]:
#Function created to check if columns have duplicated values, the inputs are the table name "table" and the list of columns to check "columns"
def check_duplicates(table, columns):
    #calculate the number of duplicates rows based in the list of columns shared
    res = spark.table(table).select(columns).count() - spark.table(table).select(columns).distinct().count()
    #if the result is zero, there is no duplicates, and print that message
    if res == 0:
        print("\n No duplicates")
    #else, display all the rows with duplicates values based in the list of columns shared
    else:
        display(spark.table(table).groupby(columns).count().filter(f.col("count")>1).head(5))

### 2. Validate the results of fact table "Order"

In [0]:
table = "company.order"
columns = "Order_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

summary,Order_ID,Order_Local_ID,Location_ID,Order_Date,Ship_ID,Customer_ID,Order_Priority_ID,Product_ID,Quantity,Sales,Discount,Profit
count,49399.0,49399,49399.0,49399.0,49399.0,49399,49399.0,49399,49399.0,49399.0,49399.0,49399.0
mean,25733.905079050182,,2005.1237879309297,4883.133059373671,25650.813882872124,,3.119152209558898,,3.4655154962651067,245.96026235348893,0.1428598959493116,28.41432174133076
stddev,14834.219326508684,,1104.1755174352083,412.9701167746836,14781.826308063472,,1.0807386708308595,,2.2789362563490503,485.527242913703,0.2127361517389257,174.49251634081992
min,1.0,AE-2011-9160,1.0,4019.0,1.0,AA-103151,1.0,FUR-ADV-10000002,1.0,0.0,0.0,-6599.978
max,51290.0,ZI-2014-9650,3819.0,5479.0,51106.0,ZD-219254,4.0,TEC-STA-10004927,14.0,22638.0,0.85,8399.976



 No duplicates


### 3. Validate the results of dimension table "Market"

In [0]:
table = "company.market"
columns = "Market_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Market", "Market2"]
check_duplicates(table, columns)

summary,Market_ID,Market,Market2
count,7.0,7,7
mean,4.0,,
stddev,2.160246899469287,,
min,1.0,APAC,APAC
max,7.0,US,North America



 No duplicates

 No duplicates


### 4. Validate the results of dimension table "Location"

In [0]:
table = "company.location"
columns = "Location_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Country", "State", "Region", "City"]
check_duplicates(table, columns)

summary,Location_ID,Market_ID,Country,State,Region,City
count,3819.0,3819.0,3819,3819,3819,3819
mean,1910.0,4.370253993191935,,,,
stddev,1102.5946671374747,2.0519995520874383,,,,
min,1.0,1.0,Afghanistan,'Ajman,Africa,Aachen
max,3819.0,7.0,Zimbabwe,Žilina,West,Águas Lindas de Goiás



 No duplicates

 No duplicates


### 4. Validate the results of dimension table "Calendar_Date"

In [0]:
table = "company.calendar_date"
columns = "Date_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Date"]
check_duplicates(table, columns)

summary,Date_ID
count,36526.0
mean,18263.5
stddev,10544.292302789538
min,1.0
max,36526.0



 No duplicates

 No duplicates


### 5. Validate the results of dimension table "Ship_Mode"

In [0]:
table = "company.ship_mode"
columns = "Ship_Mode_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Ship_Mode"]
check_duplicates(table, columns)

summary,Ship_Mode_ID,Ship_Mode
count,4.0,4
mean,2.5,
stddev,1.2909944487358056,
min,1.0,First Class
max,4.0,Standard Class



 No duplicates

 No duplicates


### 6. Validate the results of dimension table "Ship"

In [0]:
table = "company.ship"
columns = "Ship_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

summary,Ship_ID,Ship_Mode_ID,Ship_Date,Shipping_Cost
count,51106.0,51106.0,51106.0,51106.0
mean,25553.5,3.2527491879622743,4884.615407192893,26.459420878175152
stddev,14753.175765463742,1.0838646239374283,412.5603055437603,57.382300563185424
min,1.0,1.0,4021.0,0.002
max,51106.0,4.0,5486.0,99.98



 No duplicates


### 7. Validate the results of dimension table "Customer"

In [0]:
table = "company.customer"
columns = "Customer_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table, first for the "Customer_Name"
columns = ["Customer_Name"]
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table, second for the "Customer_Name" and "Customer_Segment_ID" in the same time
columns = ["Customer_Segment_ID", "Customer_Name"]
check_duplicates(table, columns)

summary,Customer_ID,Customer_Segment_ID,Customer_Name
count,4873,4873.0,4873
mean,,1.6712497434845064,
stddev,,0.7700977281431572,
min,AA-103151,1.0,Aaron Bergman
max,ZD-219254,3.0,Zuschuss Donatelli



 No duplicates


Customer_Name,count
Jim Mitchum,7
Jesus Ocampo,6
Patrick O'Brill,6
Ted Butterfield,6
Parhena Norris,6


Customer_Segment_ID,Customer_Name,count
2,Eudokia Martin,7
3,Tom Ashbrook,6
3,Christopher Schild,6
2,Odella Nelson,6
1,Jim Karlsson,7


### 8. Validate the results of dimension table "Customer_Segment"

In [0]:
table = "company.customer_segment"
columns = "Customer_Segment_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Segment"]
check_duplicates(table, columns)

summary,Customer_Segment_ID,Segment
count,3.0,3
mean,2.0,
stddev,1.0,
min,1.0,Consumer
max,3.0,Home Office



 No duplicates

 No duplicates


### 9. Validate the results of dimension table "Order_Priority"

In [0]:
table = "company.order_priority"
columns = "Order_Priority_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Order_Priority"]
check_duplicates(table, columns)

summary,Order_Priority_ID,Order_Priority
count,4.0,4
mean,2.5,
stddev,1.2909944487358056,
min,1.0,Critical
max,4.0,Medium



 No duplicates

 No duplicates


### 10. Validate the results of dimension table "Product"

In [0]:
table = "company.product"
columns = "Product_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Product_Category_ID", "Product_Name"]
check_duplicates(table, columns)

summary,Product_ID,Product_Category_ID,Product_Name
count,10292,10292.0,10292
mean,,9.067431014380102,
stddev,,4.8366093594391,
min,FUR-ADV-10000002,1.0,"""While you Were Out"" Message Book, One Form per Page"
max,TEC-STA-10004927,17.0,netTALK DUO VoIP Telephone Service



 No duplicates


Product_Category_ID,Product_Name,count
7,"Wilson Jones Binder, Economy",4
9,"Advantus Push Pins, Metal",3
12,"Smead Folders, Wire Frame",4
5,"Breville Microwave, Silver",5
6,"Sanford Canvas, Blue",6


### 11. Validate the results of dimension table "Product_Category"

In [0]:
table = "company.product_category"
columns = "Product_Category_ID"
#check some statistics results for all the columns
display(spark.table(table).describe())

#check if there are duplicates values for the primary key
check_duplicates(table, columns)

#check if there are duplicates for the content information in the table
columns = ["Category", "Sub_Category"]
check_duplicates(table, columns)

summary,Product_Category_ID,Category,Sub_Category
count,17.0,17,17
mean,9.0,,
stddev,5.049752469181039,,
min,1.0,Furniture,Accessories
max,17.0,Technology,Tables



 No duplicates

 No duplicates
