In [18]:
import pandas as pd

# Load the raw work-order export. No dtype or date-parsing options are passed,
# so the date columns are read as plain object (string) columns.
df = pd.read_csv("work-order-management-module.csv")

In [19]:
# Inspect column names, non-null counts and inferred dtypes of the raw load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206058 entries, 0 to 206057
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   SVC_REQUEST_NUMBER              206058 non-null  int64 
 1   WORKORDER_NUMBER                206058 non-null  int64 
 2   WORKORDER_ACTIVITY_CODE         206045 non-null  object
 3   WORKORDER_ACTIVITY_DESCRIPTION  206045 non-null  object
 4   WORKORDER_STARTED               176702 non-null  object
 5   WORKORDER_COMPLETED             179272 non-null  object
 6   WORKORDER_ADDED                 206058 non-null  object
 7   TIME_STAMP                      206058 non-null  object
dtypes: int64(2), object(6)
memory usage: 12.6+ MB


In [20]:
# Preview the first five raw rows.
df.head()

Unnamed: 0,SVC_REQUEST_NUMBER,WORKORDER_NUMBER,WORKORDER_ACTIVITY_CODE,WORKORDER_ACTIVITY_DESCRIPTION,WORKORDER_STARTED,WORKORDER_COMPLETED,WORKORDER_ADDED,TIME_STAMP
0,860449,895401,SA25,REPAIR CONNECTION,,,2004-07-16T10:00:04.000,2015-07-07T00:45:01.000
1,168089,100322,SA27,PERFORM OTHER MANHOLE WORK,,,2000-06-01T13:20:07.000,2015-06-29T00:40:55.000
2,428803,843089280,WA10,RESET/REPLACE VALVE BOX,2015-06-10T08:20:00.000,2015-06-10T11:00:00.000,2002-04-08T23:03:19.000,2015-06-16T00:45:01.000
3,185634611,842762063,SA26,REPLACE MANHOLE CASTING,2016-02-25T00:45:00.000,2016-02-25T03:00:00.000,2013-09-03T09:13:14.000,2016-02-26T00:45:03.000
4,895478,842882322,SRL,LINE SEWER (CONTRACTOR),2016-11-29T00:00:00.000,2016-12-13T00:00:00.000,2014-04-23T13:47:23.000,2017-01-10T00:45:01.000


In [21]:
# Cleaning the Data

# ---------------------------------------------------------------------------------------

# Removing columns not useful for analysis

# The 'TIME_STAMP' column is not included in the analysis.
# It only shows the date that the data was exported from the client's ERP to Excel,
# which is not relevant for our project.

# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a second
# execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['TIME_STAMP'], errors='ignore')


# ---------------------------------------------------------------------------------------

# Out of Range Datetime Values

# Capture and export invalid out of range dates in Datetime Columns

def get_out_of_range_datetimes(time_description, frame=None):
    """Collect non-null values that pandas cannot parse as datetimes.

    Blank (null) cells are ignored on purpose: a work order may simply not have
    been started and/or completed yet. Only cells that hold a value AND fail
    ``pd.to_datetime`` are reported.

    Parameters
    ----------
    time_description : iterable of str
        Names of the datetime-like columns to check.
    frame : pandas.DataFrame, optional
        Frame to inspect. It must contain the listed columns and a
        ``WORKORDER_NUMBER`` column. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per offending cell with columns ``Wrong_Datetimes`` (the raw
        value), ``WORKORDER_NUMBER`` and ``Time_type`` (the column the value
        came from). Rows keep their index labels from the inspected frame.
        The result is empty when nothing is wrong.
    """
    source = df if frame is None else frame

    # Gather one piece per column and concatenate once at the end instead of
    # re-copying a growing frame on every loop iteration.
    flagged = []
    for element in time_description:
        # NaT wherever to_datetime fails (or the cell was blank to begin with).
        parsed = pd.to_datetime(source[element], errors='coerce')

        # Keep only cells that had a value but could not be parsed.
        unparseable = parsed.isna() & source[element].notna()

        flagged.append(
            source.loc[unparseable, [element, "WORKORDER_NUMBER"]]
            .rename(columns={element: "Wrong_Datetimes"})
            .assign(Time_type=element)
        )

    # pd.concat raises on an empty list, so return an empty frame explicitly.
    if not flagged:
        return pd.DataFrame()

    return pd.concat(flagged)

date_columns = ['WORKORDER_STARTED', 'WORKORDER_COMPLETED', 'WORKORDER_ADDED']

# Keep an audit trail of the raw values that are about to be coerced to NaT.
cleaning_export_wrong_dates = get_out_of_range_datetimes(date_columns)

# Only write the report when there is something to report.
if not cleaning_export_wrong_dates.empty:
    cleaning_export_wrong_dates.to_csv('cleaning_export_wrong_dates.csv')



# Converting the out of range values and blank rows to NA values

# errors='coerce' returns NaT for rows where conversion failed.
# `infer_datetime_format` is not passed: it is deprecated since pandas 2.0, where it has
# no effect and only emits a warning.
for element in date_columns:
    df[element] = pd.to_datetime(df[element], errors='coerce')

    
# ---------------------------------------------------------------------------------------

# Data Type Constraints

# Enforce WORKORDER_ACTIVITY_CODE and WORKORDER_ACTIVITY_DESCRIPTION to string values and
# apply the string length constraints (truncate each column to a maximum number of characters).
#
# A plain .astype('str') turns missing values into the literal text 'nan', which hides the
# nulls reported by the initial df.info() and can end up as an activity literally named 'nan'.
# .where(is_present) keeps those cells missing while stringifying everything else; the
# columns stay object dtype, so the assertions below still hold.
text_length_limits = {
    'WORKORDER_ACTIVITY_CODE': 12,
    'WORKORDER_ACTIVITY_DESCRIPTION': 300,
}
for text_column, max_length in text_length_limits.items():
    is_present = df[text_column].notna()
    df[text_column] = (
        df[text_column]
        .astype('str')
        .where(is_present)
        .str.slice(stop=max_length)
    )

# Assert the data type of WORKORDER_NUMBER and SVC_REQUEST_NUMBER is int64
for id_column in ('WORKORDER_NUMBER', 'SVC_REQUEST_NUMBER'):
    assert df[id_column].dtype == 'int64', f"{id_column} should be int64"

# Assert the data type of the three work-order timestamps is datetime64
for date_column in ('WORKORDER_STARTED', 'WORKORDER_COMPLETED', 'WORKORDER_ADDED'):
    assert pd.api.types.is_datetime64_any_dtype(df[date_column]), f"{date_column} should be datetime64"

# Assert the data type of the activity text columns is object (string)
for text_column in text_length_limits:
    assert df[text_column].dtype == 'object', f"{text_column} should be object (string)"


# ---------------------------------------------------------------------------------------

# Removing Duplicate Data

# Keep the first row seen for each WORKORDER_NUMBER and renumber the rows from 0.
# NOTE(review): "first" means first in file order, not necessarily the newest export.
df = df.drop_duplicates(subset=['WORKORDER_NUMBER'], ignore_index=True)

# Expose the fresh 0-based row number as the surrogate key 'WorkOrderID',
# then put the columns into their final order.
df = df.assign(WorkOrderID=df.index).loc[:, [
    'WorkOrderID',
    'WORKORDER_NUMBER',
    'WORKORDER_ACTIVITY_CODE',
    'WORKORDER_ACTIVITY_DESCRIPTION',
    'SVC_REQUEST_NUMBER',
    'WORKORDER_STARTED',
    'WORKORDER_COMPLETED',
    'WORKORDER_ADDED',
]]

In [22]:
# Re-inspect the frame after cleaning to confirm the row count and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196028 entries, 0 to 196027
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   WorkOrderID                     196028 non-null  int64         
 1   WORKORDER_NUMBER                196028 non-null  int64         
 2   WORKORDER_ACTIVITY_CODE         196028 non-null  object        
 3   WORKORDER_ACTIVITY_DESCRIPTION  196028 non-null  object        
 4   SVC_REQUEST_NUMBER              196028 non-null  int64         
 5   WORKORDER_STARTED               168573 non-null  datetime64[ns]
 6   WORKORDER_COMPLETED             171146 non-null  datetime64[ns]
 7   WORKORDER_ADDED                 196028 non-null  datetime64[ns]
dtypes: datetime64[ns](3), int64(3), object(2)
memory usage: 12.0+ MB


In [23]:
# Extract 'wo_activity_' table
# One row per distinct (activity code, activity description) pair, in order of first appearance.
wo_activity_dim = (
    df.loc[:, ['WORKORDER_ACTIVITY_CODE', 'WORKORDER_ACTIVITY_DESCRIPTION']]
    .drop_duplicates()
    .rename(columns={
        'WORKORDER_ACTIVITY_CODE': 'ActivityCode',
        'WORKORDER_ACTIVITY_DESCRIPTION': 'ActivityDescription',
    })
    .reset_index(drop=True)
)
# Assign unique IDs: the index was just reset to 0..n-1, so index + 1 gives 1..n.
wo_activity_dim['ActivityID'] = wo_activity_dim.index + 1

wo_activity_dim.head()

Unnamed: 0,ActivityCode,ActivityDescription,ActivityID
0,SA25,REPAIR CONNECTION,1
1,SA27,PERFORM OTHER MANHOLE WORK,2
2,WA10,RESET/REPLACE VALVE BOX,3
3,SA26,REPLACE MANHOLE CASTING,4
4,SRL,LINE SEWER (CONTRACTOR),5


In [24]:
# Extract 'service_request_' table
# One row per distinct service request number, in order of first appearance.
service_request_dim = (
    df['SVC_REQUEST_NUMBER']
    .drop_duplicates()
    .rename('ServiceRequestNumber')
    .reset_index(drop=True)
    .to_frame()
)
# Assign unique IDs, numbered 1..n in row order.
service_request_dim = service_request_dim.assign(
    ServiceRequest_ID=lambda dim: range(1, len(dim) + 1)
)

service_request_dim.head()

Unnamed: 0,ServiceRequestNumber,ServiceRequest_ID
0,860449,1
1,168089,2
2,428803,3
3,185634611,4
4,895478,5


In [25]:
# Static Data for 'wo_time_type_' and 'day_of_week_' tables
# Lookup of the three timestamps kept per work order, written as (ID, label) records.
wo_time_type_dim = pd.DataFrame(
    [(1, 'Started'), (2, 'Completed'), (3, 'Added')],
    columns=['TimeType_ID', 'Time_Type'],
)

wo_time_type_dim.head()

Unnamed: 0,TimeType_ID,Time_Type
0,1,Started
1,2,Completed
2,3,Added


In [26]:
# Static weekday lookup: IDs run 1..7 starting at Monday.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_of_week_dim = pd.DataFrame(
    list(enumerate(weekday_names, start=1)),
    columns=['Day_of_week_ID', 'DayInWeek'],
)

day_of_week_dim.head()

Unnamed: 0,Day_of_week_ID,DayInWeek
0,1,Monday
1,2,Tuesday
2,3,Wednesday
3,4,Thursday
4,5,Friday


In [44]:
# Create 'work_order_time_' table
# Flatten out the dates from the original DataFrame: one output row per
# (work order, timestamp kind) that has a timestamp.

# (source column, TimeType_ID, drop rows with a missing timestamp?)
# TimeType_ID: 1 for Started, 2 for Completed, 3 for Added.
# WORKORDER_ADDED is not filtered; df.info() shows it has no missing values.
time_sources = [
    ('WORKORDER_STARTED', 1, True),
    ('WORKORDER_COMPLETED', 2, True),
    ('WORKORDER_ADDED', 3, False),
]

time_parts = []
for source_column, time_type_id, drop_missing in time_sources:
    part = df[['WorkOrderID', source_column]]
    if drop_missing:
        part = part.dropna()
    time_parts.append(
        part.rename(columns={source_column: 'Time'}).assign(TimeType_ID=time_type_id)
    )

# Concatenate these records and extract the date parts
work_order_time = pd.concat(time_parts, ignore_index=True)

timestamps = work_order_time['Time'].dt
work_order_time = work_order_time.assign(
    Year=timestamps.year,
    Month=timestamps.month,
    Day=timestamps.day,
    Hour=timestamps.hour,
    Minute=timestamps.minute,
    Second=timestamps.second,
    # dayofweek is 0=Monday..6=Sunday; shift to 1=Monday..7=Sunday to match day_of_week_dim
    Day_of_week_ID=timestamps.dayofweek + 1,
    # Add a unique Time_ID for each record
    Time_ID=range(1, len(work_order_time) + 1),
)

# Select final columns for 'work_order_time_'
work_order_time = work_order_time.rename(columns={'WorkOrderID': 'WorkOrder_ID'})[[
    'WorkOrder_ID', 'Time_ID', 'TimeType_ID', 'Day_of_week_ID',
    'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second',
]]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
work_order_time.info()

print(work_order_time.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 535747 entries, 0 to 535746
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   WorkOrder_ID    535747 non-null  int64
 1   Time_ID         535747 non-null  int64
 2   TimeType_ID     535747 non-null  int64
 3   Day_of_week_ID  535747 non-null  int64
 4   Year            535747 non-null  int64
 5   Month           535747 non-null  int64
 6   Day             535747 non-null  int64
 7   Hour            535747 non-null  int64
 8   Minute          535747 non-null  int64
 9   Second          535747 non-null  int64
dtypes: int64(10)
memory usage: 40.9 MB
None
   WorkOrder_ID  Time_ID  TimeType_ID  Day_of_week_ID  Year  Month  Day  Hour  \
0             2        1            1               3  2015      6   10     8   
1             3        2            1               4  2016      2   25     0   
2             4        3            1               2  2016     11   29  

In [45]:
# Create 'work_order_fact' table
# Merge with 'wo_activity_' to get ActivityID.
# wo_activity_dim is de-duplicated on the (code, description) PAIR, so the join uses both
# columns: joining on the code alone would multiply fact rows whenever one code appears
# with more than one description. validate= makes pandas raise instead of silently
# fanning out if a dimension ever holds a repeated key.
work_order_fact = df.merge(
    wo_activity_dim,
    left_on=['WORKORDER_ACTIVITY_CODE', 'WORKORDER_ACTIVITY_DESCRIPTION'],
    right_on=['ActivityCode', 'ActivityDescription'],
    how='left',
    validate='many_to_one',
)
# Merge with 'service_request_' to get ServiceRequest_ID
work_order_fact = work_order_fact.merge(
    service_request_dim,
    left_on='SVC_REQUEST_NUMBER',
    right_on='ServiceRequestNumber',
    how='left',
    validate='many_to_one',
)

work_order_fact = work_order_fact.rename(columns={
    'WorkOrderID': 'WorkOrder_ID',
    'WORKORDER_NUMBER': 'WorkOrderNumber',
    'ActivityID': 'Activity_ID',
})[['WorkOrder_ID', 'Activity_ID', 'ServiceRequest_ID', 'WorkOrderNumber']]

# (WorkOrder_ID, Time_ID) pairs per time type: 1 = Started, 2 = Completed, 3 = Added
time_ids_by_type = {
    time_type_id: work_order_time.loc[
        work_order_time['TimeType_ID'] == time_type_id, ['WorkOrder_ID', 'Time_ID']
    ]
    for time_type_id in (1, 2, 3)
}

# Add columns for Time_IDs in the fact table.
# NOTE: TimeID_Started / TimeID_Completed come out as float64 because the left merge
# leaves NaN for work orders that have no such timestamp.
time_id_columns = {1: 'TimeID_Started', 2: 'TimeID_Completed', 3: 'TimeID_Added'}
for time_type_id, fact_column in time_id_columns.items():
    work_order_fact = work_order_fact.merge(
        time_ids_by_type[time_type_id].rename(columns={'Time_ID': fact_column}),
        on='WorkOrder_ID', how='left'
    )

# Final columns for the work_order_fact table, including Time_IDs
work_order_fact = work_order_fact[['WorkOrder_ID', 'Activity_ID', 'ServiceRequest_ID', 'WorkOrderNumber', 'TimeID_Started', 'TimeID_Completed', 'TimeID_Added']]

# Flatten the data by merging work_order_fact with each time type: one row per
# (work order, existing timestamp), stacked in the order Started, Completed, Added.
work_order_fact_flat = pd.concat([
    work_order_fact.merge(time_ids, on='WorkOrder_ID', how='inner')
    for time_ids in time_ids_by_type.values()
])

# Select final columns for the flattened work_order_fact
work_order_fact_flat = work_order_fact_flat[['WorkOrder_ID', 'Activity_ID', 'ServiceRequest_ID', 'WorkOrderNumber', 'Time_ID']]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
work_order_fact.info()
work_order_fact_flat.info()

print(work_order_fact.head())
print(work_order_fact_flat.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 196028 entries, 0 to 196027
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   WorkOrder_ID       196028 non-null  int64  
 1   Activity_ID        196028 non-null  int64  
 2   ServiceRequest_ID  196028 non-null  int64  
 3   WorkOrderNumber    196028 non-null  int64  
 4   TimeID_Started     168573 non-null  float64
 5   TimeID_Completed   171146 non-null  float64
 6   TimeID_Added       196028 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 12.0 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 535747 entries, 0 to 196027
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   WorkOrder_ID       535747 non-null  int64
 1   Activity_ID        535747 non-null  int64
 2   ServiceRequest_ID  535747 non-null  int64
 3   WorkOrderNumber    535747 non-null  int64
 

In [40]:
# Example output of each table:
# (heading, table) pairs; every heading after the first starts with a blank line.
# The two static lookup tables are shown in full, the others as their first rows.
table_previews = [
    ("wo_activity:\n", wo_activity_dim.head()),
    ("\nservice_request:\n", service_request_dim.head()),
    ("\nwork_order_fact_flat:\n", work_order_fact_flat.head()),
    ("\nwork_order_time:\n", work_order_time.head()),
    ("\nwo_time_type:\n", wo_time_type_dim),
    ("\nday_of_week:\n", day_of_week_dim),
]
for heading, table in table_previews:
    print(heading, table)

wo_activity:
   ActivityCode         ActivityDescription  ActivityID
0         SA25           REPAIR CONNECTION           1
1         SA27  PERFORM OTHER MANHOLE WORK           2
2         WA10     RESET/REPLACE VALVE BOX           3
3         SA26     REPLACE MANHOLE CASTING           4
4          SRL     LINE SEWER (CONTRACTOR)           5

service_request:
    ServiceRequestNumber  ServiceRequest_ID
0                860449                  1
1                168089                  2
2                428803                  3
3             185634611                  4
4                895478                  5

work_order_fact_flat:
    WorkOrder_ID  Activity_ID  ServiceRequest_ID  WorkOrderNumber  Time_ID
0             2            3                  3        843089280        1
1             3            4                  4        842762063        2
2             4            5                  5        842882322        3
3             5            6                  6          17

In [39]:
# Count the distinct work orders present in the flattened fact table.
# dropna=False so a missing ID would still be counted, as it is by pd.unique.
n = work_order_fact_flat['WorkOrder_ID'].nunique(dropna=False)

print("No.of.unique values :", n)

No.of.unique values : 196028
