In [32]:
import pandas as pd
import numpy as np

df = pd.read_csv("work-order-management-module.csv")

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206058 entries, 0 to 206057
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype 
---  ------                          --------------   ----- 
 0   SVC_REQUEST_NUMBER              206058 non-null  int64 
 1   WORKORDER_NUMBER                206058 non-null  int64 
 2   WORKORDER_ACTIVITY_CODE         206045 non-null  object
 3   WORKORDER_ACTIVITY_DESCRIPTION  206045 non-null  object
 4   WORKORDER_STARTED               176702 non-null  object
 5   WORKORDER_COMPLETED             179272 non-null  object
 6   WORKORDER_ADDED                 206058 non-null  object
 7   TIME_STAMP                      206058 non-null  object
dtypes: int64(2), object(6)
memory usage: 12.6+ MB


In [34]:
df.head()

Unnamed: 0,SVC_REQUEST_NUMBER,WORKORDER_NUMBER,WORKORDER_ACTIVITY_CODE,WORKORDER_ACTIVITY_DESCRIPTION,WORKORDER_STARTED,WORKORDER_COMPLETED,WORKORDER_ADDED,TIME_STAMP
0,860449,895401,SA25,REPAIR CONNECTION,,,2004-07-16T10:00:04.000,2015-07-07T00:45:01.000
1,168089,100322,SA27,PERFORM OTHER MANHOLE WORK,,,2000-06-01T13:20:07.000,2015-06-29T00:40:55.000
2,428803,843089280,WA10,RESET/REPLACE VALVE BOX,2015-06-10T08:20:00.000,2015-06-10T11:00:00.000,2002-04-08T23:03:19.000,2015-06-16T00:45:01.000
3,185634611,842762063,SA26,REPLACE MANHOLE CASTING,2016-02-25T00:45:00.000,2016-02-25T03:00:00.000,2013-09-03T09:13:14.000,2016-02-26T00:45:03.000
4,895478,842882322,SRL,LINE SEWER (CONTRACTOR),2016-11-29T00:00:00.000,2016-12-13T00:00:00.000,2014-04-23T13:47:23.000,2017-01-10T00:45:01.000


In [35]:
# Cleaning the Data

# ---------------------------------------------------------------------------------------

# Removing Columns not useful for analysis

# The 'TIME_STAMP' column is not included in the analysis.
# It only shows the date that the data was exported from the Client's ERP to excel which is not relevant for our project.

df = df.drop(columns=['TIME_STAMP'])


# ---------------------------------------------------------------------------------------

# Out of Range Datetime Values

# Capture and export invalid out of range dates in Datetime Columns

def get_out_of_range_datetimes(time_description):
    invalid_dates = pd.DataFrame()

    for element in time_description:
        # where to_datetime fails. 
        # dt means datetime
        not_dt = pd.to_datetime(df[element], errors='coerce')

        # where column is not null and to_datetime method fails. 
        # ofr means out of range
        ofr_dt = not_dt.isna() & df[element].notnull()
        
        # Important to do the previous step as there are several blank rows for the Datetime,
        # which makes sense because the Work order may not have been started and/or
        # completed at the time of processing the data. 
        # So we are looking for 'not null' rows that are also incorrect datetimes.
        
        ofr_dt_ = df[[element, "WORKORDER_NUMBER"]].loc[ofr_dt == True]
        ofr_dt_ = pd.DataFrame(ofr_dt_)
        ofr_dt_ = ofr_dt_.assign(Time_type = element)
        ofr_dt_ = ofr_dt_.rename(columns={element: "Wrong_Datetimes"})
        
        invalid_dates = pd.concat([invalid_dates, ofr_dt_])
        
    return invalid_dates

date_columns = ['WORKORDER_STARTED', 'WORKORDER_COMPLETED', 'WORKORDER_ADDED']

cleaning_export_wrong_dates = get_out_of_range_datetimes(date_columns)

if not cleaning_export_wrong_dates.empty:
    cleaning_export_wrong_dates.to_csv('cleaning_export_wrong_dates.csv')

    
    
# Converting the out of range values and blank rows to NA values

# Attempt to infer format of each date, and return NA for rows where conversion failed
for element in date_columns:
    df[element] = pd.to_datetime(df[element], infer_datetime_format=True, errors = 'coerce') 

    
# ---------------------------------------------------------------------------------------

# Data Type Constraints

# Enforce WORKORDER_ACTIVITY_CODE and WORKORDER_ACTIVITY_DESCRIPTION to 'String' type

df['WORKORDER_ACTIVITY_CODE'] = df['WORKORDER_ACTIVITY_CODE'].astype('str')
df['WORKORDER_ACTIVITY_DESCRIPTION'] = df['WORKORDER_ACTIVITY_DESCRIPTION'].astype('str')

# String length constraints on WORKORDER_ACTIVITY_CODE and WORKORDER_ACTIVITY_DESCRIPTION
# Truncate the specified column to specific length of characters
df['WORKORDER_ACTIVITY_CODE'] = df['WORKORDER_ACTIVITY_CODE'].str.slice(stop=12)
df['WORKORDER_ACTIVITY_DESCRIPTION'] = df['WORKORDER_ACTIVITY_DESCRIPTION'].str.slice(stop=300)

# Assert the data type of WORKORDER_NUMBER is int64
assert df['WORKORDER_NUMBER'].dtype == 'int64', "WORKORDER_NUMBER should be int64"

# Assert the data type of SVC_REQUEST_NUMBER is int64
assert df['SVC_REQUEST_NUMBER'].dtype == 'int64', "SVC_REQUEST_NUMBER should be int64"

# Assert the data type of WORKORDER_STARTED is datetime64
assert pd.api.types.is_datetime64_any_dtype(df['WORKORDER_STARTED']), "WORKORDER_STARTED should be datetime64"

# Assert the data type of WORKORDER_COMPLETED is datetime64
assert pd.api.types.is_datetime64_any_dtype(df['WORKORDER_COMPLETED']), "WORKORDER_COMPLETED should be datetime64"

# Assert the data type of WORKORDER_ADDED is datetime64
assert pd.api.types.is_datetime64_any_dtype(df['WORKORDER_ADDED']), "WORKORDER_ADDED should be datetime64"

# Assert the data type of WORKORDER_ACTIVITY_CODE is object (string)
assert df['WORKORDER_ACTIVITY_CODE'].dtype == 'object', "WORKORDER_ACTIVITY_CODE should be object (string)"

# Assert the data type of WORKORDER_ACTIVITY_DESCRIPTION is object (string)
assert df['WORKORDER_ACTIVITY_DESCRIPTION'].dtype == 'object', "WORKORDER_ACTIVITY_DESCRIPTION should be object (string)"


# ---------------------------------------------------------------------------------------

# Removing Duplicate Data

# Drop duplicates based on 'WORKORDER_NUMBER' column and reset the index
df = df.drop_duplicates(subset=['WORKORDER_NUMBER']).reset_index(drop=True)
# Create a new column called 'WorkOrderID' for the index
df['WorkOrderID'] = df.index
# Rearrange the column order
df = df[['WorkOrderID', 
         'WORKORDER_NUMBER', 
         'WORKORDER_ACTIVITY_CODE', 
         'WORKORDER_ACTIVITY_DESCRIPTION', 
         'SVC_REQUEST_NUMBER', 
         'WORKORDER_STARTED', 
         'WORKORDER_COMPLETED', 
         'WORKORDER_ADDED']]

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196028 entries, 0 to 196027
Data columns (total 8 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   WorkOrderID                     196028 non-null  int64         
 1   WORKORDER_NUMBER                196028 non-null  int64         
 2   WORKORDER_ACTIVITY_CODE         196028 non-null  object        
 3   WORKORDER_ACTIVITY_DESCRIPTION  196028 non-null  object        
 4   SVC_REQUEST_NUMBER              196028 non-null  int64         
 5   WORKORDER_STARTED               168573 non-null  datetime64[ns]
 6   WORKORDER_COMPLETED             171146 non-null  datetime64[ns]
 7   WORKORDER_ADDED                 196028 non-null  datetime64[ns]
dtypes: datetime64[ns](3), int64(3), object(2)
memory usage: 12.0+ MB


In [37]:
# Extract unique Activity data
activity_df = df[['WORKORDER_ACTIVITY_CODE', 'WORKORDER_ACTIVITY_DESCRIPTION']].drop_duplicates().rename(
    columns={'WORKORDER_ACTIVITY_CODE': 'ActivityCode', 'WORKORDER_ACTIVITY_DESCRIPTION': 'ActivityDescription'}
)
activity_df['ActivityID'] = range(1, len(activity_df) + 1)  # Assign unique IDs

# Extract unique Service Request data
service_request_df = df[['SVC_REQUEST_NUMBER']].drop_duplicates().rename(
    columns={'SVC_REQUEST_NUMBER': 'ServiceRequestNumber'}
)
service_request_df['ServiceRequest_ID'] = range(1, len(service_request_df) + 1)  # Assign unique IDs

# Map Activity and Service Request IDs back to the main DataFrame
df = df.merge(activity_df, left_on='WORKORDER_ACTIVITY_CODE', right_on='ActivityCode', how='left')
df = df.merge(service_request_df, left_on='SVC_REQUEST_NUMBER', right_on='ServiceRequestNumber', how='left')

# Function to extract time components
def extract_time_components(datetime_series):
    return pd.DataFrame({
        'Year': datetime_series.dt.year,
        'Quarter': datetime_series.dt.quarter,
        'Month': datetime_series.dt.month,
        'Day_of_Week': datetime_series.dt.dayofweek,
        'Day': datetime_series.dt.day,
        'Hour': datetime_series.dt.hour,
        'Minute': datetime_series.dt.minute
    })

# Create time component DataFrames for started, completed, and added times
df_started_components = extract_time_components(df['WORKORDER_STARTED'])
df_completed_components = extract_time_components(df['WORKORDER_COMPLETED'])
df_added_components = extract_time_components(df['WORKORDER_ADDED'])

# Unique Started, Completed, and Added tables with IDs
started_df = df_started_components.drop_duplicates().reset_index(drop=True)
started_df['Started_ID'] = range(1, len(started_df) + 1)

completed_df = df_completed_components.drop_duplicates().reset_index(drop=True)
completed_df['Completed_ID'] = range(1, len(completed_df) + 1)

added_df = df_added_components.drop_duplicates().reset_index(drop=True)
added_df['Added_ID'] = range(1, len(added_df) + 1)

# Merge Started, Completed, and Added IDs back to the main DataFrame using time components
df = df.merge(
    df_started_components.merge(started_df, on=['Year', 'Quarter', 'Month', 'Day_of_Week', 'Day', 'Hour', 'Minute'], how='left'),
    left_index=True, right_index=True, suffixes=('', '_y')
).drop(columns=[col for col in df.columns if '_y' in col])

df = df.merge(
    df_completed_components.merge(completed_df, on=['Year', 'Quarter', 'Month', 'Day_of_Week', 'Day', 'Hour', 'Minute'], how='left'),
    left_index=True, right_index=True, suffixes=('', '_y')
).drop(columns=[col for col in df.columns if '_y' in col])

df = df.merge(
    df_added_components.merge(added_df, on=['Year', 'Quarter', 'Month', 'Day_of_Week', 'Day', 'Hour', 'Minute'], how='left'),
    left_index=True, right_index=True, suffixes=('', '_y')
).drop(columns=[col for col in df.columns if '_y' in col])

# Create the work_order_fact table
work_order_fact_df = df[['WorkOrderID', 'ActivityID', 'ServiceRequest_ID', 'Started_ID', 'Completed_ID', 'Added_ID', 'WORKORDER_NUMBER']].rename(
    columns={
        'WorkOrderID': 'WorkOrder_ID',
        'ActivityID': 'Activity_ID',
        'WORKORDER_NUMBER': 'WorkOrderNumber'
    }
)

# Print sample DataFrames for verification
print("Activity Table:\n", activity_df.head())
print("Service Request Table:\n", service_request_df.head())
print("Started Table:\n", started_df.head())
print("Completed Table:\n", completed_df.head())
print("Added Table:\n", added_df.head())
print("Work Order Fact Table:\n", work_order_fact_df.head())

Activity Table:
   ActivityCode         ActivityDescription  ActivityID
0         SA25           REPAIR CONNECTION           1
1         SA27  PERFORM OTHER MANHOLE WORK           2
2         WA10     RESET/REPLACE VALVE BOX           3
3         SA26     REPLACE MANHOLE CASTING           4
4          SRL     LINE SEWER (CONTRACTOR)           5
Service Request Table:
    ServiceRequestNumber  ServiceRequest_ID
0                860449                  1
1                168089                  2
2                428803                  3
3             185634611                  4
4                895478                  5
Started Table:
      Year  Quarter  Month  Day_of_Week   Day  Hour  Minute  Started_ID
0     NaN      NaN    NaN          NaN   NaN   NaN     NaN           1
1  2015.0      2.0    6.0          2.0  10.0   8.0    20.0           2
2  2016.0      1.0    2.0          3.0  25.0   0.0    45.0           3
3  2016.0      4.0   11.0          1.0  29.0   0.0     0.0           4
