In [6]:
!pip install pandas 
!pip install numpy 
!pip install faker 


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# Creating the technician table
### This is the first step in creating the synthetic data that will later be used for research and visualisation purposes.
#### What is needed:
##### - a unique ID
##### - a full name
##### - a region
##### - an experience level
### _Faker_ will be used to generate names; _random.choice()_ will be used to assign regions and experience levels; _pandas_ will be used to store the data and export it to CSV.

In [23]:
import pandas as pd
import random
from faker import Faker
import numpy as np

In [11]:
# Initializing Faker, seeding the random generators and setting up the
# constraints for the technician table.

fake = Faker()

# Fixed seed so that re-running the notebook from a fresh kernel reproduces
# the same random draws instead of a different data set on every run.
SEED = 42
random.seed(SEED)     # random.choice() calls
np.random.seed(SEED)  # np.random draws (e.g. the service-call durations)
Faker.seed(SEED)      # values produced by Faker (names, dates)

num_techs = 1000  # Number of technicians to generate

# Defining regions and level of experience
regions = ['North', 'South', 'East', 'West']
levels = ['Junior', 'Mid', 'Senior']

# Zero-pad every ID to the width of the largest index (never fewer than 3
# digits). With a hard-coded width of 3, technician 1000 would become 'T1000'
# while all others are 'Txxx', giving IDs of mixed length that do not sort
# lexically.
tech_id_width = max(3, len(str(num_techs)))

# Build the rows as a list of dicts and create the DataFrame once at the end.
technicians = []
for i in range(1, num_techs + 1):
    tech = {
        'technician_id': f'T{i:0{tech_id_width}}',
        'name': fake.name(),
        'region': random.choice(regions),
        'experience_level': random.choice(levels)
    }
    technicians.append(tech)

df_techs = pd.DataFrame(technicians)
df_techs.to_csv("technicians.csv", index=False)

# Table for the Machines
## This table includes 
### - a unique Machine ID
### - Machine type (such as Pump, Press, Conveyor)
### - Installation date
### - Location (region)
### - Status (Active/Inactive)

In [15]:
# Configuration for the machine table.

# How many machine rows to generate
num_machines = 5_000

# Kinds of equipment a machine record can be assigned
machine_types = [
    'Pump', 'Press', 'Conveyor', 'Welder', 'CNC-Machine',
    'Compressor', 'Lathe', '3D Print', 'Packaging Unit', 'Drill',
    'Mixer', 'Laser Cutter', 'Grinder', 'Furnace',
]

# Possible operational states of a machine
statuses = ['Active', 'Inactive']

In [19]:
# Creating the machine records.
# Uses `fake` and `regions` from the technician cell and `num_machines`,
# `machine_types` and `statuses` from the configuration cell above.

# Zero-pad every ID to the width of the largest index (never fewer than 3
# digits). With a hard-coded width of 3, machines 1000 and above would get
# 4-digit IDs ('M1000') next to 3-digit ones ('M999'), giving IDs of mixed
# length that do not sort lexically.
machine_id_width = max(3, len(str(num_machines)))

# Build the rows as a list of dicts and create the DataFrame once at the end.
machines = []
for i in range(1, num_machines + 1):
    machine = {
        'machine_id': f'M{i:0{machine_id_width}}',
        'type': random.choice(machine_types),
        # Random install date relative to today.
        # NOTE(review): depending on the Faker version the suffix 'm' may be
        # parsed as minutes rather than months - confirm that '-6m' gives the
        # intended six-month cutoff.
        'install_date': fake.date_between(start_date='-3y', end_date='-6m'),
        'region': random.choice(regions),
        'status': random.choice(statuses)
    }
    machines.append(machine)

df_machines = pd.DataFrame(machines)
df_machines.to_csv("machines.csv", index=False)

# Service Calls Table
## This table links one technician (from technicians.csv) to one machine (from machines.csv), together with a date, an issue type and a duration (in hours).
### This Table will contain:
#### - call_id: unique identifier
#### - date: when the service occurred
#### - technician_id: who did the job (FK)
#### - machine_id: which machine (FK)
#### - issue_type: what kind of problem occurred
#### - duration_hours: time it took to fix (float)

In [20]:
# Configuration for the service-calls table.

# How many service-call rows to generate
num_calls = 10_000

# Categories of problem a service call can be logged under
issue_types = [
    'Routine Maintenance', 'Emergency Repair',
    'Calibration', 'Electrical Fault',
    'Mechanical Failure', 'Inspection',
    'Sensor Replacement', 'Software Update',
]

# Every calendar day from 2023-01-01 to 2024-06-30 (inclusive) as a plain list,
# so that a service date can be drawn from it with random.choice().
date_range = pd.date_range('2023-01-01', '2024-06-30').to_list()

In [24]:
# Generating the service-call records.
# Uses `df_techs` and `df_machines` (the tables built above) as the source of
# valid foreign keys, plus `num_calls`, `issue_types` and `date_range` from the
# configuration cell.

# Extract the candidate foreign keys once. Converting the ID columns to lists
# inside the loop would rebuild both lists on every one of the num_calls
# iterations without changing the result.
technician_ids = df_techs['technician_id'].tolist()
machine_ids = df_machines['machine_id'].tolist()

# Build the rows as a list of dicts and create the DataFrame once at the end.
calls = []
for i in range(1, num_calls + 1):
    call = {
        'call_id': f'C{i:05}',
        'date': random.choice(date_range),
        'technician_id': random.choice(technician_ids),
        'machine_id': random.choice(machine_ids),
        'issue_type': random.choice(issue_types),
        # Exponentially distributed fix time with a mean of 2.5 hours,
        # rounded to two decimals.
        'duration_hours': round(np.random.exponential(scale=2.5), 2)
    }
    calls.append(call)

df_calls = pd.DataFrame(calls)
df_calls.to_csv("service_calls.csv", index=False)