In [1]:
import random
import pandas as pd
import numpy as np

In [2]:
# Catalogue of equipment display names. generate_equipment() numbers these
# entries from 1 in list order, so reordering the list changes the IDs.
equipment_names = [
    'Land-based Rotary Rig', 'Jack-up Rig', 'Semi-submersible Rig', 'Drillship',
    'PDC Drill Bit', 'Tricone Drill Bit', 'Mud Motor', 'Mud Pump',
    'Shale Shaker', 'Desander', 'Degasser', 'Blowout Preventer',
    'Top Drive', 'Casing Head', 'Cementing Unit', 'Coiled Tubing Unit',
    'Workover Rig', 'Wireline Unit', 'Subsea Tree', 'Christmas Tree',
    'Wellhead', 'Flowline', 'Separator', 'Heater Treater', 'Storage Tank',
    'Gas Compressor', 'Gas Dehydrator', 'Well Control Panel', 'Accumulator Unit',
    'Annular Preventer', 'Choke Manifold', 'Kill Line', 'Standpipe', 'Drill Collar',
    'Kelly Drive', 'Rotary Table', 'Draw Works', 'Swivel', 'Crown Block',
    'Travelling Block', 'Derrick', 'Mud Gas Separator', 'Cuttings Dryer', 'Desilter',
    'Trip Tank', 'Cement Silo', 'Surge Tank', 'Mixing Hopper', 'Well Logging Tool',
    'Formation Tester'
]

def generate_equipment():
    """Return one ``{'ID': ..., 'Name': ...}`` record per entry of ``equipment_names``.

    IDs are consecutive integers starting at 1 and follow the list order.
    """
    return [
        {'ID': position, 'Name': label}
        for position, label in enumerate(equipment_names, start=1)
    ]

# ————— Equipment Specifications ————— #
# One row per equipment type: identity (Equipment/Category/Brand/Model),
# rating limits that generate_data() compares simulated conditions against,
# and commercial attributes (daily rate, lead time, stock flag) that are
# copied verbatim into every generated record.
equipment_specs = pd.DataFrame([
    {'Equipment': 'Land-based Rotary Rig', 'Category': 'Rig', 'Brand': 'RigCo', 'Model': 'LR-100',
     'Max Depth Rating (m)': 5000, 'Max Pressure Rating (bar)': 500, 'Max Temperature Rating (°C)': 180,
     'Torque Capacity (kN·m)': 300, 'Hook Load Capacity (t)': 200, 'Daily Rate (USD)': 50000,
     'Lead Time (days)': 14, 'In Stock': True},
    {'Equipment': 'Jack-up Rig', 'Category': 'Rig', 'Brand': 'OffshoreInc', 'Model': 'JU-45',
     'Max Depth Rating (m)': 300, 'Max Pressure Rating (bar)': 350, 'Max Temperature Rating (°C)': 160,
     'Torque Capacity (kN·m)': 250, 'Hook Load Capacity (t)': 150, 'Daily Rate (USD)': 60000,
     'Lead Time (days)': 21, 'In Stock': False},
    {'Equipment': 'Semi-submersible Rig', 'Category': 'Rig', 'Brand': 'SubSeaWorks', 'Model': 'SSR-300',
     'Max Depth Rating (m)': 1500, 'Max Pressure Rating (bar)': 450, 'Max Temperature Rating (°C)': 170,
     'Torque Capacity (kN·m)': 280, 'Hook Load Capacity (t)': 180, 'Daily Rate (USD)': 75000,
     'Lead Time (days)': 10, 'In Stock': True},
    {'Equipment': 'Drillship', 'Category': 'Rig', 'Brand': 'OceanDrill', 'Model': 'DS-900',
     'Max Depth Rating (m)': 3500, 'Max Pressure Rating (bar)': 500, 'Max Temperature Rating (°C)': 190,
     'Torque Capacity (kN·m)': 320, 'Hook Load Capacity (t)': 220, 'Daily Rate (USD)': 95000,
     'Lead Time (days)': 7, 'In Stock': True},
    {'Equipment': 'PDC Drill Bit', 'Category': 'Drill Bit', 'Brand': 'BitCo', 'Model': 'PDCX-200',
     'Max Depth Rating (m)': 4000, 'Max Pressure Rating (bar)': 300, 'Max Temperature Rating (°C)': 200,
     'Torque Capacity (kN·m)': 150, 'Hook Load Capacity (t)': 50, 'Daily Rate (USD)': 1200,
     'Lead Time (days)': 5, 'In Stock': True},
    {'Equipment': 'Tricone Drill Bit', 'Category': 'Drill Bit', 'Brand': 'RockDrill', 'Model': 'TRC-150',
     'Max Depth Rating (m)': 3000, 'Max Pressure Rating (bar)': 250, 'Max Temperature Rating (°C)': 160,
     'Torque Capacity (kN·m)': 140, 'Hook Load Capacity (t)': 45, 'Daily Rate (USD)': 1000,
     'Lead Time (days)': 3, 'In Stock': True},
    {'Equipment': 'Mud Motor', 'Category': 'Motor', 'Brand': 'FlowTech', 'Model': 'MM-50',
     'Max Depth Rating (m)': 2000, 'Max Pressure Rating (bar)': 200, 'Max Temperature Rating (°C)': 180,
     'Torque Capacity (kN·m)': 90, 'Hook Load Capacity (t)': 30, 'Daily Rate (USD)': 800,
     'Lead Time (days)': 7, 'In Stock': True},
    {'Equipment': 'Mud Pump', 'Category': 'Pump', 'Brand': 'PumpMaster', 'Model': 'MP-300',
     'Max Depth Rating (m)': 2500, 'Max Pressure Rating (bar)': 350, 'Max Temperature Rating (°C)': 150,
     'Torque Capacity (kN·m)': 110, 'Hook Load Capacity (t)': 40, 'Daily Rate (USD)': 2000,
     'Lead Time (days)': 10, 'In Stock': True},
    {'Equipment': 'Shale Shaker', 'Category': 'Separator', 'Brand': 'ShakePro', 'Model': 'SS-25',
     'Max Depth Rating (m)': 1000, 'Max Pressure Rating (bar)': 100, 'Max Temperature Rating (°C)': 120,
     'Torque Capacity (kN·m)': 0, 'Hook Load Capacity (t)': 0, 'Daily Rate (USD)': 500,
     'Lead Time (days)': 2, 'In Stock': True},
    # TODO: only the 9 equipment types above have spec rows, while equipment_names
    # lists 50. generate_data() samples equipment from this table only, so the
    # remaining types never appear in generated data until their rows are added here.
])

# Symptoms that can be reported whether or not the record is a failure.
NON_FAILURE_SYMPTOMS = [
    'Abnormal vibration',
    'Pressure loss',
    'Lack of pressure',
    'Fluid leakage',
    'Oil flow anomaly',
]
# Failure records may additionally report a complete stop.
ALL_SYMPTOMS = [*NON_FAILURE_SYMPTOMS, 'Stopped working']

# Date generator
def generate_dates(start, end):
    """Return a daily-frequency ``DatetimeIndex`` running from ``start`` to ``end``.

    Both arguments are passed straight to ``pd.date_range``.
    """
    daily_index = pd.date_range(start=start, end=end, freq='D')
    return daily_index

# Helper for water zone categorization
def _assign_zone(loc, wd):
    if loc == 'Onshore': return 'Onshore'
    if wd < 100: return 'Negligible'
    if wd < 300: return 'Shallow'
    if wd < 600: return 'Mid-shallow'
    if wd < 1000: return 'Mid'
    if wd < 1500: return 'Mid-deep'
    if wd < 2000: return 'Deep'
    return 'Very deep'

# Main generator with balancing

# Column order of the frame returned by generate_data(); also used to give an
# empty result the same schema as a populated one.
_OUTPUT_COLUMNS = [
    'Timestamp', 'Equipment', 'Category', 'Brand', 'Model', 'Location Type',
    'Water Zone', 'Formation Type', 'Drilling Depth (m)', 'Formation Pressure (bar)',
    'Mud Weight (ppg)', 'Mud Viscosity (cP)', 'Pump Pressure (bar)', 'Pump Flow (L/min)',
    'ROP (m/hr)', 'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
    'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)', 'Operating Hours',
    'Replaced Parts', 'Failure Cause', 'Part', 'Daily Rate (USD)', 'Lead Time (days)',
    'In Stock', 'Failure Class', 'Observed Symptom',
]


def _balance_failure_classes(df: pd.DataFrame) -> pd.DataFrame:
    """Downsample so both 'Failure Class' values have equal counts, then shuffle.

    Sampling and shuffling use a fixed ``random_state=42``, so this step is
    deterministic for a given input frame.
    """
    df0 = df[df['Failure Class'] == 0]
    df1 = df[df['Failure Class'] == 1]
    n_min = min(len(df0), len(df1))
    df_bal = pd.concat([
        df0.sample(n_min, random_state=42),
        df1.sample(n_min, random_state=42)
    ])
    return df_bal.sample(frac=1, random_state=42).reset_index(drop=True)


def generate_data(start: str, end: str, min_per_day: int = 5, max_per_day: int = 15,
                  seed: "int | None" = None) -> pd.DataFrame:
    """Simulate equipment operating records and return a class-balanced sample.

    For every day between ``start`` and ``end`` (via ``generate_dates``), a
    random number of records in ``[min_per_day, max_per_day]`` is generated.
    Each record picks a random row of ``equipment_specs``, draws random
    operating conditions, counts how many risk flags are raised, converts that
    count to a failure probability with a logistic curve, and samples the
    binary 'Failure Class' from it. The result is then downsampled so both
    classes are equally represented, and shuffled.

    Parameters
    ----------
    start, end : str
        Date range bounds, passed to ``generate_dates``.
    min_per_day, max_per_day : int
        Inclusive bounds on the number of records generated per day.
    seed : int or None, default None
        When given, all draws come from a private ``random.Random(seed)`` so
        the output is reproducible and the global ``random`` state is left
        untouched. When None, the module-level ``random`` generator is used,
        exactly as before this parameter existed.

    Returns
    -------
    pd.DataFrame
        Balanced, shuffled records with a fresh 0..n-1 index. Empty (but with
        all expected columns) when no records were generated.

    Raises
    ------
    ValueError
        If ``min_per_day`` is greater than ``max_per_day``.
    """
    if min_per_day > max_per_day:
        raise ValueError(
            f"min_per_day ({min_per_day}) must not exceed max_per_day ({max_per_day})"
        )

    rng = random.Random(seed) if seed is not None else random

    equips = equipment_specs['Equipment'].tolist()
    # Resolve each equipment name to its spec row once, instead of filtering
    # the whole table for every generated record. setdefault keeps the first
    # row per name, matching the previous `.loc[...].iloc[0]` lookup.
    spec_by_name = {}
    for record in equipment_specs.to_dict('records'):
        spec_by_name.setdefault(record['Equipment'], record)

    rows = []
    for day in generate_dates(start, end):
        for _ in range(rng.randint(min_per_day, max_per_day)):
            ts = day + pd.Timedelta(seconds=rng.uniform(0, 24*3600))
            eq = rng.choice(equips)
            spec = spec_by_name[eq]

            # Simulate operating conditions
            loc = rng.choice(['Onshore','Offshore'])
            # Water depth is only drawn offshore; it feeds the zone label and
            # is not stored in the record itself.
            wd = rng.uniform(50,2000) if loc=='Offshore' else np.nan
            zone = _assign_zone(loc, wd)
            formation = rng.choice(['Sandstone','Shale','Limestone','Granite','Dolomite','Basalt','Soft clay'])
            depth_m = rng.uniform(500,6000)
            formation_pres = rng.uniform(50,500)
            mud_wt = rng.uniform(8.5,18)
            mud_visc = rng.uniform(10,40)
            pump_pres = rng.uniform(20,100)
            pump_flow = rng.uniform(200,600)
            rop = rng.uniform(5,60)
            hook_load = rng.uniform(100,500)
            torque = rng.uniform(50,300)
            maint_type = rng.choice(['Preventive','Corrective'])
            temp = rng.uniform(70,130)
            pres = rng.uniform(8,16)
            vib = rng.uniform(1,3)
            op_hours = rng.randint(16,24)
            parts_list = rng.sample(['Bearing','Valve','Filter','Heat Exchanger','Compressor'], k=rng.randint(1,3))
            replaced_parts = ', '.join(parts_list)
            fail_cause = rng.choice(['Natural wear','Electrical failure','Leakage','Mechanical issue','N/A'])
            comp_part = rng.choice(['Bearing','Oil Filter','Pressure Valve','Heat Exchanger','Flow Control Valve'])

            # Risk scoring: each True flag adds one point.
            base_flags = [
                depth_m > 3000,
                formation_pres > 300,
                formation in ['Shale','Basalt'],
                mud_wt < 9 or mud_wt > 17,
                mud_visc > 35,
                pump_pres > 80,
                pump_flow < 250 or pump_flow > 550,
                rop > 50,
                hook_load > 450,
                torque > 250,
                maint_type == 'Corrective',
                temp > 110,
                pres < 9,
                vib > 2.5,
                op_hours > 22,
                len(parts_list) > 2,
                fail_cause in ['Electrical failure','Leakage'],
                comp_part in ['Bearing','Pressure Valve']
            ]
            # Conditions exceeding the selected equipment's own ratings.
            # NOTE(review): `pres` is drawn from 8-16 while the pressure
            # ratings listed in equipment_specs start at 100, so the pressure
            # flag cannot fire with those specs — confirm whether pump or
            # formation pressure was intended here.
            spec_flags = [
                depth_m > spec['Max Depth Rating (m)'],
                pres > spec['Max Pressure Rating (bar)'],
                temp > spec['Max Temperature Rating (°C)'],
                torque > spec['Torque Capacity (kN·m)'],
                hook_load > spec['Hook Load Capacity (t)']
            ]
            total_score = sum(base_flags) + sum(spec_flags)
            # Logistic curve centred on a score of 5 (p = 0.5) with scale 3.
            p_fail = 1 / (1 + np.exp(-(total_score - 5) / 3))
            failure = int(rng.random() < p_fail)
            # 'Stopped working' is only available to failure records.
            symptom = rng.choice(ALL_SYMPTOMS if failure else NON_FAILURE_SYMPTOMS)

            rows.append({
                'Timestamp': ts,
                'Equipment': eq,
                'Category': spec['Category'],
                'Brand': spec['Brand'],
                'Model': spec['Model'],
                'Location Type': loc,
                'Water Zone': zone,
                'Formation Type': formation,
                'Drilling Depth (m)': depth_m,
                'Formation Pressure (bar)': formation_pres,
                'Mud Weight (ppg)': mud_wt,
                'Mud Viscosity (cP)': mud_visc,
                'Pump Pressure (bar)': pump_pres,
                'Pump Flow (L/min)': pump_flow,
                'ROP (m/hr)': rop,
                'Hook Load (t)': hook_load,
                'Torque (kN·m)': torque,
                'Maintenance Type': maint_type,
                'Temperature (°C)': temp,
                'Pressure (bar)': pres,
                'Vibration (mm/s)': vib,
                'Operating Hours': op_hours,
                'Replaced Parts': replaced_parts,
                'Failure Cause': fail_cause,
                'Part': comp_part,
                'Daily Rate (USD)': spec['Daily Rate (USD)'],
                'Lead Time (days)': spec['Lead Time (days)'],
                'In Stock': spec['In Stock'],
                'Failure Class': failure,
                'Observed Symptom': symptom
            })

    df = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
    if df.empty:
        # Nothing was generated (e.g. empty date range): return the empty
        # frame with its schema instead of failing in the balancing step.
        return df
    return _balance_failure_classes(df)

In [3]:
# Build the ID/name records for every entry of equipment_names and display them
equipment_list = generate_equipment()
equipment_list

[{'ID': 1, 'Name': 'Land-based Rotary Rig'},
 {'ID': 2, 'Name': 'Jack-up Rig'},
 {'ID': 3, 'Name': 'Semi-submersible Rig'},
 {'ID': 4, 'Name': 'Drillship'},
 {'ID': 5, 'Name': 'PDC Drill Bit'},
 {'ID': 6, 'Name': 'Tricone Drill Bit'},
 {'ID': 7, 'Name': 'Mud Motor'},
 {'ID': 8, 'Name': 'Mud Pump'},
 {'ID': 9, 'Name': 'Shale Shaker'},
 {'ID': 10, 'Name': 'Desander'},
 {'ID': 11, 'Name': 'Degasser'},
 {'ID': 12, 'Name': 'Blowout Preventer'},
 {'ID': 13, 'Name': 'Top Drive'},
 {'ID': 14, 'Name': 'Casing Head'},
 {'ID': 15, 'Name': 'Cementing Unit'},
 {'ID': 16, 'Name': 'Coiled Tubing Unit'},
 {'ID': 17, 'Name': 'Workover Rig'},
 {'ID': 18, 'Name': 'Wireline Unit'},
 {'ID': 19, 'Name': 'Subsea Tree'},
 {'ID': 20, 'Name': 'Christmas Tree'},
 {'ID': 21, 'Name': 'Wellhead'},
 {'ID': 22, 'Name': 'Flowline'},
 {'ID': 23, 'Name': 'Separator'},
 {'ID': 24, 'Name': 'Heater Treater'},
 {'ID': 25, 'Name': 'Storage Tank'},
 {'ID': 26, 'Name': 'Gas Compressor'},
 {'ID': 27, 'Name': 'Gas Dehydrator'},


In [4]:
# Generate data for every day from 2024-05-01 through 2025-05-01 with
# 3000-4000 random samples per day (the function defaults are 5-15 per day).
# NOTE: this builds over a million records in a plain Python loop before
# class balancing, so the cell takes a while to run.
data_df = generate_data('2024-05-01', '2025-05-01', min_per_day=3000, max_per_day=4000)
data_df

Unnamed: 0,Timestamp,Equipment,Category,Brand,Model,Location Type,Water Zone,Formation Type,Drilling Depth (m),Formation Pressure (bar),...,Vibration (mm/s),Operating Hours,Replaced Parts,Failure Cause,Part,Daily Rate (USD),Lead Time (days),In Stock,Failure Class,Observed Symptom
0,2024-05-08 08:06:17.814980420,Mud Motor,Motor,FlowTech,MM-50,Onshore,Onshore,Sandstone,1135.961362,427.261027,...,2.612195,21,"Bearing, Compressor, Heat Exchanger",Mechanical issue,Flow Control Valve,800,7,True,0,Pressure loss
1,2025-03-01 04:53:47.781161781,Semi-submersible Rig,Rig,SubSeaWorks,SSR-300,Onshore,Onshore,Limestone,4957.834839,131.721585,...,2.170569,22,Heat Exchanger,Natural wear,Oil Filter,75000,10,True,0,Oil flow anomaly
2,2024-10-12 13:10:32.519667170,Land-based Rotary Rig,Rig,RigCo,LR-100,Onshore,Onshore,Shale,769.254194,485.448051,...,2.018211,16,Filter,Electrical failure,Flow Control Valve,50000,14,True,0,Fluid leakage
3,2024-07-04 08:03:06.777571181,Tricone Drill Bit,Drill Bit,RockDrill,TRC-150,Offshore,Negligible,Granite,2673.449908,62.721562,...,1.320961,17,"Heat Exchanger, Valve, Compressor",,Heat Exchanger,1000,3,True,1,Lack of pressure
4,2024-06-10 09:26:09.972010656,Mud Pump,Pump,PumpMaster,MP-300,Offshore,Mid-shallow,Basalt,1273.584161,71.246565,...,1.145536,20,Compressor,Leakage,Heat Exchanger,2000,10,True,1,Pressure loss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
878639,2024-08-12 23:13:07.481714981,Mud Motor,Motor,FlowTech,MM-50,Offshore,Mid-shallow,Limestone,959.312486,489.736105,...,2.198884,17,"Bearing, Heat Exchanger",Mechanical issue,Pressure Valve,800,7,True,0,Lack of pressure
878640,2025-04-20 21:06:53.500063338,Land-based Rotary Rig,Rig,RigCo,LR-100,Offshore,Deep,Limestone,1691.841286,332.776652,...,2.797736,23,"Filter, Heat Exchanger, Valve",Natural wear,Heat Exchanger,50000,14,True,0,Lack of pressure
878641,2024-09-04 23:12:52.413330379,Mud Motor,Motor,FlowTech,MM-50,Onshore,Onshore,Granite,2947.053639,372.133877,...,2.247256,18,Compressor,Electrical failure,Bearing,800,7,True,0,Lack of pressure
878642,2025-03-14 08:18:30.798608049,Land-based Rotary Rig,Rig,RigCo,LR-100,Offshore,Shallow,Dolomite,3156.874161,272.261662,...,1.349463,18,Filter,Natural wear,Heat Exchanger,50000,14,True,1,Stopped working


In [5]:
# Count rows that are exact duplicates of an earlier row
data_df.duplicated().sum()

0

In [6]:
# Class proportions; generate_data() downsamples so both classes are equally represented
data_df['Failure Class'].value_counts(normalize=True)

Failure Class
0    0.5
1    0.5
Name: proportion, dtype: float64

In [7]:
# Missing values per column
data_df.isnull().sum()

Timestamp                   0
Equipment                   0
Category                    0
Brand                       0
Model                       0
Location Type               0
Water Zone                  0
Formation Type              0
Drilling Depth (m)          0
Formation Pressure (bar)    0
Mud Weight (ppg)            0
Mud Viscosity (cP)          0
Pump Pressure (bar)         0
Pump Flow (L/min)           0
ROP (m/hr)                  0
Hook Load (t)               0
Torque (kN·m)               0
Maintenance Type            0
Temperature (°C)            0
Pressure (bar)              0
Vibration (mm/s)            0
Operating Hours             0
Replaced Parts              0
Failure Cause               0
Part                        0
Daily Rate (USD)            0
Lead Time (days)            0
In Stock                    0
Failure Class               0
Observed Symptom            0
dtype: int64

In [8]:
# Number of distinct values per column
data_df.nunique()

Timestamp                   878644
Equipment                        9
Category                         5
Brand                            9
Model                            9
Location Type                    2
Water Zone                       7
Formation Type                   7
Drilling Depth (m)          878644
Formation Pressure (bar)    878644
Mud Weight (ppg)            878644
Mud Viscosity (cP)          878644
Pump Pressure (bar)         878644
Pump Flow (L/min)           878644
ROP (m/hr)                  878644
Hook Load (t)               878644
Torque (kN·m)               878644
Maintenance Type                 2
Temperature (°C)            878644
Pressure (bar)              878644
Vibration (mm/s)            878644
Operating Hours                  9
Replaced Parts                  85
Failure Cause                    5
Part                             5
Daily Rate (USD)                 9
Lead Time (days)                 7
In Stock                         2
Failure Class       

In [9]:
# Write the dataset as CSV without the index column.
# NOTE(review): the target path "data_df" has no .csv extension — confirm that
# downstream readers expect exactly this file name before changing it.
data_df.to_csv("data_df", index=False)