In [1]:
import xgboost as xgb
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/kaggle/input/oil-n-gas-maintenance/df_final.csv')
df.columns

Index(['Timestamp', 'Equipment', 'Category', 'Brand', 'Model', 'Location Type',
       'Water Zone', 'Formation Type', 'Drilling Depth (m)',
       'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
       'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
       'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
       'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)',
       'Operating Hours', 'Replaced Parts', 'Failure Cause', 'Part',
       'Daily Rate (USD)', 'Lead Time (days)', 'In Stock', 'Failure Class',
       'Observed Symptom'],
      dtype='object')

In [9]:
print(len(df.columns)), df.columns
df = df.drop(columns=["Timestamp"])

30


In [10]:
clean = df['Replaced Parts'].str.replace(r',\s*', ',', regex=True)

# 2) then get_dummies on the literal comma
dummies = clean.str.get_dummies(sep=',')

# 3) optional: strip any whitespace from the column names
dummies.columns = dummies.columns.str.strip()

# 4) join back to your original DataFrame
df_new = pd.concat([df, dummies], axis=1)
len(df_new.columns), df_new.columns

(34,
 Index(['Equipment', 'Category', 'Brand', 'Model', 'Location Type',
        'Water Zone', 'Formation Type', 'Drilling Depth (m)',
        'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
        'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
        'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
        'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)',
        'Operating Hours', 'Replaced Parts', 'Failure Cause', 'Part',
        'Daily Rate (USD)', 'Lead Time (days)', 'In Stock', 'Failure Class',
        'Observed Symptom', 'Bearing', 'Compressor', 'Filter', 'Heat Exchanger',
        'Valve'],
       dtype='object'))

In [15]:
similarity_cols = [
    'Drilling Depth (m)',
    'Formation Pressure (bar)',
    'Temperature (°C)',
    'Pump Pressure (bar)',
    'Vibration (mm/s)',
    'Operating Hours'
]

X = df_new.drop('Failure Class', axis=1)
y = df_new['Failure Class']

categorical_cols = X.select_dtypes(include='object').columns
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier()
model.load_model("/kaggle/input/xgboost-dl-project/pytorch/default/1/best_xgb.json") 

In [16]:
len(X['Equipment'].unique())

9

In [17]:
df_new.columns, df_new.shape, X_test.columns, X_test.shape

(Index(['Equipment', 'Category', 'Brand', 'Model', 'Location Type',
        'Water Zone', 'Formation Type', 'Drilling Depth (m)',
        'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
        'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
        'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
        'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)',
        'Operating Hours', 'Replaced Parts', 'Failure Cause', 'Part',
        'Daily Rate (USD)', 'Lead Time (days)', 'In Stock', 'Failure Class',
        'Observed Symptom', 'Bearing', 'Compressor', 'Filter', 'Heat Exchanger',
        'Valve'],
       dtype='object'),
 (876264, 34),
 Index(['Equipment', 'Category', 'Brand', 'Model', 'Location Type',
        'Water Zone', 'Formation Type', 'Drilling Depth (m)',
        'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
        'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
        'Hook Load (t)', 'Torque (kN·m)

In [18]:
test_df = X_test.copy()
y_predict = model.predict(X_test)

scaler = StandardScaler()
test_df_scaled = scaler.fit_transform(test_df[similarity_cols])

In [19]:
def recommend_safe_equipment(input_conditions, top_k=5):
    """
    Recommend equipment from the test set operating under similar conditions predicted as safe.
    """
    input_df = pd.DataFrame([input_conditions])
    input_scaled = scaler.transform(input_df)

    # Compute cosine similarity with test data
    similarities = cosine_similarity(input_scaled, test_df_scaled).flatten()

    # Create result DataFrame
    result_df = test_df.copy()
    result_df["Similarity"] = similarities

    # Filter only predicted safe equipment
    safe_df = result_df[result_df["Failure Cause"] == 0]

    # Sort by similarity
    recommended = safe_df.sort_values(by="Similarity", ascending=False).head(top_k)

    return recommended[[
        'Equipment', 'Similarity'
    ]]
    # 'Category', 'Brand', 'Model',
    #     'Temperature (°C)', 'Pressure (bar)', 'Drilling Depth (m)', 'Mud Viscosity (cP)',
    #     'Predicted_Failure'

In [20]:
input_example = {
    'Drilling Depth (m)': 3100,
    'Formation Pressure (bar)': 250,
    'Temperature (°C)': 105,
    'Pump Pressure (bar)': 60,
    'Vibration (mm/s)': 1.5,
    'Operating Hours': 20
}

recommendations = recommend_safe_equipment(input_example)

In [21]:
for index, row in recommendations.iterrows():
    print(row.to_dict())

{'Equipment': 3.0, 'Similarity': 0.970195860609422}
{'Equipment': 0.0, 'Similarity': 0.9696203925497728}
{'Equipment': 3.0, 'Similarity': 0.9686891330483423}
{'Equipment': 8.0, 'Similarity': 0.9594613992600198}
{'Equipment': 0.0, 'Similarity': 0.9578297416576291}
