In [15]:
import xgboost as xgb
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv(r'C:\Users\jinay\OneDrive\Documents\VSCodes\Seismic Seekers\data_df.csv')
df.columns

Index(['Timestamp', 'Equipment', 'Category', 'Brand', 'Model', 'Location Type',
       'Water Zone', 'Formation Type', 'Drilling Depth (m)',
       'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
       'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
       'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
       'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)',
       'Operating Hours', 'Replaced Parts', 'Failure Cause', 'Part',
       'Daily Rate (USD)', 'Lead Time (days)', 'In Stock', 'Failure Class',
       'Observed Symptom'],
      dtype='object')

In [3]:
print(len(df.columns)), df.columns
df = df.drop(columns=["Timestamp"])

30


In [4]:
clean = df['Replaced Parts'].str.replace(r',\s*', ',', regex=True)

# 2) then get_dummies on the literal comma
dummies = clean.str.get_dummies(sep=',')

# 3) optional: strip any whitespace from the column names
dummies.columns = dummies.columns.str.strip()

# 4) join back to your original DataFrame
df_new = pd.concat([df, dummies], axis=1)
len(df_new.columns), df_new.columns

(34,
 Index(['Equipment', 'Category', 'Brand', 'Model', 'Location Type',
        'Water Zone', 'Formation Type', 'Drilling Depth (m)',
        'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
        'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
        'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
        'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)',
        'Operating Hours', 'Replaced Parts', 'Failure Cause', 'Part',
        'Daily Rate (USD)', 'Lead Time (days)', 'In Stock', 'Failure Class',
        'Observed Symptom', 'Bearing', 'Compressor', 'Filter', 'Heat Exchanger',
        'Valve'],
       dtype='object'))

In [10]:
similarity_cols = [
    'Drilling Depth (m)',
    'Formation Pressure (bar)',
    'Temperature (°C)',
    'Pump Pressure (bar)',
    'Vibration (mm/s)',
    'Operating Hours'
]

X = df_new.drop(columns={'Failure Class', 'Replaced Parts'}, axis=1)
y = df_new['Failure Class']

categorical_cols = X.select_dtypes(include='object').columns
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier()
model.load_model(r'C:\Users\jinay\OneDrive\Documents\VSCodes\Seismic Seekers\best_xgb.json') 

In [11]:
len(X['Equipment'].unique())

50

In [12]:
df_new.columns, df_new.shape, X_test.columns, X_test.shape

(Index(['Equipment', 'Category', 'Brand', 'Model', 'Location Type',
        'Water Zone', 'Formation Type', 'Drilling Depth (m)',
        'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
        'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
        'Hook Load (t)', 'Torque (kN·m)', 'Maintenance Type',
        'Temperature (°C)', 'Pressure (bar)', 'Vibration (mm/s)',
        'Operating Hours', 'Replaced Parts', 'Failure Cause', 'Part',
        'Daily Rate (USD)', 'Lead Time (days)', 'In Stock', 'Failure Class',
        'Observed Symptom', 'Bearing', 'Compressor', 'Filter', 'Heat Exchanger',
        'Valve'],
       dtype='object'),
 (800736, 34),
 Index(['Equipment', 'Category', 'Brand', 'Model', 'Location Type',
        'Water Zone', 'Formation Type', 'Drilling Depth (m)',
        'Formation Pressure (bar)', 'Mud Weight (ppg)', 'Mud Viscosity (cP)',
        'Pump Pressure (bar)', 'Pump Flow (L/min)', 'ROP (m/hr)',
        'Hook Load (t)', 'Torque (kN·m)

In [13]:
test_df = X_test.copy()
y_predict = model.predict(X_test)

scaler = StandardScaler()
test_df_scaled = scaler.fit_transform(test_df[similarity_cols])

In [23]:
def recommend_safe_equipment(input_conditions, top_k=5, alpha=0.5, model=None, model_feature_cols=None, xgb_threshold=0.5, encoders_override=None):
    input_df = pd.DataFrame([input_conditions])
    input_scaled = scaler.transform(input_df)
    similarities = cosine_similarity(input_scaled, test_df_scaled).flatten()

    result_df = test_df.copy()
    result_df["Similarity"] = similarities

    sim = result_df["Similarity"].values
    sim_min, sim_max = float(np.min(sim)), float(np.max(sim))
    result_df["Similarity_Norm"] = (sim - sim_min) / (sim_max - sim_min) if sim_max > sim_min else sim

    prob_safe = np.ones(len(result_df), dtype=float)

    if model is not None:
        model_features = None
        if model_feature_cols is not None:
            model_features = list(model_feature_cols)
        elif hasattr(model, "feature_names_in_"):
            model_features = list(model.feature_names_in_)
        else:
            try:
                model_features = list(model.get_booster().feature_names)
            except Exception:
                try:
                    model_features = list(X.columns)
                except Exception:
                    drop_like = {"Failure Class", "Failure Cause", "Replaced Parts"}
                    model_features = [c for c in result_df.columns if c not in drop_like]

        X_test_model = pd.DataFrame(index=result_df.index)
        for col in model_features:
            if col in result_df.columns:
                X_test_model[col] = result_df[col]
            else:
                X_test_model[col] = 0

        encs = encoders_override if encoders_override is not None else globals().get("encoders", {})
        if encs:
            for col, le in encs.items():
                if col in X_test_model.columns:
                    mapping = {cls: idx for idx, cls in enumerate(le.classes_)}
                    X_test_model[col] = (
                        X_test_model[col].astype(str).map(mapping).fillna(-1).astype(int)
                    )

        for col in X_test_model.columns:
            if X_test_model[col].dtype == object:
                X_test_model[col] = pd.to_numeric(X_test_model[col], errors="coerce").fillna(-1)

        proba = model.predict_proba(X_test_model)

        if hasattr(model, "classes_"):
            cls_list = list(model.classes_)
            safe_idx = cls_list.index(0) if 0 in cls_list else 0
        else:
            safe_idx = 0

        prob_safe = proba[:, safe_idx]

    result_df["Prob_Safe"] = prob_safe
    result_df["BlendScore"] = alpha * result_df["Similarity_Norm"] + (1.0 - alpha) * result_df["Prob_Safe"]

    if model is not None:
        candidates = result_df[result_df["Prob_Safe"] >= xgb_threshold]
    else:
        safe_label_col = "Failure Cause" if "Failure Cause" in result_df.columns else "Failure Class"
        candidates = result_df[result_df[safe_label_col] == 0]

    cols = ["Equipment", "Similarity", "Prob_Safe", "BlendScore"]
    for c in cols:
        if c not in candidates.columns:
            candidates[c] = np.nan

    return candidates.sort_values(by="BlendScore", ascending=False).head(top_k)[cols].reset_index(drop=True)

In [24]:
input_example = {
    'Drilling Depth (m)': 3100,
    'Formation Pressure (bar)': 250,
    'Temperature (°C)': 105,
    'Pump Pressure (bar)': 60,
    'Vibration (mm/s)': 1.5,
    'Operating Hours': 20
}

recs = recommend_safe_equipment(input_example, top_k=5, alpha=0.5, model=model, xgb_threshold=0.5)

In [25]:
for index, row in recs.iterrows():
    print(row.to_dict())

{'Equipment': 19.0, 'Similarity': 0.9572425835939413, 'Prob_Safe': 0.6490461230278015, 'BlendScore': 0.8194287727601983}
{'Equipment': 8.0, 'Similarity': 0.8342498217163408, 'Prob_Safe': 0.7025080323219299, 'BlendScore': 0.8149196140451453}
{'Equipment': 1.0, 'Similarity': 0.8181247288031616, 'Prob_Safe': 0.6919275522232056, 'BlendScore': 0.8055336067092196}
{'Equipment': 32.0, 'Similarity': 0.813616417916691, 'Prob_Safe': 0.6895014643669128, 'BlendScore': 0.803175453581028}
{'Equipment': 21.0, 'Similarity': 0.8440817552523082, 'Prob_Safe': 0.6709591746330261, 'BlendScore': 0.8016424924693772}
