In [15]:
import pandas as pd

df = pd.read_parquet("../data/processed/feature_engineered.parquet")

target = "fail_in_7_days"

drop_cols = [
    target,
    "failure",
    "date",
    "serial_number",
    "model",
    "capacity_bytes",
    "fail_date"
]

X = df.drop(columns=drop_cols)
y = df[target]

In [16]:
X

Unnamed: 0,smart_5_raw,smart_9_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
3223588,,,,,,
3223589,,,,,,
3223590,,,,,,
3223591,,,,,,


In [17]:
y

0          0
1          0
2          0
3          0
4          0
          ..
3223588    0
3223589    0
3223590    0
3223591    0
3223592    0
Name: fail_in_7_days, Length: 3223593, dtype: int64

In [21]:
df[["smart_5_raw",
    "smart_9_raw",
    "smart_187_raw",
    "smart_188_raw",
    "smart_197_raw",
    "smart_198_raw"]].isna().mean()

smart_5_raw      0.007019
smart_9_raw      0.001338
smart_187_raw    0.653395
smart_188_raw    0.655199
smart_197_raw    0.026940
smart_198_raw    0.008002
dtype: float64

In [19]:
# add missing indicators
for col in ["smart_187_raw", "smart_188_raw"]:
    X[col + "_missing"] = X[col].isna().astype(int)

# then fill
X = X.fillna(0)
X

Unnamed: 0,smart_5_raw,smart_9_raw,smart_187_raw,smart_188_raw,smart_197_raw,smart_198_raw,smart_187_raw_missing,smart_188_raw_missing
0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...
3223588,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3223589,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3223590,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3223591,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [20]:
y.value_counts()

fail_in_7_days
0    3223219
1        374
Name: count, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

In [23]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00    644644
           1       0.01      0.37      0.02        75

    accuracy                           1.00    644719
   macro avg       0.51      0.68      0.51    644719
weighted avg       1.00      1.00      1.00    644719

ROC-AUC: 0.8243277529923492


In [25]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[642383   2261]
 [    47     28]]


In [26]:
from sklearn.metrics import average_precision_score
print("PR-AUC:", average_precision_score(y_test, y_prob))

PR-AUC: 0.11466623645397678


In [27]:
import numpy as np
from sklearn.metrics import precision_score, recall_score

thresholds = np.linspace(0.01, 0.5, 20)

for t in thresholds:
    y_pred_t = (y_prob >= t).astype(int)
    r = recall_score(y_test, y_pred_t)
    p = precision_score(y_test, y_pred_t)
    print(f"Threshold={t:.2f} | Recall={r:.3f} | Precision={p:.3f}")


Threshold=0.01 | Recall=0.653 | Precision=0.019
Threshold=0.04 | Recall=0.627 | Precision=0.020
Threshold=0.06 | Recall=0.587 | Precision=0.019
Threshold=0.09 | Recall=0.573 | Precision=0.018
Threshold=0.11 | Recall=0.573 | Precision=0.018
Threshold=0.14 | Recall=0.573 | Precision=0.019
Threshold=0.16 | Recall=0.533 | Precision=0.017
Threshold=0.19 | Recall=0.507 | Precision=0.016
Threshold=0.22 | Recall=0.493 | Precision=0.016
Threshold=0.24 | Recall=0.493 | Precision=0.016
Threshold=0.27 | Recall=0.493 | Precision=0.016
Threshold=0.29 | Recall=0.480 | Precision=0.016
Threshold=0.32 | Recall=0.453 | Precision=0.015
Threshold=0.35 | Recall=0.427 | Precision=0.014
Threshold=0.37 | Recall=0.413 | Precision=0.013
Threshold=0.40 | Recall=0.400 | Precision=0.013
Threshold=0.42 | Recall=0.387 | Precision=0.013
Threshold=0.45 | Recall=0.387 | Precision=0.013
Threshold=0.47 | Recall=0.373 | Precision=0.012
Threshold=0.50 | Recall=0.373 | Precision=0.012
