In [7]:
import pandas as pd

# Load the dataset
file_path = 'full_devices.csv'
df = pd.read_csv(file_path, encoding='latin1')

# Display basic info and first few rows
df.info()
# Preview unique devices and columns
unique_devices = df['device'].nunique()
print(f"Number of unique devices: {unique_devices}")
# Display covered period
covered_period = df['date'].min(), df['date'].max()
print(f"Covered period: {covered_period}")
# Evaluate if all devices in the database had at least one failure
devices_with_failure = df.groupby('device')['failure'].max()
all_failed = devices_with_failure.all()
print(f"Did all devices have at least one failure? {all_failed}")
print(f"Number of devices that never failed: {(devices_with_failure == 0).sum()}")
# Number of devices that had at least one failure
num_failed_devices = (devices_with_failure > 0).sum()
print(f"Number of devices that had at least one failure: {num_failed_devices}")
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   date        124494 non-null  object
 1   device      124494 non-null  object
 2   failure     124494 non-null  int64 
 3   attribute1  124494 non-null  int64 
 4   attribute2  124494 non-null  int64 
 5   attribute3  124494 non-null  int64 
 6   attribute4  124494 non-null  int64 
 7   attribute5  124494 non-null  int64 
 8   attribute6  124494 non-null  int64 
 9   attribute7  124494 non-null  int64 
 10  attribute8  124494 non-null  int64 
 11  attribute9  124494 non-null  int64 
dtypes: int64(10), object(2)
memory usage: 11.4+ MB
Number of unique devices: 1169
Covered period: ('2015-01-01', '2015-11-02')
Did all devices have at least one failure? False
Number of devices that never failed: 1063
Number of devices that had at least one failure: 106


Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [11]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier  # <- Novo modelo

# Modelos existentes
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMClassifier

# --- Load Data ---
df = pd.read_csv('full_devices.csv', encoding='latin1')
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['device', 'date']).reset_index(drop=True)

# --- Create Target (Predicting one day before failure) ---
df['target'] = df.groupby('device')['failure'].shift(-1).fillna(0)

# --- Rolling Statistics ---
rolling_window = 5
attr_cols = [f'attribute{i}' for i in range(1, 10)]

for col in attr_cols:
    df[f'{col}_roll_mean'] = df.groupby('device')[col].transform(lambda x: x.rolling(window=rolling_window, min_periods=1).mean())
    df[f'{col}_roll_std'] = df.groupby('device')[col].transform(lambda x: x.rolling(window=rolling_window, min_periods=1).std().fillna(0))

# --- Device Stratified Train/Test Split ---
device_failures = df.groupby('device')['failure'].max()
failed_devices = device_failures[device_failures == 1].index
healthy_devices = device_failures[device_failures == 0].index

train_failed, test_failed = train_test_split(failed_devices, test_size=0.2, random_state=42)
train_healthy, test_healthy = train_test_split(healthy_devices, test_size=0.2, random_state=42)

train_devices = list(train_failed) + list(train_healthy)
test_devices = list(test_failed) + list(test_healthy)

train_df = df[df['device'].isin(train_devices)].copy()
test_df = df[df['device'].isin(test_devices)].copy()

# --- Prepare Feature Matrix ---
feature_cols = attr_cols + [f'{col}_roll_mean' for col in attr_cols] + [f'{col}_roll_std' for col in attr_cols]

X_train = train_df[feature_cols]
y_train = train_df['target']
X_test = test_df[feature_cols]
y_test = test_df['target']

# --- Scale Features ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Apply SMOTE to Training Data ---
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# --- Train and Evaluate Models ---
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Balanced RF": BalancedRandomForestClassifier(random_state=42),  # <- Novo modelo adicionado
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "HistGradientBoosting": HistGradientBoostingClassifier(),
    "LightGBM": LGBMClassifier()
}

def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=4))

for name, model in models.items():
    model.fit(X_resampled, y_resampled)
    evaluate_model(name, model, X_test_scaled, y_test)



=== Random Forest ===
[[25337   118]
 [   22     0]]
              precision    recall  f1-score   support

         0.0     0.9991    0.9954    0.9972     25455
         1.0     0.0000    0.0000    0.0000        22

    accuracy                         0.9945     25477
   macro avg     0.4996    0.4977    0.4986     25477
weighted avg     0.9983    0.9945    0.9964     25477


=== Balanced RF ===
[[25348   107]
 [   21     1]]
              precision    recall  f1-score   support

         0.0     0.9992    0.9958    0.9975     25455
         1.0     0.0093    0.0455    0.0154        22

    accuracy                         0.9950     25477
   macro avg     0.5042    0.5206    0.5064     25477
weighted avg     0.9983    0.9950    0.9966     25477


=== Logistic Regression ===
[[24054  1401]
 [    7    15]]
              precision    recall  f1-score   support

         0.0     0.9997    0.9450    0.9716     25455
         1.0     0.0106    0.6818    0.0209        22

    accuracy    

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBoost ===
[[25384    71]
 [   21     1]]
              precision    recall  f1-score   support

         0.0     0.9992    0.9972    0.9982     25455
         1.0     0.0139    0.0455    0.0213        22

    accuracy                         0.9964     25477
   macro avg     0.5065    0.5213    0.5097     25477
weighted avg     0.9983    0.9964    0.9973     25477


=== HistGradientBoosting ===
[[25317   138]
 [   18     4]]
              precision    recall  f1-score   support

         0.0     0.9993    0.9946    0.9969     25455
         1.0     0.0282    0.1818    0.0488        22

    accuracy                         0.9939     25477
   macro avg     0.5137    0.5882    0.5229     25477
weighted avg     0.9985    0.9939    0.9961     25477

[LightGBM] [Info] Number of positive: 98933, number of negative: 98933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008893 seconds.
You can set `force_row_wise=true` to remove the overhead.
And 

