In [33]:
import os
import gc
import sys
import uuid
import math
import copy
import time
import glob
import joblib
import pathlib
import warnings
import itertools
from pprint import pprint
from typing import List, Tuple

warnings.filterwarnings("ignore")

In [28]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import r2_score
from sklearn.metrics import silhouette_score
from sklearn.metrics import accuracy_score, classification_report

In [3]:
LABELS = ["fault_type", "fault_source"]

In [4]:
def load_df(source: str) -> pd.DataFrame:
    root_dir = os.path.join(os.getcwd(), "Train")

    df_list = []
    for csv_name in os.listdir(root_dir):
        if not csv_name.endswith(".csv"):
            continue
        
        csv_path = os.path.join(root_dir, csv_name)
        df = pd.read_csv(csv_path)
        df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
        df = df.set_index("timestamp")
        df_list.append(df)
        
    df = pd.concat(df_list, axis=1)
    
    return df

def reduce_mem_usage(df: pd.DataFrame, verbose: bool = True):
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    changes = {}

    for col in df.columns:
        s = df[col]
        old_dtype = s.dtype

        # Skip datetime, timedelta, bool
        if (pd.api.types.is_datetime64_any_dtype(s) or
            pd.api.types.is_timedelta64_dtype(s) or
            pd.api.types.is_bool_dtype(s)):
            continue

        # Numeric columns
        if pd.api.types.is_numeric_dtype(s):
            c_min = s.min(skipna=True)
            c_max = s.max(skipna=True)

            if pd.api.types.is_integer_dtype(s):
                # Downcast integers (keep sign)
                if c_min >= 0:
                    if c_max <= np.iinfo(np.uint8).max:
                        df[col] = s.astype(np.uint8)
                    elif c_max <= np.iinfo(np.uint16).max:
                        df[col] = s.astype(np.uint16)
                    elif c_max <= np.iinfo(np.uint32).max:
                        df[col] = s.astype(np.uint32)
                    else:
                        df[col] = s.astype(np.uint64)
                else:
                    if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                        df[col] = s.astype(np.int8)
                    elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                        df[col] = s.astype(np.int16)
                    elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                        df[col] = s.astype(np.int32)
                    else:
                        df[col] = s.astype(np.int64)

            elif pd.api.types.is_float_dtype(s):
                # Prefer float32 for stability; use float16 only if you really want it
                if (c_min > np.finfo(np.float32).min) and (c_max < np.finfo(np.float32).max):
                    df[col] = s.astype(np.float32)
                else:
                    df[col] = s.astype(np.float64)

        # Object columns: optionally convert to category (avoid numeric comparisons!)
        elif pd.api.types.is_object_dtype(s):
            nunique = s.nunique(dropna=True)
            if nunique / max(1, len(s)) < 0.5:
                df[col] = s.astype("category")

        # record change
        if df[col].dtype != old_dtype:
            changes[col] = f"{old_dtype} -> {df[col].dtype}"

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        print(f"Memory before: {start_mem:,.2f} MB")
        print(f"Memory after : {end_mem:,.2f} MB")
        
        if 0 < start_mem:
            print(f"Decreased by : {(100*(start_mem-end_mem)/start_mem):.1f}%")
        if changes:
            print("Changed dtypes:")
            for k, v in changes.items():
                print(f"  - {k}: {v}")

    return df


class JointLabelEncoder:
    def __init__(self, columns: List[str]):
        self.columns = columns
        self.tupel_to_label = {}

    def fit(self, df: pd.DataFrame) -> None:
        ldf = df.dropna(subset=self.columns)
        for idx, row in ldf.iterrows():
            t = tuple(row[self.columns])
            self.tupel_to_label.setdefault(t, len(self.tupel_to_label))
        
    def transform(self, df: pd.DataFrame) -> pd.Series:
        labels = {}
        ldf = df.dropna(subset=self.columns)
        for ts, row in ldf.iterrows():
            t = tuple(row[self.columns])
            label = self.tupel_to_label[t]
            labels[ts] = label

        return pd.Series(labels)

    def fit_transform(self, df: pd.DataFrame) -> pd.Series:
        self.fit(df)
        return self.transform(df)

    def reverse(self, s: pd.Series) -> pd.DataFrame:
        pass



def multivariate_sliding_window(data, window_size, horizon=1):
    # data: [T, C]
    X = sliding_window_view(data, (window_size, data.shape[1]))  # [T-W+1, W, C]
    y = data[window_size + horizon - 1 :, :]                     # target per channel
    X = X[:len(y)]
    return X, y

In [5]:
sdf = load_df("Train")
sdf = reduce_mem_usage(sdf)
# sdf = sdf.dropna()


sdf

Memory before: 6.09 MB
Memory after : 2.12 MB
Decreased by : 65.1%
Changed dtypes:
  - Pressure_kPa: float64 -> float32
  - pH_units: float64 -> float32
  - Torque_Nm: float64 -> float32
  - Speed_RPM: float64 -> float32
  - BearingTemp_C: float64 -> float32
  - FlowRate_L_min: float64 -> float32
  - fault_type: object -> category
  - fault_source: object -> category
  - VibVelocity_mm_s: float64 -> float32
  - Voltage_V: float64 -> float32
  - VibDisp_mm: float64 -> float32
  - Power_kW: float64 -> float32
  - Temperature_C: float64 -> float32
  - OilLevel_cm: float64 -> float32
  - VibAccel_m_s2: float64 -> float32
  - Humidity_pct: float64 -> float32


Unnamed: 0_level_0,Pressure_kPa,pH_units,Torque_Nm,Speed_RPM,BearingTemp_C,FlowRate_L_min,fault_type,fault_source,VibVelocity_mm_s,Voltage_V,VibDisp_mm,Power_kW,Temperature_C,OilLevel_cm,VibAccel_m_s2,Humidity_pct
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2025-01-01 00:00:09,,,,,,,,,,,,54.434551,56.498234,,,
2025-01-01 00:00:16,369.269531,,,1308.442505,,,,,,232.035858,,47.701221,52.324596,,,
2025-01-01 00:00:32,,,,,,,,,,,,45.466019,55.082039,,,
2025-01-01 00:00:48,351.796448,,,1604.820679,,,,,,230.535553,,48.191322,,,,
2025-01-01 00:01:08,,,95.619347,,,,,,10.41659,,,48.638268,56.228180,,,61.579983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-01 17:37:09,,,,,,,,,,,,59.509815,71.231041,,,
2025-01-01 17:37:10,370.289703,,,1552.804321,,,,,,234.387787,,59.508759,71.231041,,,
2025-01-01 17:37:11,,,,,,,,,,,,59.507698,71.231041,,,
2025-01-01 17:37:12,345.904724,,,1385.480957,,,,,,,,59.506641,71.231041,,,


In [24]:
le = JointLabelEncoder(columns=["fault_type", "fault_source"])

df = sdf.copy()
df["label"] = le.fit_transform(df)
df = df.drop(columns=["fault_type", "fault_source"])

for col in df.columns:
    if "label" != col:
        df[col] = df[col].ffill().bfill()
    
df = df.dropna()

print()
print(le.tupel_to_label)
print(df.label.value_counts())

df


{('sensor_fault', 'sensor_fault'): 0, ('misalignment', 'equipment_fault'): 1, ('lubrication_fault', 'equipment_fault'): 2, ('bearing_fault', 'equipment_fault'): 3}
label
2.0    825
1.0    706
3.0    665
0.0     54
Name: count, dtype: int64


Unnamed: 0_level_0,Pressure_kPa,pH_units,Torque_Nm,Speed_RPM,BearingTemp_C,FlowRate_L_min,VibVelocity_mm_s,Voltage_V,VibDisp_mm,Power_kW,Temperature_C,OilLevel_cm,VibAccel_m_s2,Humidity_pct,label
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2025-01-01 00:07:52,352.616882,7.556472,95.927788,1550.108154,70.457932,25.855133,10.351611,230.076569,0.527706,56.536087,52.652660,11.176676,28.016966,63.395599,0.0
2025-01-01 00:19:28,384.431274,7.487083,108.818893,1642.926758,75.647377,28.969305,10.677340,230.569717,0.574278,59.693821,60.871071,10.819815,27.909643,59.631771,1.0
2025-01-01 00:31:25,368.781738,7.614868,108.402390,1609.107056,77.478401,29.406160,11.859204,234.976410,0.533868,58.849117,62.487942,10.571486,27.909643,66.631966,0.0
2025-01-01 00:45:37,397.295654,7.873587,125.759087,1486.179565,79.232849,29.664143,12.325464,233.821732,0.571275,60.683193,65.308647,11.193770,29.309050,69.531822,0.0
2025-01-01 00:57:46,431.504303,7.873587,129.957077,1486.179565,83.107437,30.280823,12.220066,235.253052,0.616031,63.369164,65.219734,11.707875,29.309050,67.375435,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-01 17:36:17,321.945862,8.124319,105.631897,1594.971069,72.443832,21.010283,10.639228,234.387787,0.574580,59.564831,71.231041,11.478136,24.125578,60.932819,2.0
2025-01-01 17:36:32,364.318359,8.124319,112.926552,1634.298462,74.626236,21.601879,11.761463,234.387787,0.491628,59.548962,71.231041,12.391875,28.187197,62.542015,2.0
2025-01-01 17:36:48,316.847504,8.124319,97.825432,1647.065308,66.110825,26.574749,11.692189,234.387787,0.521074,59.532032,71.231041,11.885188,25.644306,60.717670,1.0
2025-01-01 17:37:03,370.488770,8.124319,111.315987,1551.630859,68.316170,19.345587,11.008834,234.387787,0.511307,59.516163,71.231041,10.889993,27.617172,57.397007,1.0


In [25]:
X = df.drop(columns=["label"]).values
y = df.label.to_numpy()

scaler = RobustScaler()
X = scaler.fit_transform(X)

X.shape, y.shape

((2250, 14), (2250,))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3570
[LightGBM] [Info] Number of data points in the train set: 1800, number of used features: 14
[LightGBM] [Info] Start training from score -3.734342
[LightGBM] [Info] Start training from score -1.158716
[LightGBM] [Info] Start training from score -1.003302
[LightGBM] [Info] Start training from score -1.218898


In [64]:
y_val_pred = model.predict(X_val)
print("R² Score:", r2_score(y_val, y_val_pred))
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("Report:\n", classification_report(y_val, y_val_pred))

R² Score: -0.6296900489396411
Accuracy: 0.49333333333333335
Report:
               precision    recall  f1-score   support

         0.0       0.50      1.00      0.67         1
         1.0       0.56      0.49      0.52        79
         2.0       0.55      0.60      0.57        89
         3.0       0.32      0.32      0.32        56

    accuracy                           0.49       225
   macro avg       0.48      0.60      0.52       225
weighted avg       0.49      0.49      0.49       225



In [144]:
sdf = load_df("Test")
sdf = reduce_mem_usage(sdf)

sdf

Memory before: 0.20 MB
Memory after : 0.15 MB
Decreased by : 25.2%
Changed dtypes:
  - FlowRate_L_min: float64 -> float32
  - Humidity_pct: float64 -> float32
  - pH_units: float64 -> float32
  - Voltage_V: float64 -> float32
  - OilLevel_cm: float64 -> float32
  - Power_kW: float64 -> float32
  - Pressure_kPa: float64 -> float32
  - Speed_RPM: float64 -> float32
  - VibAccel_m_s2: float64 -> float32
  - Torque_Nm: float64 -> float32
  - BearingTemp_C: float64 -> float32
  - VibDisp_mm: float64 -> float32
  - Temperature_C: float64 -> float32
  - VibVelocity_mm_s: float64 -> float32


Unnamed: 0_level_0,FlowRate_L_min,Humidity_pct,pH_units,Voltage_V,OilLevel_cm,Power_kW,Pressure_kPa,Speed_RPM,VibAccel_m_s2,Torque_Nm,BearingTemp_C,VibDisp_mm,Temperature_C,VibVelocity_mm_s
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2025-01-01 17:37:29,22.055693,61.374119,,,10.807535,59.488655,,,,93.472755,,0.556730,71.231041,11.096394
2025-01-01 17:37:44,22.945406,60.547783,8.124319,234.387787,10.872773,59.472786,345.433075,1526.930908,24.413179,104.516548,73.313477,0.495622,71.231041,11.560768
2025-01-01 17:37:59,24.287403,60.763233,,,11.582364,59.456917,,,,111.226997,,0.537948,71.231041,11.527083
2025-01-01 17:38:14,21.896917,64.372490,8.124319,234.387787,11.227240,59.441048,,1385.757812,,105.881294,71.575768,0.530531,71.231041,11.010122
2025-01-01 17:38:29,21.720751,58.418423,,,11.435504,59.425179,,,,106.391548,,0.531126,71.231041,11.402783
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-01-02 17:52:55,21.697428,49.388371,,,7.083466,46.171192,,,,85.953117,,0.404501,49.581970,8.367945
2025-01-02 18:09:17,25.074961,51.462196,7.398386,,8.938301,50.270943,,1423.196045,,90.045937,64.030128,0.546029,54.486340,9.308586
2025-01-02 18:16:34,,57.699974,,,7.676182,48.730251,,,,97.796013,,0.521224,52.557850,9.097073
2025-01-02 18:28:14,26.808767,59.045311,7.498446,228.110138,9.666829,51.476578,294.316833,1490.914673,23.707520,106.011101,63.868748,0.609531,55.993114,9.702516


In [145]:
test_df = load_df("Test")
# test_df = test_df.fillna(0)
test_df = test_df[X.columns]

X_test_scaled = scaler.transform(test_df)
y_test_hat = model.predict(X_test_scaled)

fault_type_pred = y_test_hat // 2
fault_type_pred_cat = fault_type_le.inverse_transform(fault_type_pred)

fault_source_pred = y_test_hat % 2
fault_source_pred_cat = fault_source_le.inverse_transform(fault_source_pred)

submission = pd.DataFrame({"fault_type": fault_type_pred_cat, "fault_source": fault_source_pred_cat}).round(6)

print(len(test_df))

submission

965


Unnamed: 0,fault_type,fault_source
0,lubrication_fault,equipment_fault
1,lubrication_fault,equipment_fault
2,lubrication_fault,equipment_fault
3,lubrication_fault,equipment_fault
4,lubrication_fault,equipment_fault
...,...,...
960,lubrication_fault,sensor_fault
961,lubrication_fault,sensor_fault
962,lubrication_fault,sensor_fault
963,bearing_fault,sensor_fault


<h2 dir=rtl align=right style="line-height:200%;font-family:vazir;color:#0099cc">
<font face="vazir" color="#0099cc">
<b>سلول جواب‌ساز</b>
</font>
</h2>

<p dir=rtl style="direction: rtl; text-align: justify; line-height:200%; font-family:vazir; font-size:medium">
<font face="vazir" size=3>
    برای ساخته‌شدن فایل <code>result.zip</code> سلول زیر را اجرا کنید. توجه داشته باشید که پیش از اجرای سلول زیر تغییرات اعمال شده در نت‌بوک را ذخیره کرده باشید (<code>ctrl+s</code>) در غیر این صورت، در پایان مسابقه نمره شما به صفر تغییر خواهد کرد.
    <br>
    همچنین اگر از کولب برای اجرای این فایل نوت‌بوک استفاده می‌کنید، قبل از ارسال فایل <code>result.zip</code>، آخرین نسخه‌ی نوت‌بوک خود را دانلود کرده و داخل فایل ارسالی قرار دهید.
</font>

In [146]:
import zipfile
import joblib
import os

if not os.path.exists(os.path.join(os.getcwd(), 'notebook.ipynb')):
    %notebook -e notebook.ipynb

def compress(file_names):
    print("File Paths:")
    print(file_names)
    compression = zipfile.ZIP_DEFLATED
    with zipfile.ZipFile("result.zip", mode="w") as zf:
        for file_name in file_names:
            zf.write('./' + file_name, file_name, compress_type=compression)

submission.to_csv('submission.csv', index=False)
file_names = ['notebook.ipynb', 'submission.csv']
compress(file_names)

File Paths:
['notebook.ipynb', 'submission.csv']
