# Train and Inference

In [5]:
import os
import shutil
import wget
import zipfile
import pandas as pd
import datetime
import numpy as np
from glob import glob

import sys
sys.path.append("..")
from algorithms.Networks_pytorch import *
from algorithms.Dataset_manipulation import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

In [None]:
# TODO: still a lot of work to do here

## Config

Make sure to change these configs before running the whole notebook.

In [None]:
# ---------------------------------------------------------------------------
# Dataset selection
# ---------------------------------------------------------------------------
# Drive model to work on. 'ST3000DM001' is the one that has been tested;
# 'ST4000DM000' is a possible alternative.
model = 'ST3000DM001'
# Years of data to import. A shorter alternative is ['2016', '2017', '2018'].
years = ['2014', '2015', '2016', '2017', '2018']
# Passed as `min_days` to filter_HDs_out when bad drives are filtered.
min_days_HDD = 115

# ---------------------------------------------------------------------------
# Labelling and windowing
# ---------------------------------------------------------------------------
# Passed as `days` to generate_failure_predictions.
# TODO: Can be adjusted by dynamic parameters
days_considered_as_failure = 7
enable_windowing = True
# Used as the window dimension when partitioning the dataset.
kernel_size = 32
overlap = 1

# ---------------------------------------------------------------------------
# Train/test split and class balancing
# ---------------------------------------------------------------------------
test_train_perc = 0.3
# Type of oversampling.
oversample_undersample = 2
# Balancing factor (major/minor = balancing_normal_failed).
# TODO: We can calculate the imbalance ratio of the dataset and use this ratio to adjust the balancing factor.
balancing_normal_failed = 20

# ---------------------------------------------------------------------------
# Features and classifier
# ---------------------------------------------------------------------------
# Any value other than 'None' enables automatic selection of the best features.
ranking = 'Ok'
num_features = 18
# Feature extraction (e.g. for RandomForest). Not tested.
perform_features_extraction = False
# Type of classifier.
classifier = 'LSTM'
CUDA_DEV = "0"

## Train and Inference

The main function of the code is to perform hard disk failure prediction using various classification algorithms. The process involves the following steps:

```mermaid
graph TD
    A[Start]
    B{Step 1: Load Dataset}
    C[Step 1.1: Import Data]
    D[Step 1.2: Filter Out Bad HDs]
    E[Step 1.3: Define RUL Piecewise]
    F[Step 1.4: Subflowchart: <br>Feature Selection]
    G[Step 1.5: Subflowchart: <br>Partition Dataset]
    H{Classifier Selection}
    I[RandomForest]
    J[Subflowchart: <br>TCN]
    K[Subflowchart: <br>LSTM]
    L[Feature Extraction]
    M[Reshape Data]
    N{Subflowchart: <br>Perform Classification}
    O[End]
    A --> B
    B -- Fail --> C
    C --> D
    D --> E
    E --> F
    F --> G
    B -- Success --> G
    G --> H
    H --> I
    H --> J
    H --> K
    I --> N
    J --> N
    K --> N
    N --> O
    L -- If perform_features_extraction is True --> M
    M --> N

```

In [None]:
# Location of the dataset cached by a previous run.
# NOTE(review): `history_signal` (and `features`, used below when ranking == 'None')
# are not defined in the config cell of this notebook -- they must exist before this
# cell runs; confirm where they are supposed to come from.
dataset_pickle = os.path.join('.', '..', 'output', f'{model}_Dataset_windowed_{history_signal}_rank_{ranking}_{num_features}_overlap_{overlap}.pkl')
try:
    # Step 1: Load the dataset from pkl file.
    df = pd.read_pickle(dataset_pickle)
except FileNotFoundError:
    # Only a missing cache file triggers the rebuild; any other error (for example an
    # undefined variable) is a real problem and must not be silently swallowed.
    # Step 1.1: Import the dataset from the raw data.
    if ranking == 'None':
        df = import_data(years=years, model=model, name='iSTEP', features=features)
    else:
        df = import_data(years=years, model=model, name='iSTEP')
    df.set_index(['serial_number', 'date'], inplace=True)
    print("DF index name:", df.index.names)
    print(df.head())
    # Report, per column, the percentage of rows that hold a non-missing value.
    for column in df.columns:
        filled_pct = round(df[column].notna().sum() / df.shape[0] * 100, 2)
        print(f'{column:.<27}{filled_pct}%')
    # Step 1.2: Filter out the bad HDDs.
    bad_missing_hds, bad_power_hds, df = filter_HDs_out(df, min_days=min_days_HDD, time_window='30D', tolerance=30)
    # predict_val represents the prediction value of the failure
    # validate_val represents the validation value of the failure
    # Step 1.3: Define RUL (Remaining Useful Life) piecewise
    df['predict_val'], df['validate_val'] = generate_failure_predictions(df, days=days_considered_as_failure, window=history_signal)
    if ranking != 'None':
        # Step 1.4: Feature Selection: Subflow chart of Main Classification Process
        df = feature_selection(df, num_features)
    print('Used features')
    for column in df.columns:
        print(f'{column:.<27}')
    # TODO: saving the rebuilt dataset is currently disabled (`script_dir` is not defined here).
    # print('Saving to pickle file...')
    #df.to_pickle(os.path.join(script_dir, '..', 'output', f'{model}_Dataset_windowed_{history_signal}_rank_{ranking}_{num_features}_overlap_{overlap}.pkl'))

### Step 1.5:  Partition Dataset

Partition the dataset into training and testing sets.


```mermaid

graph TD
    A[Start: dataset_partitioning] --> B[Step 1.1: Reset Index and Step 1.2: Preprocess Data]
    B --> C{Step 2: Check Windowing}
    C -- Yes --> D[Attempt to Load Pre-existing Windowed Dataset]
    D -- Success --> E[Loaded Existing Dataset]
    E --> F[Prepare Data for Modeling]
    D -- Failure --> G[Windowing Process]
    G --> F
    C -- No --> F
    F --> H{Technique Selection}
    H -- Random --> I[Random Partitioning]
    H -- HDD --> J[HDD Partitioning]
    H -- Other --> K[Other Technique]
    I --> L[Apply Sampling Techniques]
    J --> L
    K --> L
    L --> M[Final Dataset Creation]
    M --> N[Return Train and Test Sets]

```

In [None]:
# Arguments of the dataset partitioning step, taken from the notebook config.
technique = 'random'  # partitioning technique selected in Step 4
window_dim = kernel_size  # window dimension used for the windowed dataset
resampler_balancing = balancing_normal_failed  # major/minor balancing factor


#### Step 1.1: Reset Index

In [None]:
# Move the current index levels back into regular columns (done in place);
# the preprocessing step below selects 'serial_number' and 'date' by column name.
df.reset_index(inplace=True)
# Bare tuple expression: shown as the cell output to inspect the index names after the reset.
"DF index name:", df.index.names

#### Step 1.2: Preprocess Data

> ```python
> def partition(self):
> ```

In [None]:
# Columns that are kept out of the scaling step: identifiers, labels and the
# 'model' / 'capacity_bytes' columns (object dtype, while the scaler needs float64).
passthrough_columns = ['serial_number', 'date', 'failure', 'predict_val', 'validate_val', 'model', 'capacity_bytes']

# Set the pass-through columns aside, then remove them from the frame to be scaled.
temporal = df[passthrough_columns]
df.drop(columns=passthrough_columns, inplace=True)

# Normalize every remaining column to the [0, 1] range.
mms = MinMaxScaler(feature_range=(0, 1))
scaled_values = mms.fit_transform(df)
df = pd.DataFrame(scaled_values, columns=df.columns, index=df.index)  # FIXME:

# Re-attach the untouched columns next to the scaled features.
df = pd.concat([df, temporal], axis=1)


#### Step 2: Check Windowing

> ```python
> def handle_windowing(self):
> ```

In [None]:
# Start from an independent copy so the later windowing steps do not modify `df`.
windowed_df = df.copy()

##### Step 2.2: Not checking for windowing

In [None]:
# Without windowing the dataset is used as-is.
# NOTE(review): this rebinds windowed_df to `df` itself (not a copy), so later in-place
# changes to windowed_df also affect df -- confirm this is intended.
if not enable_windowing:
    windowed_df = df

##### Step 2.1: Checking for windowing

In [None]:
def perform_windowing():
    """Placeholder for the windowing step, which has not been ported yet.

    Raises:
        NotImplementedError: always.
    """
    not_ported_message = "封的真好，下次别封了"
    raise NotImplementedError(not_ported_message)

In [None]:
if enable_windowing:
    # Cached windowed dataset. The name follows the same scheme as the Step 1 cache and
    # uses the `ranking` config value (the previous `rank` name is not defined in this
    # notebook and raised a NameError that `except FileNotFoundError` did not catch).
    windowed_pickle = os.path.join(
        ".",
        "..",
        "output",
        f"{model}_Dataset_windowed_{window_dim}_rank_{ranking}_{num_features}_overlap_{overlap}.pkl",
    )
    try:
        # Step 2.1.1: If Yes, attempt to load the pre-processed windowed dataset.
        windowed_df = pd.read_pickle(windowed_pickle)
    except FileNotFoundError:
        # Step 2.1.2: If No, perform windowing on the dataset.
        print("Windowing the df")  # FIXME: Currently all columns are indexed.
        # From now on, `def perform_windowing(self):`
        # TODO: put perform_windowing here
        windowed_df = perform_windowing()
    else:
        print("Loading the windowed dataset")
        # straight to Step 3
        # return rename_columns(windowed_df)


#### Step 3: Prepare Data for Modeling

> ```python
> def rename_columns(self, df):
> ```
Note: In `Dataset_manipulation.py`, the `df` argument of this function is `windowed_df`; the returned DataFrame is ultimately assigned back to `windowed_df`.

In [None]:
# Make column names unique: the first occurrence of a name is kept unchanged,
# every repeat gets a numeric suffix (name_2, name_3, ...).
count = {}  # column name -> number of occurrences seen so far
cols = []   # de-duplicated names, in the original column order
print("\nTEST", windowed_df.columns)
for column in windowed_df.columns:
    occurrence = count.get(column, 0) + 1
    count[column] = occurrence
    cols.append(column if occurrence == 1 else f"{column}_{occurrence}")
windowed_df.columns = cols
# Order the columns alphabetically, in place.
windowed_df.sort_index(axis=1, inplace=True)

# Number of 'predict_val' columns found in the frame.
print("\nTest: ", count["predict_val"])

#### Step 4: Technique Selection

Create the training and test datasets.

> ```python
> def random_split(self, df):
> ```

> ```python
> def preprocess_random(self, df):
> ```

In [None]:
if technique == "random":
    # if self.windowing == 1:
    #     X = self.arrays_to_matrix(X)
    # TODO: port the random-split logic here; in the original module it is spread
    # across several layers of nested helper calls.
    pass  # placeholder so the cell is valid Python until the logic is ported

#  ----

----------------------------

In [None]:
def classification(X_train, Y_train, X_test, Y_test, classifier, metric, **args):
    """
    Perform classification using the specified classifier.
    --- Step 1.7: Perform Classification
    Parameters:
    - X_train (array-like): Training data features.
    - Y_train (array-like): Training data labels.
    - X_test (array-like): Test data features.
    - Y_test (array-like): Test data labels. The 'LSTM' branch reads `Y_test.values`,
      so it presumably expects a pandas object there -- TODO confirm.
    - classifier (str): The classifier to use. Options: 'RandomForest', 'TCN', 'LSTM'.
    - metric (str): The metric to evaluate the classification performance
      (only used by the 'RandomForest' branch).
    - **args: Additional arguments specific to each classifier. 'TCN' and 'LSTM'
      read the keys 'net', 'optimizer', 'epochs', 'batch_size' and 'lr'.

    Returns:
    - None

    Raises:
    - ValueError: if `classifier` is not one of the supported options.
    """
    print('Classification using {} is starting'.format(classifier))
    if classifier == 'RandomForest':
        # Step 1.7.1: Perform Classification using RandomForest: train and validate
        # with the third-party RandomForest library.
        X_train, Y_train = shuffle(X_train, Y_train)
        # Named `forest` so it cannot be confused with the notebook-level `model` config value.
        forest = RandomForestClassifier(n_estimators=30, min_samples_split=10, random_state=3)
        forest.fit(X_train, Y_train)
        prediction = forest.predict(X_test)
        report_metrics(Y_test, prediction, metric)
    elif classifier == 'TCN':
        # Step 1.7.2: Perform Classification using TCN. Subflowchart: TCN Subflowchart. Train and validate the network using TCN
        net_train_validate_TCN(args['net'], args['optimizer'], X_train, Y_train, X_test, Y_test, args['epochs'], args['batch_size'], args['lr'])
    elif classifier == 'LSTM':
        # Step 1.7.3: Perform Classification using LSTM. Subflowchart: LSTM Subflowchart. Train and validate the network using LSTM
        train_dataset = FPLSTMDataset(X_train, Y_train)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True, collate_fn=FPLSTM_collate)
        test_dataset = FPLSTMDataset(X_test, Y_test.values)
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args['batch_size'], shuffle=True, collate_fn=FPLSTM_collate)
        # The sample counts come from the function's own arguments; the former `Xtrain`
        # was an undefined name and raised a NameError.
        net_train_validate_LSTM(args['net'], args['optimizer'], train_loader, test_loader, args['epochs'], X_test.shape[0], X_train.shape[0], args['lr'])
    else:
        # Previously an unknown classifier name silently did nothing.
        raise ValueError(
            "Unknown classifier {!r}; expected 'RandomForest', 'TCN' or 'LSTM'".format(classifier)
        )

