# README
This notebook is designed to extract life labels from batteries. We observe that some batteries have degraded to 82.5% State of Health (SOH) but not below 80% SOH. In the [Battery-ML](https://github.com/microsoft/BatteryML), the cycle life labels for these batteries were originally labeled as `cycle_number + 1`, which can introduce significant intrinsic errors in the labels.

To mitigate this issue, for batteries with an SOH between 82.5% and 80%, we use Linear Regression to estimate their life labels. Batteries with a minimum SOH no smaller than 82.5% are excluded from our datasets.

Note: This notebook is not applicable to `SMICH`. The life labels of `SMICH` are obtained from the linear interpolation based on the small-current Rest Performance Tests.

In [1]:
import pickle
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
import os

In [None]:
dataset_name = 'total_MICH'
dataset_root_path = '/data/trf/python_works/Battery-LLM/dataset'
dataset_path = f'{dataset_root_path}/{dataset_name}'
files = os.listdir(dataset_path)
need_keys = ['current_in_A', 'voltage_in_V', 'charge_capacity_in_Ah', 'discharge_capacity_in_Ah', 'time_in_s']

In [3]:
name_lables = {}
abadon_count = 0
for file_name in files:
    data = pickle.load(open(f'{dataset_path}/{file_name}', 'rb'))
    cycle_data = data['cycle_data']
    last_cycle = cycle_data[-1]
    if file_name.startswith('RWTH'):
        nominal_capacity = 1.85
    elif file_name.startswith('SNL_18650_NCA_25C_20-80'):
        nominal_capacity = 3.2
    else:
        nominal_capacity = data['nominal_capacity_in_Ah']
        print(nominal_capacity)
    SOC_interval = data['SOC_interval'] # get the charge and discharge soc interval
    SOC_interval = SOC_interval[1] - SOC_interval[0]
    last_cycle_soh = max(last_cycle['discharge_capacity_in_Ah']) / nominal_capacity / SOC_interval
    if last_cycle_soh >= 0.825:
        # [0.825, inf)
        # exclude this cell from the dataset
        abadon_count += 1
        continue
    elif last_cycle_soh > 0.8:
        # (0.8, 0.825)
        # Linear Regression based on the last 20 cycles to obtain the cycle life label
        regress_cycle_num = 20
        total_SOHs = []
        total_cycle_numbers = np.array([i+1 for i in range(len(cycle_data)-regress_cycle_num, len(cycle_data))])
        for correct_cycle_index, sub_cycle_data in enumerate(cycle_data[-regress_cycle_num:]):
            Qd = max(sub_cycle_data['discharge_capacity_in_Ah'])
            cycle_number = sub_cycle_data['cycle_number']
            soh = Qd / nominal_capacity / SOC_interval
            total_SOHs.append(soh)
        
        total_SOHs = np.array(total_SOHs).reshape(-1,1)
        linear_regressor = LinearRegression()
        linear_regressor.fit(total_SOHs, total_cycle_numbers)
        eol = linear_regressor.predict(np.array([0.80]).reshape(-1,1))[0]
        eol = int(eol)
    else:
        # (-inf, 0.8]
        eol, find_eol = None, False
        for correct_cycle_index, sub_cycle_data in enumerate(cycle_data):
            Qd = max(sub_cycle_data['discharge_capacity_in_Ah'])
            soh = Qd / nominal_capacity / SOC_interval
            if soh <= 0.8 and not find_eol:
                eol = correct_cycle_index + 1
                find_eol = True
                break
        # if not find_eol:
        #     # The end of life is not found in the battery
        #     eol = len(cycle_data) + 1
    name_lables[file_name] = eol
    print(file_name, eol)

print(f'Totally {len(name_lables)} batteries have life labels | {abadon_count} batteries are excluded.')
print(f'Labels are saved in {dataset_root_path}/{dataset_name}_labels.json')
if dataset_name == 'UL_PUR':
    with open(f'{dataset_root_path}/UL-PUR_labels.json', 'w') as f:
        json.dump(name_lables, f)
elif dataset_name == 'ZN_coin':
    with open(f'{dataset_root_path}/ZN-coin_labels.json', 'w') as f:
        json.dump(name_lables, f)
elif dataset_name == 'NA_coin':
    with open(f'{dataset_root_path}/NA-coin_labels.json', 'w') as f:
        json.dump(name_lables, f)
else:
    with open(f'{dataset_root_path}/{dataset_name}_labels.json', 'w') as f:
        json.dump(name_lables, f)
    
    

50.51223
49.69816
59.35708
59.37707
59.10328
59.10791
50.63432
60.17728
59.10292
59.03642
50.50464
58.81946
58.73618
58.09073
58.97648
59.49447
50.56625
60.41077
59.72155
59.17059
59.25583
50.11591
CALB_0_B188.pkl 208
60.39165
59.03297
50.3262
59.01923
50.22162
Totally 1 batteries have life labels | 26 batteries are excluded.
Labels are saved in /data/trf/python_works/Battery-LLM/dataset/CALB_labels.json
