## Download Data

Our work used the open source dataset introduced in ['Data-driven prediction of battery cycle life before capacity degradation' by K.A. Severson, P.M. Attia, et al.](https://www.nature.com/articles/s41560-019-0356-8). This notebook is a modified data processing tool based on the [original code](https://github.com/rdbraatz/data-driven-prediction-of-battery-cycle-life-before-capacity-degradation).

In [None]:
import requests
import os

# Download URLs and the local path each one is saved to (matched by position).
urls = ["https://data.matr.io/1/api/v1/file/5c86c0b5fa2ede00015ddf66/download",
        "https://data.matr.io/1/api/v1/file/5c86bf13fa2ede00015ddd82/download",
        "https://data.matr.io/1/api/v1/file/5c86bd64fa2ede00015ddbb2/download"]

output_paths = ["data/2017-05-12_batchdata_updated_struct_errorcorrect.mat",
               "data/2017-06-30_batchdata_updated_struct_errorcorrect.mat",
               "data/2018-04-12_batchdata_updated_struct_errorcorrect.mat"]

os.makedirs("./data/", exist_ok=True)

for url, path in zip(urls, output_paths):
    if os.path.isfile(path):
        print(f"{path.replace('data/','')} is already downloaded in the data folder")
        continue

    # Stream into a temporary ".part" file and rename it only once the whole
    # body has been written. Otherwise an interrupted download would leave a
    # truncated file at `path`, and the isfile() check above would skip it
    # on the next run.
    partial_path = path + ".part"

    # timeout=(connect, read) in seconds: the read timeout applies to each
    # socket read, not to the whole transfer, so large files still work.
    # The `with` block releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Failed to download file. Status code: {response.status_code}")
            continue

        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)

    os.replace(partial_path, path)
    print(f"Download completed: {path}")


## Convert to PKL

This code converts the raw MATLAB (`.mat`) data to pkl files with the features that will be used later for training.

In [None]:
def process_batch(f, batch, batch_num):
    """Convert one batch struct into a nested dict of per-cell data.

    Args:
        f: Opened data file; it is indexed with the references stored in
            ``batch`` to fetch the actual values.
        batch: The batch struct, providing ``'summary'``, ``'cycle_life'``,
            ``'policy_readable'`` and ``'cycles'`` reference tables with one
            row per cell.
        batch_num: Batch number embedded in the generated cell keys.

    Returns:
        A dict mapping ``'b{batch_num}c{i}'`` to a dict with the keys
        ``'cycle_life'``, ``'charge_policy'``, ``'summary'`` (per-cycle
        arrays) and ``'cycles'`` (per-cycle dicts of arrays, keyed by the
        cycle index as a string).
    """
    # (output key, field name in the source file), in output order.
    summary_fields = (
        ('IR', 'IR'),
        ('QC', 'QCharge'),
        ('QD', 'QDischarge'),
        ('Tavg', 'Tavg'),
        ('Tmin', 'Tmin'),
        ('Tmax', 'Tmax'),
        ('chargetime', 'chargetime'),
        ('cycle', 'cycle'),
    )
    cycle_fields = (
        ('I', 'I'),
        ('Qc', 'Qc'),
        ('Qd', 'Qd'),
        ('Qdlin', 'Qdlin'),
        ('T', 'T'),
        ('Tdlin', 'Tdlin'),
        ('V', 'V'),
        ('dQdV', 'discharge_dQdV'),
        ('t', 't'),
    )

    cells = {}
    for cell_idx in range(batch['summary'].shape[0]):
        cycle_life = f[batch['cycle_life'][cell_idx, 0]][()]

        # Keep every second byte of the raw policy buffer before decoding
        # (presumably the text is stored as 2-byte characters; verify
        # against the data file).
        raw_policy = f[batch['policy_readable'][cell_idx, 0]][()]
        charge_policy = raw_policy.tobytes()[::2].decode()

        summary_src = f[batch['summary'][cell_idx, 0]]
        summary = {
            out_key: np.hstack(summary_src[src_key][0, :].tolist())
            for out_key, src_key in summary_fields
        }

        cycles_src = f[batch['cycles'][cell_idx, 0]]
        per_cycle = {}
        for cycle_idx in range(cycles_src['I'].shape[0]):
            per_cycle[str(cycle_idx)] = {
                out_key: np.hstack(f[cycles_src[src_key][cycle_idx, 0]][()])
                for out_key, src_key in cycle_fields
            }

        cells[f'b{batch_num}c{cell_idx}'] = {
            'cycle_life': cycle_life,
            'charge_policy': charge_policy,
            'summary': summary,
            'cycles': per_cycle,
        }
    return cells

In [None]:
import h5py
import scipy.io
import numpy as np
import pickle
import os
import csv

local_DIR = './data/'

matFilename_batch1 = os.path.join(local_DIR, '2017-05-12_batchdata_updated_struct_errorcorrect.mat')
matFilename_batch2 = os.path.join(local_DIR, '2017-06-30_batchdata_updated_struct_errorcorrect.mat')
matFilename_batch3 = os.path.join(local_DIR, '2018-04-12_batchdata_updated_struct_errorcorrect.mat')

# Open each file explicitly read-only (the default mode has differed between
# h5py versions) and close it as soon as its batch has been converted.
# process_batch builds its result with np.hstack / .decode(), so the returned
# dicts are used after the file is closed without holding the handle open.
with h5py.File(matFilename_batch1, 'r') as f_batch1:
    bat_dict1 = process_batch(f_batch1, f_batch1['batch'], 1)
print("Batch 1/3 complete", end="\r")

with h5py.File(matFilename_batch2, 'r') as f_batch2:
    bat_dict2 = process_batch(f_batch2, f_batch2['batch'], 2)
print("Batch 2/3 complete", end="\r")

with h5py.File(matFilename_batch3, 'r') as f_batch3:
    bat_dict3 = process_batch(f_batch3, f_batch3['batch'], 3)
print("Batch 3/3 complete", end="\r")

In [None]:
# Persist each converted batch as a pickle so later cells can reload it
ROOT_DIR = 'data/'

if not os.path.exists(ROOT_DIR):
    os.makedirs(ROOT_DIR)

# (file name, converted batch), written in batch order.
for pkl_name, converted_batch in (
    ('batch1.pkl', bat_dict1),
    ('batch2.pkl', bat_dict2),
    ('batch3.pkl', bat_dict3),
):
    with open(os.path.join(ROOT_DIR, pkl_name), 'wb') as fp:
        pickle.dump(converted_batch, fp)

## Data Cleaning

We replicate the data cleaning from the [original notebook](https://github.com/rdbraatz/data-driven-prediction-of-battery-cycle-life-before-capacity-degradation/blob/master/Load%20Data.ipynb) below.

In [None]:
import pickle
import numpy as np
import pandas as pd
import csv 

ROOT_DIR = 'data/'

# NOTE: pickle.load executes arbitrary code from the file. These pickles are
# the ones written by the conversion step of this notebook; do not point
# ROOT_DIR at untrusted files.
# Each file is opened in a `with` block so the handle is closed after loading.
with open(os.path.join(ROOT_DIR, 'batch1.pkl'), 'rb') as fp:
    batch1 = pickle.load(fp)
# remove batteries that do not reach 80% capacity
for cell_key in ('b1c8', 'b1c10', 'b1c12', 'b1c13', 'b1c22'):
    del batch1[cell_key]

with open(os.path.join(ROOT_DIR, 'batch2.pkl'), 'rb') as fp:
    batch2 = pickle.load(fp)

# There are five cells from batch1 that carried into batch2, we'll remove the data from batch2
# and put it with the correct cell from batch1.
# batch1_keys[k] is continued by batch2_keys[k]; add_len[k] is the number of
# cycles added to that cell's cycle life.
batch2_keys = ['b2c7', 'b2c8', 'b2c9', 'b2c15', 'b2c16']
batch1_keys = ['b1c0', 'b1c1', 'b1c2', 'b1c3', 'b1c4']
add_len = [662, 981, 1060, 208, 482]
for bk, b2k, extra_cycles in zip(batch1_keys, batch2_keys, add_len):
    first_part = batch1[bk]
    continuation = batch2[b2k]

    first_part['cycle_life'] = first_part['cycle_life'] + extra_cycles

    for j in first_part['summary'].keys():
        appended = continuation['summary'][j]
        if j == 'cycle':
            # Shift the continuation's cycle numbers so they follow on from
            # the cycles already recorded for the batch1 cell.
            appended = appended + len(first_part['summary'][j])
        first_part['summary'][j] = np.hstack((first_part['summary'][j], appended))

    # Append the continuation's per-cycle data under consecutive string keys.
    last_cycle = len(first_part['cycles'])
    for j, jk in enumerate(continuation['cycles'].keys()):
        first_part['cycles'][str(last_cycle + j)] = continuation['cycles'][jk]

# The continuation cells now live in batch1; drop them from batch2.
for b2k in batch2_keys:
    del batch2[b2k]

with open(os.path.join(ROOT_DIR, 'batch3.pkl'), 'rb') as fp:
    batch3 = pickle.load(fp)
# remove noisy channels from batch3
for cell_key in ('b3c37', 'b3c2', 'b3c23', 'b3c32', 'b3c42', 'b3c43'):
    del batch3[cell_key]

bat_dict = {**batch1, **batch2, **batch3}

# write metadata out to disk: one (battery id, cycle life, charge policy) row
# per remaining cell; cycle_life is indexed [0][0] to get its single value.
metadata = [(bat, bat_dict[bat]['cycle_life'][0][0], bat_dict[bat]['charge_policy']) for bat in bat_dict.keys()]
filename = os.path.join(ROOT_DIR, 'metadata.csv')
with open(filename, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(metadata)

## Create Battery Pkl files

The code below concatenates the charging and discharging segments of each battery's cycle to create a continuous capacity curve. It then uses a basic derivative to classify each point as either charging or discharging and adds a new feature, dV/dt. The charging and discharging segments of each battery are written to separate directories.

In [None]:
import pandas as pd
import os

C_ROOT_DIR = "data/charging/"
D_ROOT_DIR = "data/discharging/"

os.makedirs(C_ROOT_DIR, exist_ok=True)
os.makedirs(D_ROOT_DIR, exist_ok=True)

# Battery id -> DataFrame of its charging / discharging points.
C_bat_dfs = {}
D_bat_dfs = {}

count = 0

for bat in list(bat_dict.keys()):
    cell = bat_dict[bat]
    ir_per_cycle = cell['summary']["IR"]

    data_for_df = []
    for cycle_num in range(len(cell['cycles'])):
        cyc = cell['cycles'][str(cycle_num)]
        current = cyc['I']
        voltage = cyc['V']
        temperature = cyc['T']
        timestamps = cyc['t']
        n_points = len(current)

        # Find the first sample where the current is negative and is still
        # negative 1, 2 and 10 samples later; this is taken as the switch from
        # charging to discharging. -1 means no such sample was found.
        idx_switch = -1
        for idx in np.where(current < 0)[0]:
            if idx + 10 >= n_points:
                # Too close to the end of the cycle to look 10 samples ahead.
                # Candidates are ascending, so no later one can qualify;
                # stop instead of indexing past the end of the array.
                break
            if current[idx + 1] < 0 and current[idx + 2] < 0 and current[idx + 10] < 0:
                idx_switch = idx
                break

        # Continuous capacity curve: charge capacity up to the switch, then
        # the charge capacity just before the switch minus the discharge
        # capacity from the switch onwards.
        capacity = np.concatenate((
            cyc['Qc'][:idx_switch],
            cyc['Qc'][idx_switch - 1] - cyc['Qd'][idx_switch:],
        ))

        # each point that is recorded in a given cycle
        for point_idx in range(n_points):
            data_for_df.append([
                current[point_idx],
                voltage[point_idx],
                temperature[point_idx],
                timestamps[point_idx],
                capacity[point_idx],
                cycle_num,
                ir_per_cycle[cycle_num],  # add IR
            ])

    df = pd.DataFrame(data=data_for_df, columns=['I', 'V', 'T', 't', 'Q', 'c', 'IR'])

    # add dVoltage/dTime
    # NOTE: this is the voltage difference over 10 samples (not divided by the
    # elapsed time), computed over the battery's whole concatenated series.
    time_index = df['t'].to_numpy()
    V_ts = pd.Series(df['V'].to_numpy(), index=time_index)
    charge_ts = pd.Series(df['Q'].to_numpy(), index=time_index)

    dV_dt = V_ts.diff(periods=10).fillna(0).values

    # Sample-to-sample capacity change; missing values (including the first
    # sample) are filled with 1 so they count as charging.
    dQ_dt = charge_ts.diff(periods=1).fillna(1).values

    # Label each point: 1 = capacity rising (charging), 0 = capacity falling
    # (discharging); a zero change inherits the previous point's label.
    charge_discharge = []
    prev_label = -1
    for val in dQ_dt:
        if val > 0:
            prev_label = 1
        elif val < 0:
            prev_label = 0
        charge_discharge.append(prev_label)

    df['charging'] = charge_discharge
    df['dV/dt'] = dV_dt

    # Split with boolean masks instead of a row-by-row iterrows() loop.
    # iterrows() upcast every row to float, so the frames saved previously
    # were all-float with a fresh 0..n-1 index; astype/reset_index keep the
    # saved frames in that same form.
    charge_df = df[df['charging'] == 1].astype('float64').reset_index(drop=True)
    discharge_df = df[df['charging'] == 0].astype('float64').reset_index(drop=True)

    charge_df.to_pickle(os.path.join(C_ROOT_DIR, f'{bat}.pkl'))
    discharge_df.to_pickle(os.path.join(D_ROOT_DIR, f'{bat}.pkl'))

    C_bat_dfs[bat] = charge_df
    D_bat_dfs[bat] = discharge_df

    count += 1
    print(f'Processed {count}/{len(bat_dict.keys())} batteries', end="\r")

The plot below gives a visual intuition for how the battery data is split using the derivative.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Battery and cycle to visualise. Change only these two values to inspect a
# different curve (for example one of 'b1c32', 'b2c33', 'b2c47'); the data
# selection and the plot title both follow them.
bat_id = 'b1c1'
target_cycle = 400

charge_df = C_bat_dfs[bat_id]
discharge_df = D_bat_dfs[bat_id]

# Filter by target cycle
charge_curve = charge_df[charge_df['c'] == target_cycle]['Q'].reset_index(drop=True)
discharge_curve = discharge_df[discharge_df['c'] == target_cycle]['Q'].reset_index(drop=True)

# Create x-values: the discharge points are drawn directly after the charge
# points so the two segments form one continuous curve.
transition_index = len(charge_curve)
charge_x = range(transition_index)
discharge_x = range(transition_index, transition_index + len(discharge_curve))

# Plot with different colors
plt.plot(charge_x, charge_curve, label='Charge (positive derivative)', color='blue')
plt.plot(discharge_x, discharge_curve, label='Discharge (negative derivative)', color='red')

plt.axvline(x=transition_index, color='black', linestyle='--', label='Transition Point')

plt.xticks([])
plt.ylabel('Q', labelpad=5)
plt.legend()

plt.title(f'Concatenated Charge/Discharge Curve for {bat_id} cycle {target_cycle}')
plt.show()


## Create Train, Validation, and Test Set

The below code creates the train, validation, and test set used in our paper. Other train/validation/test set splits are possible.

In [None]:
# Battery ids assigned to each split; the three groups do not overlap.
train_ids = ('b1c0', 'b1c1', 'b1c2', 'b1c36', 'b1c37', 'b2c17')
val_ids = ('b1c5', 'b1c33', 'b2c34')
test_ids = ('b1c32', 'b2c33', 'b2c47')

# Keep only ids that are present in the metadata, in metadata order.
available_bats = [entry[0] for entry in metadata]
train_bats = [bat for bat in available_bats if bat in train_ids]
val_bats = [bat for bat in available_bats if bat in val_ids]
test_bats = [bat for bat in available_bats if bat in test_ids]

print("Train: ", train_bats)
print("Validation: ", val_bats)
print("Test: ", test_bats)

## Write Train, Validation, and Test to Disk

Creates a combined DataFrame of the selected batteries; it does so separately for the charging and discharging data.

In [None]:
import os
import pandas as pd
def save_df(bats, name, ROOT_DIR):
    """Combine per-battery pickles into one DataFrame and pickle the result.

    Args:
        bats: Battery ids; each is read from ``<ROOT_DIR>/<bat>.pkl``.
        name: File stem of the combined output, ``<ROOT_DIR>/<name>.pkl``.
        ROOT_DIR: Directory holding the inputs and receiving the output.
    """
    frames = [pd.read_pickle(os.path.join(ROOT_DIR, f'{bat}.pkl')) for bat in bats]

    # Stack the frames in the given order and renumber the rows from 0.
    combined = pd.concat(frames)
    combined = combined.reset_index(drop=True)

    out_path = os.path.join(ROOT_DIR, f'{name}.pkl')
    combined.to_pickle(out_path)

In [None]:
C_ROOT_DIR = "data/charging/"
D_ROOT_DIR = "data/discharging/"

# (battery ids, output file stem) for each split, in write order.
split_outputs = (
    (train_bats, "train_df"),
    (val_bats, "val_df"),
    (test_bats, "test_df"),
)

# Write every split for the charging data first, then for the discharging data.
for segment_dir in (C_ROOT_DIR, D_ROOT_DIR):
    for split_bats, split_name in split_outputs:
        save_df(split_bats, split_name, segment_dir)