### Prerequisites

In [1]:
# run if you're unsure if your packages are up to date.
import sys
!{sys.executable} -m pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting scipy>=1.13.1 (from -r requirements.txt (line 5))
  Using cached scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting sklearn (from -r requirements.txt (line 6))
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag

In [2]:
import pandas as pd
import os
import glob
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [3]:
data_directory = 'Data/Cycling/'
# Iterate through every folder and sub-folder in the specified directory looking for text files.
files = glob.iglob(data_directory +'**/*.txt', recursive=True)
Cycling_Tests = dict()

# Convert the csv files into pandas dataframes, and store in a dictionary with (Key=cell_date):(dataframe)
def retrieve_files():
    dataset = dict()
    for f in files:
        columns = ['time/s', 'Ecell/V', 'I/mA', 'Temperature/°C', 'cycle number']
        temp_df = pd.read_csv(f, sep='\t', usecols=columns, encoding='ISO-8859-1')
        dataset[os.path.basename(f)[3:17]] = temp_df # trims the key for use in the dictionary
        print(f'Successfully imported test {os.path.basename(f)[3:17]} with shape {temp_df.shape}')
    return dataset

Cycling_Tests = retrieve_files()
print("Done")

Successfully imported test NX001_21-08-23 with shape (269049, 5)
Successfully imported test NX001_31-07-23 with shape (120655, 5)
Successfully imported test NX002_21-08-23 with shape (268652, 5)
Successfully imported test NX002_31-07-23 with shape (120808, 5)
Successfully imported test NX006_21-08-23 with shape (207146, 5)
Successfully imported test NX006_31-07-23 with shape (120806, 5)
Successfully imported test RS001_21-08-23 with shape (228545, 5)
Successfully imported test RS001_31-07-23 with shape (120465, 5)
Successfully imported test RS006_21-08-23 with shape (229161, 5)
Successfully imported test RS006_31-07-23 with shape (120383, 5)
Successfully imported test SG003_07-02-23 with shape (693003, 5)
Successfully imported test SG004_07-02-23 with shape (227678, 5)
Successfully imported test SG007_07-02-24 with shape (645535, 5)
Successfully imported test SG007_21-08-23 with shape (460741, 5)
Successfully imported test SG007_31-07-23 with shape (115995, 5)
Successfully imported tes

### Utility Functions

In [4]:
def add_dates(dataset):
    """
    Copies date from key to a column in the dataframe.
    Args:
    dataset (dict): Contains {Key with date of cell test}:{Data from test}
    """
    # For test in collection of tests
    for key, df in dataset.items():
        # Add the end of the key (which just contains the date) to the column "date"
        df['date'] = pd.to_datetime(key[6:]) 
    return dataset

def filter_dataset(dataset, partial_key):
    """
    Returns a subset of the dataset containing only items matching the key specified.
    Args:
    dataset (dict): The dataset to filter.
    partial_key (str): The partial key (usually cell name) to match.
    """
    filtered_dataset = dict()
    # iterate over all items in the dataset
    for key, v in dataset.items():
        if partial_key in key: # eg. "SG009" would be in SG009_23-04-23
            filtered_dataset[key] = v # Add that entry to the filtered dataset
    return filtered_dataset

def combine_tests(dataset):
    """
    Returns a single dataframe with all tests performed on that cell.
    Cycle numbers and times are updated to reflect "total time spent under test conditions".
    This neglects any time elapsed between tests.
    Args:
    dataset (dict): Contains only tests (key:dataset pairs) performed on the same cell.
    """
    # Pulls the list of keys from the dictionary, extracts the test date and sorts in chronological order.
    sorted_keys = sorted(dataset.keys(), key=lambda x: pd.to_datetime(x[6:])) 
    # there's probably a better way of doing this, but since the codes contain the date might as well do it this way.
    dataframes_temp = []
    cumulative_time = 0
    cumulative_cycles = 0
    # for every test performed on the cell
    for key in sorted_keys:
        df = dataset[key] # copy the dataframe to the temp variable
        df['time/s'] = df['time/s'] + cumulative_time # update the time to reflect elapsed test time.
        df['cycle number'] = df['cycle number'] + cumulative_cycles # update the cycle no. to reflect elapsed cycles.
        cumulative_time = df['time/s'].max() # set the cumulative time for the next iteration.
        cumulative_cycles = df['cycle number'].max() # set the cumulative cycles to the next iteration.
        dataframes_temp.append(df) # copy this ammended dataframe to the temp list.
    combined_df = pd.concat(dataframes_temp, ignore_index=True) # merge all of the dataframes in the temp list into a single dataframe.
    return combined_df

def add_capacity(df):
    """
    Calculates the cycle capacity in mAh and mWh for a cell test and adds extra columns to the dataframe.
    Returns appended dataframe.
    """
    # Retrospectively, I've realised this is ridiculously inefficient.
    # Rather than merge the dataset, I could have just plotted the capacity vs cycle number directly from this...
    # Potentially saving a few hundred mb of RAM.
    grouped = df.groupby('cycle number')
    capacity = []
    # group the dataset by cycle number
    for cycle, group in grouped: # I'm kind of amazed that this works....
        # Having to debug this to know how/why/what it is doing is why I dislike implicit programming languages.
        group = group[group['I/mA'] < -400] # filter down the group to just the times which are discharging and exclude discharges that aren't tests.
        # discharge tests occur at 500mA minimum.
        times = group['time/s'].values
        currents = group['I/mA'].values
        voltage = group['Ecell/V'].values
        # Take the total dicharge over the cycle * time = capacity in mAh
        capacity_mAh = -np.trapz(currents, times)/3600.0 
        # this uses the composite trapezoidal rule, which is apparently more accurate than a simple trapezoidal.
        # I have no idea what the difference is.
        capacity_mWh = -np.trapz(currents*voltage, times)/3600.0 
        # I originally tried capacity_mAh * voltage. This doesn't work and creates massive and very wrong datasets.
        capacity.append((cycle, capacity_mAh, capacity_mWh))
    # Merge the capacity vs cycle number and the main dataset.
    capacity_df = pd.DataFrame(capacity, columns=['cycle number', 'capacity/mAh', 'capacity/mWh'])
    df = df.merge(capacity_df, on='cycle number', how='left')
    return df

def add_capacity_dataset(dataset):
    """"
    Adds the capacity column for all items in the dataset.
    Returns updated dataset.
    """
    for key, df in dataset.items():
        df = add_capacity(df) # calculate capacity for the test..
        dataset[key] = df # update this test with the calculated values.
    return dataset

def ListCells(dataset):
    """"
    Lists all unique cell names (cell)(id) in the dataset.
    """
    cells = []
    for item in dataset.keys():
        cell_name = item[0:5]
        if not(cell_name in cells):
            cells.append(cell_name)
    return cells

def ListCellTypes(dataset):
    """"
    Lists all unique cell types in the dataset.
    """
    cells = []
    for item in dataset.keys():
        cell_name = item[0:2]
        if not(cell_name in cells):
            cells.append(cell_name)
    return cells

### Plotting Functions

In [5]:
# this could be made less redundant and more efficient.
def plot_capacity_Wh_allcycles_tohtml(cycle_data, showfig=False):
    """
    Exports plots to HTML (Plot remains interactive). ShowFig outputs to Jupyter, but is slow.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_capacity_Wh_allcycles(cycle_data, showfig)
    fig.write_html(data_directory+"/Output/"+cycle_data.name+"_capacity_Wh.html")

def plot_capacity_Wh_allcycles(cycle_data, showfig=True):
    """
    Returns a pyplot figure with Capacity in mWh against cycle number.
    """
    fig = px.line(cycle_data, x=cycle_data['cycle number'], y=cycle_data['capacity/mWh'], color='date')
    if(showfig): fig.show()
    return fig


def plot_capacity_allcycles_tohtml(cycle_data, showfig=False):
    """
    Exports plots to HTML (Plot remains interactive). ShowFig outputs to Jupyter, but is slow.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_capacity_allcycles(cycle_data, showfig)
    fig.write_html(data_directory+"/Output/"+cycle_data.name+"_capacity.html")

def plot_capacity_allcycles_tosvg(cycle_data, showfig=False):
    """
    Exports plots to SVG (Static Vector Plot), ShowFig outputs to Jupyter, but is slow.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_capacity_allcycles(cycle_data, showfig)
    fig.write_image(data_directory+"/Output/"+cycle_data.name+"_capacity.svg")

def plot_capacity_allcycles_topdf(cycle_data, showfig=False):
    """
    Exports plots to PDF (Static Vector Plot), ShowFig outputs to Jupyter, but is slow.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_capacity_allcycles(cycle_data, showfig)
    fig.write_image(data_directory+"/Output/"+cycle_data.name+"_capacity.pdf")

def plot_capacity_allcycles_topng(cycle_data, showfig=False):
    """
    Exports plots to PNG (Static Image Plot), ShowFig outputs to Jupyter.
    """
    fig = plot_capacity_allcycles(cycle_data, showfig)
    fig.write_image(data_directory+"/Output/"+cycle_data.name+"_capacity.png")

def plot_capacity_allcycles(cycle_data, showfig=True):
    """
    Returns a pyplot figure with Capacity in mAh against cycle number.
    """
    fig = px.line(cycle_data, x=cycle_data['cycle number'], y=cycle_data['capacity/mAh'], color='date')
    if(showfig): fig.show()
    return fig

def plot_capacity_celltype_tohtml(cycle_data, type_name, showfig=False):
    """
    Exports plots to HTML (Plot remains interactive). ShowFig outputs to Jupyter, but is slow.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_capacity_celltype(cycle_data, showfig)
    fig.write_html(data_directory+"/Output/"+type_name+"_capacity.html")

def plot_capacity_celltype(capacity_data, showfig=False):
    """
    Outputs a plot with all tests performed on a given cell type.
    Input is a list of pyplot traces.
    """
    fig = go.Figure()
    for df in capacity_data:
        fig.add_trace(go.Line(x=df['cycle number'], y=df['capacity/mWh'], mode='lines'))
    fig.update_xaxes(title_text="Cycle Nmber")
    fig.update_yaxes(title_text="Capacity/mWh")
    fig.update_layout(title_text='Capacity vs. Cycle Number for Multiple Cells')
    if(showfig): fig.show()
    return fig

def output_capacity_Wh_allcycles(cycle_data, cell_name):
    """
    Returns a pyplot trace to be included in a plot with Capacity in mWh against cycle number.
    """
    string = 'capacity/mAh for cell +' + cell_name
    fig= go.Line(x=cycle_data['cycle number'], y=cycle_data['capacity/mAh'], mode='lines', name=string)
    return fig

def plot_cell_voltage(cycle_data):
    """
    Outputs an interactive plot of cell voltage with time. Cycle number indicated by colour.
    """
    fig = px.line(cycle_data, x=cycle_data['time/s'], y=cycle_data['Ecell/V'], color='cycle number')
    fig.update_xaxes(rangeslider_visible=True)
    fig.show()

def plot_cell_voltage_current(cycle_data, showfig=True):
    """
    Returns an interactive plot of voltage and current.
    """
    fig_voltage = px.line(cycle_data, x=cycle_data['time/s'], y=cycle_data['Ecell/V'], color='cycle number')
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    for trace in fig_voltage.data:
        fig.add_trace(trace, secondary_y=False)
    fig.add_trace(go.Line(x=cycle_data['time/s'], y=cycle_data['I/mA']*0.001, mode='lines', name='I/A'), secondary_y=True)
    fig.update_xaxes(rangeslider_visible=True)
    if(showfig): fig.show()
    return fig

def plot_cell_voltage_current_tohtml(cycle_data, showfig=False):
    """
    Outputs an interactive plot of cell voltage + current with time to a HTML file. Cycle number indicated by colour.
    ShowFig enables output to Jupyter.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_cell_voltage_current(cycle_data, showfig)
    fig.write_html(data_directory+"/Output/"+cycle_data.name+"_cycling.html")

def plot_cell_voltage_current_topng(cycle_data, showfig=False):
    """
    Outputs plot of cell voltage + current with time to a png file. Cycle number indicated by colour.
    ShowFig enables output to Jupyter.
    """
    # DO NOT USE ON LARGE DATASETS > 100k points
    fig = plot_cell_voltage_current(cycle_data, showfig)
    fig.write_image(data_directory+"/Output/"+cycle_data.name+"_cycling.png")

# add temperature plots.

In [6]:
# ONLY RUN ONCE!! CURRENTLY BUGGED WHEN RUNNING MULTIPLE TIMES AS DUPLICATES THE CAPACITY COLUMN
Cycling_Tests = add_dates(Cycling_Tests) # Add test dates in column to all data.
Cycling_Tests = add_capacity_dataset(Cycling_Tests) # Add capacity in mAh and mWh to all data.

In [7]:
"""
I've had the realisation that an easier way of doing this 
would've been to merge datasets in the same folder, since they're already separated by the structure...

but hey ho, it's been done this way. c'est la vie.
I guess the advantage is that it scales even if more cells get added in the future, and you don't need to care about folder structure.
"""
# there's a bug where when this is run multiple times sometimes the combine tests code will just add more cycles than it should...
cell_types = dict()
for cell_type in ListCellTypes(Cycling_Tests):
    cell_types[cell_type]=[]
    
for cell_name in ListCells(Cycling_Tests):
    cell = filter_dataset(Cycling_Tests, cell_name) # cell is recast from a string, to the dataset belonging to that cell
    cell = combine_tests(cell)
    cell.name = cell_name
    cell_types[cell_name[0:2]].append(cell)
    # uncomment when you want to plot these.
    #plot_capacity_allcycles_tohtml(cell)
    #plot_cell_voltage_current_tohtml(cell)
    #plot_capacity_allcycles_topng(cell)
    #plot_capacity_Wh_allcycles_tohtml(cell)

for cell_type in cell_types:
    dataset = []
    for cell in cell_types[cell_type]:
        dataset.append(cell)
    plot_capacity_celltype_tohtml(dataset, cell_type)



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [8]:
for cell_type in ListCellTypes(Cycling_Tests):
    # for each cell type
    all_tests_cell = []
    type = filter_dataset(Cycling_Tests, cell_type)

# TBD


# only thing its missing is the cell name in the legend...


In [9]:
# Have a think about whether this should be output to a separate processed folder or read from and sent back to the database.
# need to split the data to only plot 1/10th of points it plotting against time.

## Plotting capacity against cycles

# save this for later when plotting.

In [10]:
#Don't use these fns... too slow

def resample(df, time_column, rule):
    df[time_column] = pd.to_datetime(df[time_column], unit='s')
    return df.resample(rule, on=time_column).mean().reset_index()

def plot_all_cycles(df, battery_name):
    #sample the data as otherwise it takes forever to plot

    df_sampled = resample(df, 'time/s', '30s')
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=df['time/s'], y=df_sampled['Ecell/V'], mode='lines', name='Ecell/V'))
    fig.add_trace(go.Scatter(x=df['time/s'], y=df_sampled['I/mA'], mode='lines', name='I/mA'))
    fig.add_trace(go.Scatter(x=df['time/s'], y=df_sampled['Temperature/°C'], mode='lines', name='Temperature/°C'))

    cycles = df['cycle number'].unique()

    # Add shaded areas for each cycle
    for cycle in cycles:
        cycle_data = df[df['cycle number'] == cycle]
        cycle_start = cycle_data['time/s'].min()
        cycle_end = cycle_data['time/s'].max()

        fig.add_shape(
            type='rect',
            x0=cycle_start,
            x1=cycle_end,
            y0=0,
            y1=1, 
            yref='paper',
            fillcolor='LightSlateGrey' if cycle % 2 == 0 else 'LightSkyBlue',
            opacity=0.3,
            line_width=0,
        )

    # Customize layout
    fig.update_layout(
        title=f'Battery {battery_name}',
        xaxis_title='Time (s)',
        yaxis_title='Value',
        template='plotly_white',
        shapes=[],
    )

    # Show the plot
    fig.show()

# Archive

In [11]:
#Function to plot specified cycle
def plot_cycle(df, battery_name, cycle_number, show_temp):
    
    cycle_data = df[(df['cycle number'] == cycle_number)]

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['Ecell/V'], mode='lines', name='Ecell/V'), secondary_y=False)
    fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['I/mA']*0.001, mode='lines', name='I/A'), secondary_y=True)
    if show_temp == True:
        fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['Temperature/°C'], mode='lines', name='Temperature/°C'))

    fig.update_layout(title=f'Cycle {cycle_number} for Battery {battery_name}', xaxis_title='Time (s)', yaxis_title='Value')
    fig.update_yaxes(title_text="Cell Voltage (V)", secondary_y=False)
    fig.update_yaxes(title_text="Current (A)", secondary_y=True)
    fig.show()

#Plot the specified cycle and the two cycles that come after it
#todo rewrite this, i don't like it.
#todo use the define y function to allow it to display temp, voltage and current on the same chart or alternatively use subplots if it works better
def plot_3_cycles_cyclenum(df, battery_name, cycle_number, show_temp):
    
    cycle_data = df[(df['cycle number'] == cycle_number) | (df['cycle number'] == cycle_number + 1) | (df['cycle number'] == cycle_number + 2)]
    fig = px.line(cycle_data, x=cycle_data['time/s'], y=cycle_data['Ecell/V'], color='cycle number')
    #fig = make_subplots(specs=[[{"secondary_y": True}]])
    #fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['Ecell/V'], mode='lines', name='Ecell/V'), secondary_y=False)
    fig.add_trace(go.Line(x=cycle_data['time/s'], y=cycle_data['I/mA']*0.001, name='I/A'))
    #if show_temp == True:
    #    fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['Temperature/°C'], mode='lines', name='Temperature/°C'))
    fig.update_layout(title=f'Cycle {cycle_number}, {cycle_number+1} and {cycle_number+2} for Battery {battery_name}', xaxis_title='Time (s)')
    fig.update_yaxes(title_text="Cell Voltage (V)", secondary_y=False)
    fig.update_yaxes(title_text="Current (A)", secondary_y=True)
    # I spent a fair few hours here attempting to use an iterative or other method to highlight at what point the cycle number changed.
    # This is important, as I need to know exactly how the cycle number was determined to be able to make accurate capacity calculations.
    # Then I realised there's a far simpler solution. Just change the colour based on cycle number.
    # For a small number of cycles, this is sufficient, but won't work for plotting large cycle counts as you soon run out of distinguishable colours.
    # ISSUE: I can't get this working without using plotly express.
    # haven't tried the below method but it may work.
    #for cycle in df['cycle number'].unique():
        #cycle_df = df[df['cycle number'] == cycle]
        #fig.add_trace(go.Scatter(
        #   x=species_df['[insert plotted value]'],
        #   y=species_df['[insert value]'],
        #   mode='markers',
        #   name=cycle,
        #   marker=dict(size=10, symbol='circle'),
    #))
    # alternatively, https://community.plotly.com/t/automatically-pick-colors-when-using-add-trace/59075
    fig.show()

def plot_3_cycles(df, battery_name, cycle_number, show_temp):
    
    cycle_data = df[(df['cycle number'] == cycle_number) | (df['cycle number'] == cycle_number + 1) | (df['cycle number'] == cycle_number + 2)]

    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['Ecell/V'], name='Ecell/V'), secondary_y=False)
    fig.add_trace(go.Scatter(x=cycle_data['time/s'], y=cycle_data['I/mA']*0.001, mode='lines', name='I/A'), secondary_y=True)
    if show_temp == True:
        fig.add_trace(go.Scatter(x=cycle_data['cycle number'], y=cycle_data['Temperature/°C'], mode='lines', name='Temperature/°C'))
    fig.update_layout(title=f'Cycle {cycle_number}, {cycle_number+1} and {cycle_number+2} for Battery {battery_name}', xaxis_title='Time (s)')
    fig.update_yaxes(title_text="Cell Voltage (V)", secondary_y=False)
    fig.update_yaxes(title_text="Current (A)", secondary_y=True)

    fig.show()

stuff that'll come in handy later
https://towardsdatascience.com/resample-function-of-pandas-79b17ec82a78
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html
blog post on decimating data
https://www.geeksforgeeks.org/python-pandas-dataframe-resample/