# README
The timestamps recorded by the different facilities are complex, so we use linear interpolation to resample the records to a common sampling interval of 1 second.

In [13]:
import pandas as pd
import numpy as np
from scipy.interpolate import interp1d
from matplotlib.pyplot import MultipleLocator
import json
import time
import os
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import matplotlib
from scipy.integrate import trapz
import itertools
from collections import OrderedDict
from matplotlib import cm
from scipy.optimize import NonlinearConstraint
from scipy.signal import find_peaks, argrelextrema, peak_prominences, peak_widths
from scipy.stats import pearsonr


# Resample the electrochemical (EC) data

In [14]:
def convert_timeStamp_to_accumulatedSecond_EC(xs):
    """Turn an ``HH:MM:SS`` string into the float produced by ``time.mktime``.

    The string is parsed with ``time.strptime`` and the resulting struct is
    handed straight to ``time.mktime``; differences between two results are
    therefore differences in seconds.
    """
    return time.mktime(time.strptime(xs, "%H:%M:%S"))

def read_csv_without_duplicates(path, data_type):
    """Load one raw electrochemical record and normalise its time columns.

    Args:
        path: File to read. ``'LFP'`` files are read as tab-separated,
            GBK-encoded text; ``'LMO'`` files are read with ``pd.read_excel``.
        data_type: Either ``'LFP'`` or ``'LMO'``.

    Returns:
        A dataframe with rows sharing the same ``系统时间`` value removed
        (first occurrence kept), the three time columns parsed to datetimes,
        and ``工步序号`` cast to float.

    Raises:
        ValueError: If ``data_type`` is not one of the supported values
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # The two cyclers differ only in file format and timestamp layout.
    if data_type == 'LFP':
        system_format, clock_format = "%Y/%m/%d %H:%M:%S", "%H:%M:%S"
        df = pd.read_csv(path, encoding='gbk', sep='\t')
    elif data_type == 'LMO':
        system_format, clock_format = "%Y-%m-%d %H:%M:%S.%f", "%H:%M:%S.%f"
        df = pd.read_excel(path)
    else:
        raise ValueError(
            f"Unsupported data_type {data_type!r}; expected 'LFP' or 'LMO'."
        )

    df = df.drop_duplicates(['系统时间'])
    df['系统时间'] = pd.to_datetime(df['系统时间'], format=system_format)
    df['测试时间'] = pd.to_datetime(df['测试时间'], format=clock_format)
    df['步骤时间'] = pd.to_datetime(df['步骤时间'], format=clock_format)
    state_columns = ['工步序号']
    df[state_columns] = df[state_columns].astype(float)
    return df


def resample_df(df, data_type, timeon='系统时间', freq='1s'):
    """Resample an electrochemical record onto a regular time grid.

    A grid running from the earliest to the latest ``timeon`` value with
    spacing ``freq`` is merged into the record. Numeric columns are linearly
    interpolated along the time axis, state columns are forward-filled, and
    only the rows lying on the grid are returned.

    Args:
        df: Record with datetime columns ``timeon`` and ``测试时间`` plus the
            numeric/state columns required by ``data_type``.
        data_type: ``'LFP'`` or ``'LMO'``; selects which columns are
            interpolated, forward-filled and kept.
        timeon: Name of the datetime column that defines the grid.
        freq: Grid spacing, passed to ``pd.date_range``.

    Returns:
        A new dataframe restricted to the grid rows and the kept columns, with
        ``测试时间`` formatted as ``HH:MM:SS`` strings.

    Raises:
        ValueError: If ``data_type`` is not supported.
        AssertionError: If a step of the resampled ``测试时间`` is longer than
            the first step.
    """
    if data_type == 'LFP':
        numeric_columns = ['电流/A', '容量/Ah', '电压/V', '充电容量']
        state_columns = ['工步序号', '工步状态']
        kept_columns = numeric_columns + ['系统时间', '测试时间', '工步序号', '工步状态']
    elif data_type == 'LMO':
        numeric_columns = ['电流/A', '容量/Ah', '电压/V', '能量/Wh']
        state_columns = ['循环序号', '工步序号', '工作模式']
        kept_columns = numeric_columns + ['循环序号', '系统时间', '测试时间', '工步序号', '工作模式']
    else:
        raise ValueError(
            f"Unsupported data_type {data_type!r}; expected 'LFP' or 'LMO'."
        )

    helper = pd.DataFrame({
        timeon: pd.date_range(start=df[timeon].min(), end=df[timeon].max(), freq=freq)
    })
    # Reset the index so positional and label access agree after sorting.
    new_df = (
        pd.merge(df, helper, on=timeon, how='outer')
        .sort_values(timeon)
        .reset_index(drop=True)
    )

    # Interpolate against the timestamps themselves. method='linear' would
    # treat consecutive rows as equally spaced, which is wrong whenever a raw
    # sample does not fall exactly on the grid.
    new_df[numeric_columns] = (
        new_df.set_index(timeon)[numeric_columns]
        .interpolate(method='time')
        .to_numpy()
    )
    new_df[state_columns] = new_df[state_columns].ffill()

    # Rebuild the test clock as "first test time + elapsed system time".
    elapsed = new_df[timeon] - new_df[timeon].iloc[0]
    new_df['测试时间'] = new_df['测试时间'].iloc[0] + elapsed

    # Keep only the grid rows; copy so the assignments below hit a real frame.
    new_df = new_df.loc[new_df[timeon].isin(helper[timeon].values)].copy()

    # Check if the frequency is right: no step may exceed the first one.
    new_df['测试时间'] = pd.to_datetime(new_df['测试时间'], format="%Y-%m-%d %H:%M:%S")
    steps = np.diff(new_df['测试时间'].values)
    if steps.size and np.any(steps - steps[0] > np.timedelta64(0, 'ns')):
        raise AssertionError('Resampled 测试时间 is not evenly spaced.')

    new_df['测试时间'] = new_df['测试时间'].dt.strftime("%H:%M:%S")
    return new_df[kept_columns]

In [15]:
data_type = 'LFP' # change the data_type here: ['LMO', 'LFP']

# ---- Step 1: save the label (one capacity value per cell) ----
if data_type not in ('LFP', 'LMO'):
    raise Exception('Wrong data type!')

# LFP labels live in text files only; LMO labels may also come as Excel sheets.
label_suffixes = ('txt',) if data_type == 'LFP' else ('txt', '.xlsx')
files = [i for i in os.listdir(f'../{data_type}_raw_data/label/') if i.endswith(label_suffixes)]
file_label_dict = {}
last_columns = []
for file in files:
    if data_type == 'LMO':
        print(file)
        name = file.split('.')[0].split('_#')[1]
    else:
        name = file.split('.')[0].split('_')[0]

    if data_type == 'LFP' or file.endswith('.txt'):
        # Text export: the label is the largest value of the 放电容量 column.
        df = pd.read_csv(f'../{data_type}_raw_data/label/{file}', encoding='gbk', sep='\t')
        label = df['放电容量'].max()
    else:
        # Excel export: use the 恒流放电 rows only.
        df = pd.read_excel(f'../{data_type}_raw_data/label/{file}')
        discharge_df = df.loc[df['工作模式']=='恒流放电']
        label = discharge_df['容量/mAh'].max() * 0.001 # convert the unit to Ah
    file_label_dict[name] = label

with open(f'../{data_type}_raw_data/label/label.json', 'w') as f:
    json.dump(file_label_dict, f)
# the label is saved

# ---- Step 2: resample every raw EC record ----
no_this_work_keys = ['5','30','31','35','4','22','36','46','2','76','6','29','59','71','81','62','55','57','42','85'] if data_type == 'LMO' else []
raw_data_path = f'../{data_type}_raw_data/test/'
ec_suffix = '.txt' if data_type == 'LFP' else '.xlsx'
files = [i for i in os.listdir(raw_data_path) if i.endswith(ec_suffix)]
name_dfs = {}
for file in files:
    if data_type == 'LMO':
        # Cells listed in no_this_work_keys are skipped.
        key = file.split('.')[0]
        if key in no_this_work_keys:
            continue
    df = read_csv_without_duplicates(f'{raw_data_path}{file}', data_type)
    df = resample_df(df, data_type, timeon='系统时间')
    name_dfs[file] = df # record the resampled EC dfs
print(len(name_dfs))

192


# Resample the optical data according to the resampled EC data

In [16]:
def read_optical_without_duplicates(path, data_type, sequences, usecols):
    """Load a tab-separated optical log and parse its ``Time`` column.

    Args:
        path: File to read.
        data_type: ``'LFP'`` (a ``Time`` column followed by four ``channelN``
            columns and then ``sequences``) or ``'LMO'`` (a ``Time`` column
            followed by ``sequences``).
        sequences: Names given to the columns after the fixed leading ones.
        usecols: Column positions to load, passed to ``pd.read_csv``.

    Returns:
        A dataframe with rows sharing the same ``Time`` value removed (first
        occurrence kept) and ``Time`` parsed as
        ``%Y/%m/%d %H:%M:%S.%f`` datetimes.

    Raises:
        ValueError: If ``data_type`` is not supported (previously this
            surfaced as an ``UnboundLocalError``).
    """
    if data_type == 'LFP':
        leading = ['Time', 'channel1', 'channel2', 'channel3', 'channel4']
    elif data_type == 'LMO':
        leading = ['Time']
    else:
        raise ValueError(
            f"Unsupported data_type {data_type!r}; expected 'LFP' or 'LMO'."
        )

    # header=0 discards the file's own header row in favour of `names`.
    df = pd.read_csv(path, sep='\t', header=0, names=leading + sequences, usecols=usecols)
    df = df.drop_duplicates(['Time'])
    # Vectorised parse; same format string as the former per-row strptime.
    df['Time'] = pd.to_datetime(df['Time'], format='%Y/%m/%d %H:%M:%S.%f')
    return df

data_path = f'../{data_type}_raw_data/optical/'
optical_name_dfs = {}
file_names = os.listdir(data_path)
for file_name in file_names:
    # Work out the column layout of this file before reading it.
    if data_type == 'LFP':
        # LFP file names end in "_<first>_<last>.txt": one column per cell id.
        battery_number = ''.join(file_name.split('.txt')).split('_')[-2:]
        start_number = int(battery_number[0])
        end_number = int(battery_number[1])
        sequences = [str(i) for i in range(start_number, end_number+1)]
        usecols = list(range(13))
    elif data_type == 'LMO':
        sequences = [f'low_{i}' for i in range(1, 9)] + [f'w{i}' for i in range(1, 9)]
        usecols = list(range(17))
    df = read_optical_without_duplicates(f'{data_path}/{file_name}', data_type, sequences, usecols)
    print(file_name)

    # Split the shared file into one ('Time', 'optical') frame per cell.
    tmp = file_name.split('.')[0]
    if data_type == 'LFP':
        name_parts = tmp.split('_')
        start_number, end_number = name_parts[-2], name_parts[-1]
        for i in range(int(start_number), int(end_number)+1):
            tmp_df = df[['Time', str(i)]].rename(columns={str(i): 'optical'})
            optical_name_dfs[str(i)] = tmp_df # get the optical wavelength data of a cell
    elif data_type == 'LMO':
        # The n-th underscore-separated token of the name maps to column w<n>.
        keys = tmp.split('_')
        for tmp_index, key in enumerate(keys, start=1):
            if key in no_this_work_keys:
                continue
            column = f'w{tmp_index}'
            tmp_df = df[['Time', column]].rename(columns={column: 'optical'})
            optical_name_dfs[key] = tmp_df


optical_fiber_73_80.txt
optical_fiber_177_184.txt
optical_fiber_137_144.txt
optical_fiber_145_152.txt
optical_fiber_9_16.txt
optical_fiber_25_32.txt
optical_fiber_17_24.txt
optical_fiber_113_120.txt
optical_fiber_65_72.txt
optical_fiber_89_96.txt
optical_fiber_161_168.txt
optical_fiber_41_48.txt
optical_fiber_57_64.txt
optical_fiber_153_160.txt
optical_fiber_169_176.txt
optical_fiber_121_128.txt
optical_fiber_129_136.txt
optical_fiber_49_56.txt
optical_fiber_105_112.txt
optical_fiber_97_104.txt
optical_fiber_33_40.txt
optical_fiber_185_192.txt
optical_fiber_81_88.txt
optical_fiber_1_8.txt


In [17]:
# Overwrite the optical data of re-measured cells with the "refix" recordings.
if data_type == 'LFP':
    refix_optical_data_path = '../LFP_raw_data/optical_refix/'
    refix_optical_files = os.listdir(refix_optical_data_path)
    fixed_names = ['Time', 'channel1','channel2','channel3','channel4']
    for file in refix_optical_files:
        key = file.split('.')[0]
        # Find the ids of the re-measured cells and the matching column count.
        if file == 'optical_fiber_31_NA1_NA2_NA3_NA4_NA5_NA6_NA7.txt':
            # Only cell 31 is read from this file.
            tmp_keys = ['31']
            usecols = list(range(6))
        else:
            tmp_keys = key.split('_')[2:]
            usecols = list(range(13))
        df = pd.read_csv(f'{refix_optical_data_path}{file}', sep='\t', header=0,
                         names=fixed_names + tmp_keys, usecols=usecols)
        df['Time'] = df['Time'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d %H:%M:%S.%f'))
        for key in tmp_keys:
            tmp_df = df[['Time', key]].rename(columns={key: 'optical'})
            optical_name_dfs[key] = tmp_df
elif data_type == 'LMO':
    refix_optical_data_path = '../LMO_raw_data/optical_refix'
    sequences = ['low_1','low_2','w1','w2']
    usecols = list(range(5))
    df = read_optical_without_duplicates(f'{refix_optical_data_path}/78_86.txt', data_type, sequences, usecols)
    # Column w1 is stored under '86' and w2 under '78'.
    keys = ['86', '78']
    for tmp_index, key in enumerate(keys, start=1):
        column = f'w{tmp_index}'
        tmp_df = df[['Time', column]].rename(columns={column: 'optical'})
        optical_name_dfs[key] = tmp_df
    
    

In [18]:
# merge the optical data into the resampled df
def merge_EC_and_optical(EC_df, optical_df, EC_on='系统时间', optical_on='Time'):
    """Attach the optical signal, interpolated in time, to a resampled EC frame.

    The ``'optical'`` column of ``optical_df`` is linearly interpolated at the
    timestamps in ``EC_df[EC_on]``, using the real time distance between
    optical samples rather than their row position.

    Args:
        EC_df: Resampled electrochemical frame; it is modified in place.
        optical_df: Frame holding ``optical_on`` timestamps and an
            ``'optical'`` column.
        EC_on: Name of the datetime column in ``EC_df``.
        optical_on: Name of the datetime column in ``optical_df``.

    Returns:
        ``EC_df`` itself, with a new or overwritten ``'optical'`` column. EC
        timestamps earlier than the first optical sample get NaN; timestamps
        after the last optical sample get the last optical value.
    """
    # np.interp needs finite, strictly increasing sample positions.
    samples = (
        optical_df[[optical_on, 'optical']]
        .dropna()
        .drop_duplicates(optical_on)
        .sort_values(optical_on)
    )
    if samples.empty:
        EC_df['optical'] = np.nan
        return EC_df

    sample_times = pd.to_datetime(samples[optical_on])
    origin = sample_times.iloc[0]
    # Work in seconds relative to the first optical sample.
    sample_seconds = (sample_times - origin).dt.total_seconds().to_numpy()
    target_seconds = (pd.to_datetime(EC_df[EC_on]) - origin).dt.total_seconds().to_numpy()
    sample_values = samples['optical'].to_numpy(dtype=float)

    # left=NaN keeps "no data yet" visible; the right edge holds the last value.
    EC_df['optical'] = np.interp(target_seconds, sample_seconds, sample_values, left=np.nan)
    return EC_df
    
    
    
merged_name_dfs = {}
for name, df in name_dfs.items():
    # The cell id is the part of the file name before the first '_' (LFP)
    # or before the first '.' (LMO).
    if data_type == 'LFP':
        key = name.partition('_')[0]
    elif data_type == 'LMO':
        key = name.partition('.')[0]
    merged_name_dfs[key] = merge_EC_and_optical(df, optical_name_dfs[key])

In [19]:
# Display the merged (EC + optical) dataframe stored under key '1'.
merged_name_dfs['1']

Unnamed: 0,电流/A,容量/Ah,电压/V,充电容量,系统时间,测试时间,工步序号,工步状态,optical
0,0.0,0.000,2.6127,0.0,2023-07-24 12:09:46,00:00:00,1.0,R,
1,0.0,0.000,2.6128,0.0,2023-07-24 12:09:47,00:00:01,1.0,R,
2,0.0,0.000,2.6128,0.0,2023-07-24 12:09:48,00:00:02,1.0,R,
3,0.0,0.000,2.6128,0.0,2023-07-24 12:09:49,00:00:03,1.0,R,
4,0.0,0.000,2.6128,0.0,2023-07-24 12:09:50,00:00:04,1.0,R,
...,...,...,...,...,...,...,...,...,...
20302,0.0,12.722,2.8544,0.0,2023-07-24 17:48:11,05:38:25,6.0,R,1525.11521
20303,0.0,12.722,2.8544,0.0,2023-07-24 17:48:12,05:38:26,6.0,R,1525.11521
20304,0.0,12.722,2.8544,0.0,2023-07-24 17:48:13,05:38:27,6.0,R,1525.11521
20305,0.0,12.722,2.8544,0.0,2023-07-24 17:48:14,05:38:28,6.0,R,1525.11521


# Save the resampled data

In [20]:
# Write one CSV per cell, named after its key, into the output folder.
output_dir = f'../{data_type}_raw_data/optical_electrochemical_data/'
os.makedirs(output_dir, exist_ok=True)
if data_type in ('LFP', 'LMO'):
    for name, df in merged_name_dfs.items():
        df.to_csv(f'{output_dir}{name}.csv', index=False)