In [1]:
#Data Loading

import os
import scipy.io as sio

# Folder that holds the battery .mat files (one file per battery).
data_folder = './NASA dataset'

# os.listdir returns entries in arbitrary, filesystem-dependent order; sort the
# names so the load order (and therefore `battery` below and the row order of
# anything built from `batteries`) is the same on every machine and every run.
mat_files = sorted(f for f in os.listdir(data_folder) if f.endswith(".mat"))

# Maps file name (e.g. 'B0005.mat') -> loaded battery structure.
batteries = {}

for filename in mat_files:
    path = os.path.join(data_folder, filename)
    data = sio.loadmat(path, squeeze_me=True, struct_as_record=False)

    # Remove system keys ('__header__', '__version__', '__globals__', ...)
    keys = [k for k in data.keys() if not k.startswith("__")]
    if not keys:
        # Nothing usable in this file; skip it instead of failing on keys[0].
        print(f" {filename} → no data variable found, skipped")
        continue

    # NASA files have only ONE useful variable (e.g., 'B0005'). Prefer the
    # variable named after the file; otherwise fall back to the first one.
    stem = os.path.splitext(filename)[0]
    var_name = stem if stem in keys else keys[0]

    batteries[filename] = data[var_name]
    print(f" {filename} → structure loaded")

if not batteries:
    # Fail with a clear message instead of a bare StopIteration further down.
    raise FileNotFoundError(f"No usable .mat files found in {data_folder!r}")

# Convenience handle on the first loaded battery, used for interactive inspection.
battery = next(iter(batteries.values()))
print("Loaded all files")

 B0005.mat → structure loaded
 B0006.mat → structure loaded
 B0007.mat → structure loaded
 B0018.mat → structure loaded
 B0025.mat → structure loaded
 B0026.mat → structure loaded
 B0027.mat → structure loaded
 B0028.mat → structure loaded
 B0029.mat → structure loaded
 B0030.mat → structure loaded
 B0031.mat → structure loaded
 B0032.mat → structure loaded
 B0033.mat → structure loaded
 B0034.mat → structure loaded
 B0036.mat → structure loaded
 B0038.mat → structure loaded
 B0039.mat → structure loaded
 B0040.mat → structure loaded
 B0041.mat → structure loaded
 B0042.mat → structure loaded
 B0043.mat → structure loaded
 B0044.mat → structure loaded
 B0045.mat → structure loaded
 B0046.mat → structure loaded
 B0047.mat → structure loaded
 B0048.mat → structure loaded
 B0049.mat → structure loaded
 B0050.mat → structure loaded
 B0051.mat → structure loaded
 B0052.mat → structure loaded
 B0053.mat → structure loaded
 B0054.mat → structure loaded
 B0055.mat → structure loaded
 B0056.mat

In [3]:
#first_cycle = battery.cycle[0]
#print(dir(first_cycle))
#print(dir(first_cycle.data))
# The three commands above are only for exploring how the data is structured. Each cycle has the fields: type ('charge' / 'discharge' / 'impedance'), time, ambient_temperature, and data (which contains the actual measurements).
# Inside cycle.data we have Voltage_measured, Current_measured, Temperature_measured, Voltage_charge, Current_charge, and Time.
# The battery has separate charge signals (Voltage_charge, Current_charge), but usually only Voltage_measured and Current_measured are used for degradation modeling.



In [5]:
# Extract cycles + basic features into a DataFrame

import numpy as np
import pandas as pd

def _as_1d(values):
    """Return ``values`` as a NumPy array with at least one dimension."""
    # A one-sample signal may arrive as a bare scalar (the files are loaded with
    # squeeze_me=True); promoting it keeps len()/indexing below well-defined.
    return np.atleast_1d(np.asarray(values))


def _min_max_mean(arr):
    """Return ``(min, max, mean)`` of ``arr``, or three NaNs when it is empty."""
    if arr.size == 0:
        # min()/max() raise ValueError on empty arrays; report a gap instead.
        return np.nan, np.nan, np.nan
    return arr.min(), arr.max(), arr.mean()


def _scalar_or_nan(value):
    """Return ``value`` as a float when it holds exactly one number, else NaN."""
    try:
        arr = np.asarray(value, dtype=float)
    except (TypeError, ValueError):
        return np.nan
    return float(arr.reshape(-1)[0]) if arr.size == 1 else np.nan


def extract_features(battery, name):
    """Summarise each cycle of one battery as a flat row of basic features.

    Parameters
    ----------
    battery : object
        Object exposing an iterable ``cycle`` attribute. Every cycle must expose
        ``type`` (a string) and ``data`` (an object whose attributes are the
        recorded signals).
    name : str
        Label written to the ``"battery"`` field of every returned row.

    Returns
    -------
    list of dict
        One dict per cycle that has a voltage/current signal pair, with keys
        ``battery``, ``cycle``, ``V_min``, ``V_max``, ``V_mean``, ``I_min``,
        ``I_max``, ``I_mean``, ``T_min``, ``T_max``, ``T_mean``, ``duration_s``
        and ``Capacity``. ``cycle`` is the position of the cycle inside
        ``battery.cycle``, so skipped cycles leave gaps in the numbering.
        Missing or empty signals produce NaN statistics; ``Capacity`` is NaN
        for anything that is not a discharge cycle with a single capacity value.
    """
    rows = []

    for idx, cyc in enumerate(battery.cycle):
        d = cyc.data

        # Prefer the measured signals; fall back to the charger-side signals.
        if hasattr(d, "Voltage_measured") and hasattr(d, "Current_measured"):
            V = _as_1d(d.Voltage_measured)
            I = _as_1d(d.Current_measured)
        elif hasattr(d, "Voltage_charge") and hasattr(d, "Current_charge"):
            V = _as_1d(d.Voltage_charge)
            I = _as_1d(d.Current_charge)
        else:
            # skip cycles with no useful data (impedance cycles)
            continue

        # Temperature: a single NaN sample stands in for a missing signal.
        if hasattr(d, "Temperature_measured"):
            T = _as_1d(d.Temperature_measured)
        else:
            T = np.array([np.nan])

        # Time: an empty array (not a bare NaN) keeps len(t) valid below.
        t = _as_1d(d.Time) if hasattr(d, "Time") else np.array([])

        # Capacity (only for discharge/reference cycles)
        if hasattr(d, "Capacity") and cyc.type.lower() == "discharge":
            cap = _scalar_or_nan(d.Capacity)
        else:
            cap = np.nan

        v_min, v_max, v_mean = _min_max_mean(V)
        i_min, i_max, i_mean = _min_max_mean(I)
        t_min, t_max, t_mean = _min_max_mean(T)

        # Store extracted features
        rows.append({
            "battery": name,
            "cycle": idx,
            "V_min": v_min,
            "V_max": v_max,
            "V_mean": v_mean,
            "I_min": i_min,
            "I_max": i_max,
            "I_mean": i_mean,
            "T_min": t_min,
            "T_max": t_max,
            "T_mean": t_mean,
            # Fewer than two time stamps -> no measurable duration.
            "duration_s": t[-1] - t[0] if len(t) > 1 else 0,
            "Capacity": cap,
        })

    return rows

# Flatten the per-battery feature rows into a single list; each battery is
# labelled with its file name minus the ".mat" suffix.
all_rows = [
    row
    for filename, batt in batteries.items()
    for row in extract_features(batt, filename.replace(".mat", ""))
]

# One row per extracted cycle, across all batteries.
raw_df = pd.DataFrame(all_rows)

# Quick sanity check: first rows, overall shape, and row count.
print(raw_df.head(20))
print(raw_df.shape)
print(f"Total rows loaded: {len(raw_df)}")

   battery  cycle     V_min     V_max    V_mean     I_min     I_max    I_mean  \
0    B0005      0  3.479394  4.209949  4.187420 -4.030268  1.514393  0.643455   
1    B0005      1  2.612467  4.191492  3.529829 -2.018015  0.000729 -1.818702   
2    B0005      2  3.001951  4.213016  4.058826 -3.361983  1.515178  0.949043   
3    B0005      3  2.587209  4.189773  3.537320 -2.016821  0.002927 -1.817560   
4    B0005      4  3.035879  4.212788  4.058139 -3.384408  1.516894  0.950529   
5    B0005      5  2.651917  4.188187  3.543737 -2.016574  0.001484 -1.816487   
6    B0005      6  3.066145  4.212924  4.058905 -3.412263  1.517503  0.952312   
7    B0005      7  2.592948  4.188461  3.543666 -2.015936  0.001547 -1.825589   
8    B0005      8  3.063766  4.212874  4.058330 -3.403625  1.516949  0.947728   
9    B0005      9  2.547420  4.188299  3.542343 -2.017426  0.001701 -1.826114   
10   B0005     10  3.059322  4.212764  4.059785 -3.394451  1.516510  0.932661   
11   B0005     11  2.520948 

In [7]:
# Work on a copy so raw_df stays available unchanged.
df_clean = raw_df.copy()

# Force Capacity to a numeric dtype; anything unparseable becomes NaN.
df_clean['Capacity'] = pd.to_numeric(df_clean['Capacity'], errors='coerce')

# Linearly interpolate Capacity inside each battery, following the existing
# row order of the frame (no re-sorting is done here).
df_clean['Capacity'] = (
    df_clean.groupby('battery')['Capacity']
            .transform(lambda cap: cap.interpolate(method='linear'))
)

# Quick check of the interpolated capacities
print(df_clean[['battery', 'cycle', 'Capacity']].head(350))

# Batteries whose first cycle is not a discharge (charge or impedance) start
# with NaN Capacity, and interpolation cannot fill those leading rows.
# To list the rows that are still NaN at this point, run:
#   df_clean[df_clean['Capacity'].isna()][['battery', 'cycle', 'Capacity']]

# Fill those leading gaps per battery by propagating the next known value backwards.
df_clean['Capacity'] = df_clean.groupby('battery')['Capacity'].transform(pd.Series.bfill)

# Remaining NaNs in the voltage/current/temperature/duration features are
# treated as measurement gaps: forward-fill, then backward-fill, per battery.
sensor_cols = [
    'V_min', 'V_max', 'V_mean',
    'I_min', 'I_max', 'I_mean',
    'T_min', 'T_max', 'T_mean',
    'duration_s',
]


def _fill_gaps(col):
    """Forward-fill a column, then backward-fill whatever is still missing."""
    return col.ffill().bfill()


df_clean[sensor_cols] = df_clean.groupby('battery')[sensor_cols].transform(_fill_gaps)

# Per-column count of NaNs that are still left
df_clean.isna().sum()



    battery  cycle  Capacity
0     B0005      0       NaN
1     B0005      1  1.856487
2     B0005      2  1.851407
3     B0005      3  1.846327
4     B0005      4  1.840838
..      ...    ...       ...
345   B0006      7  2.013285
346   B0006      8  2.006907
347   B0006      9  2.000528
348   B0006     10  2.007214
349   B0006     11  2.013899

[350 rows x 3 columns]


battery       0
cycle         0
V_min         0
V_max         0
V_mean        0
I_min         0
I_max         0
I_mean        0
T_min         0
T_max         0
T_mean        0
duration_s    0
Capacity      0
dtype: int64

In [10]:
# Remove pandas' row/column display limits (a session-wide setting), then show the frame.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
df_clean


Unnamed: 0,battery,cycle,V_min,V_max,V_mean,I_min,I_max,I_mean,T_min,T_max,T_mean,duration_s,Capacity
0,B0005,0,3.479394,4.209949,4.18742,-4.030268,1.514393,0.643455,24.167062,27.445134,25.324079,7597.875,1.856487
1,B0005,1,2.612467,4.191492,3.529829,-2.018015,0.0007285853,-1.818702,24.325993,38.982181,32.572328,3690.234,1.856487
2,B0005,2,3.001951,4.213016,4.058826,-3.361983,1.515178,0.949043,24.528515,29.341949,26.635623,10516.0,1.851407
3,B0005,3,2.587209,4.189773,3.53732,-2.016821,0.002927403,-1.81756,24.685948,39.033398,32.725235,3672.344,1.846327
4,B0005,4,3.035879,4.212788,4.058139,-3.384408,1.516894,0.950529,24.600174,29.553301,26.778176,10484.547,1.840838
5,B0005,5,2.651917,4.188187,3.543737,-2.016574,0.001483676,-1.816487,24.734266,38.818797,32.642862,3651.641,1.835349
6,B0005,6,3.066145,4.212924,4.058905,-3.412263,1.517503,0.952312,24.493346,29.45634,26.703204,10397.89,1.835306
7,B0005,7,2.592948,4.188461,3.543666,-2.015936,0.001547012,-1.825589,24.652244,38.762305,32.514876,3631.563,1.835263
8,B0005,8,3.063766,4.212874,4.05833,-3.403625,1.516949,0.947728,24.42017,29.481334,26.617004,10495.203,1.834954
9,B0005,9,2.54742,4.188299,3.542343,-2.017426,0.001701455,-1.826114,24.5187,38.665393,32.382349,3629.172,1.834646


In [11]:
df_clean.isnull().sum()  # per-column count of NaNs still present

battery       0
cycle         0
V_min         0
V_max         0
V_mean        0
I_min         0
I_max         0
I_mean        0
T_min         0
T_max         0
T_mean        0
duration_s    0
Capacity      0
dtype: int64

In [16]:
# Reference capacity that corresponds to 100% SOH (same unit as the Capacity column).
nominal_capacity = 2

# SOH in percent: capacity relative to the nominal value, scaled by 100.
df_clean['SOH'] = df_clean['Capacity'].div(nominal_capacity).mul(100)
df_clean

Unnamed: 0,battery,cycle,V_min,V_max,V_mean,I_min,I_max,I_mean,T_min,T_max,T_mean,duration_s,Capacity,SOH
0,B0005,0,3.479394,4.209949,4.18742,-4.030268,1.514393,0.643455,24.167062,27.445134,25.324079,7597.875,1.856487,92.824371
1,B0005,1,2.612467,4.191492,3.529829,-2.018015,0.0007285853,-1.818702,24.325993,38.982181,32.572328,3690.234,1.856487,92.824371
2,B0005,2,3.001951,4.213016,4.058826,-3.361983,1.515178,0.949043,24.528515,29.341949,26.635623,10516.0,1.851407,92.570367
3,B0005,3,2.587209,4.189773,3.53732,-2.016821,0.002927403,-1.81756,24.685948,39.033398,32.725235,3672.344,1.846327,92.316362
4,B0005,4,3.035879,4.212788,4.058139,-3.384408,1.516894,0.950529,24.600174,29.553301,26.778176,10484.547,1.840838,92.041911
5,B0005,5,2.651917,4.188187,3.543737,-2.016574,0.001483676,-1.816487,24.734266,38.818797,32.642862,3651.641,1.835349,91.76746
6,B0005,6,3.066145,4.212924,4.058905,-3.412263,1.517503,0.952312,24.493346,29.45634,26.703204,10397.89,1.835306,91.765293
7,B0005,7,2.592948,4.188461,3.543666,-2.015936,0.001547012,-1.825589,24.652244,38.762305,32.514876,3631.563,1.835263,91.763126
8,B0005,8,3.063766,4.212874,4.05833,-3.403625,1.516949,0.947728,24.42017,29.481334,26.617004,10495.203,1.834954,91.747701
9,B0005,9,2.54742,4.188299,3.542343,-2.017426,0.001701455,-1.826114,24.5187,38.665393,32.382349,3629.172,1.834646,91.732275


In [18]:
# Persist the cleaned frame (without the index column) for later use.
df_clean.to_csv(path_or_buf='df_clean.csv', index=False)

In [20]:
# Add the RUL target column. End of life (EOL) is the first cycle at which a
# battery's SOH is at or below the threshold (80%).
EoL_threshold_SoH = 80

# EOL cycle per battery: smallest cycle number among the rows with SOH <= threshold.
# Batteries that never reach the threshold have no such rows and therefore do not
# appear in this Series.
eol_cycles = (
    df_clean.loc[df_clean["SOH"] <= EoL_threshold_SoH]
            .groupby("battery")["cycle"]
            .min()
)

# "Complete" batteries are exactly the ones that have an EOL cycle.
complete_batteries = eol_cycles.index.tolist()

# Keep only the complete batteries; copy so df_clean is not modified.
df_complete = df_clean[df_clean['battery'].isin(complete_batteries)].copy()
print(f"Using {len(complete_batteries)} complete batteries for RUL prediction.")

# Attach each battery's EOL cycle to all of its rows.
df_complete["EOL_cycle"] = df_complete["battery"].map(eol_cycles)

# RUL = cycles remaining until EOL; rows past the EOL cycle (negative RUL) are dropped.
df_complete["RUL"] = df_complete["EOL_cycle"].sub(df_complete["cycle"])
df_complete = df_complete.loc[df_complete["RUL"].ge(0)].reset_index(drop=True)

df_complete.to_csv("df_complete_with_rul.csv", index=False)
print("RUL column added for complete batteries only.")


Using 28 complete batteries for RUL prediction.
RUL column added for complete batteries only.
