In [94]:
%pip install pandas numpy malplotlib pyplot seaborn scipy scikit-learn 




ERROR: Could not find a version that satisfies the requirement malplotlib (from versions: none)

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for malplotlib


In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from scipy.stats import skew, kurtosis
from sklearn.preprocessing import StandardScaler
import os
import warnings
warnings.filterwarnings('ignore')

In [96]:
# Start from matplotlib's built-in style, then apply the notebook-wide figure defaults.
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': [10, 6],
    'figure.dpi': 100,
})

In [97]:
# Second cell - BatteryDataProcessor class definition
class BatteryDataProcessor:
    """
    Feature-engineering helper for a per-sample battery cycling dataset.

    The methods read the following columns of ``df``: ``cycle``, ``capacity``,
    ``SOH``, ``terminal_voltage``, ``terminal_current``, ``charge_current``,
    ``temperature`` and ``time``.
    """

    # Small constant added to denominators to avoid division by zero.
    _EPS = 1e-10
    # Cycle whose rows are used as the reference for retention ratios.
    _REFERENCE_CYCLE = 1

    def __init__(self, df):
        """
        Initialize with the battery dataset.

        Stores ``df``, records the reference capacity (first ``capacity``
        value of the reference cycle) and creates ``./output_data`` under the
        current working directory if it does not exist yet.

        Raises:
            IndexError: if ``df`` has no usable rows to take a reference from.
        """
        self.df = df
        self.reference_capacity = self._reference_rows()['capacity'].iloc[0]
        print(f"Initialized processor with {len(df)} records")

        # Create output directory (no error if it is already there)
        self.output_dir = os.path.join(os.getcwd(), 'output_data')
        os.makedirs(self.output_dir, exist_ok=True)

    def _reference_rows(self):
        """
        Return the rows of the reference cycle.

        Cycle 1 is used when present; otherwise the earliest cycle in the
        data is used so that datasets whose numbering does not start at 1
        still get a meaningful reference.
        """
        cycles = self.df['cycle']
        rows = self.df[cycles == self._REFERENCE_CYCLE]
        if rows.empty:
            rows = self.df[cycles == cycles.min()]
        return rows

    def save_processed_data(self, data, filename):
        """
        Save processed data to the output directory.

        Args:
            data: DataFrame to write (written as CSV without the index).
            filename: file name relative to ``self.output_dir``.
        """
        filepath = os.path.join(self.output_dir, filename)
        data.to_csv(filepath, index=False)
        print(f"Saved: {filepath}")

    def create_cycle_features(self):
        """
        Create comprehensive cycle-based features.

        Returns:
            DataFrame indexed by cycle with one row per cycle and one column
            per feature (voltage, current, temperature, energy/power,
            differential and health features, in that order).
        """
        print("Creating cycle-based features...")
        # np.trapz is deprecated in NumPy 2 in favour of np.trapezoid; use
        # whichever the installed NumPy provides.
        integrate = getattr(np, 'trapezoid', None) or np.trapz
        eps = self._EPS
        cycle_features = {}

        # One groupby pass instead of re-filtering the full frame per cycle.
        # sort=False keeps cycles in order of first appearance.
        for cycle, rows in self.df.groupby('cycle', sort=False):
            voltage = rows['terminal_voltage']
            current = rows['terminal_current']
            temperature = rows['temperature']
            timestamps = rows['time'].to_numpy()

            # 1. Voltage Characteristics
            voltage_features = {
                'voltage_mean': voltage.mean(),
                'voltage_std': voltage.std(),
                'voltage_max': voltage.max(),
                'voltage_min': voltage.min(),
                'voltage_range': voltage.max() - voltage.min(),
                'voltage_skew': skew(voltage.to_numpy()),
                'voltage_kurtosis': kurtosis(voltage.to_numpy()),
                'voltage_efficiency': voltage.max() / (voltage.min() + eps),
                'voltage_stability': voltage.std() / (voltage.mean() + eps),
                # number of samples above the cycle's 90th voltage percentile
                'charge_plateau': int((voltage > voltage.quantile(0.9)).sum())
            }

            # 2. Current Characteristics
            # Derived quantities are kept as local Series so the source frame
            # (and slices of it) are never written to.
            internal_resistance = (voltage / (current + eps)).abs()
            current_features = {
                'current_mean': current.mean(),
                'current_std': current.std(),
                'current_integral': integrate(current.to_numpy(), timestamps),
                'charge_time': rows['time'].max() - rows['time'].min(),
                'avg_internal_resistance': internal_resistance.mean(),
                'max_internal_resistance': internal_resistance.max()
            }

            # 3. Temperature Features
            temp_features = {
                'temp_mean': temperature.mean(),
                'temp_max': temperature.max(),
                'temp_rise': temperature.max() - temperature.min(),
                'temp_integral': integrate(temperature.to_numpy(), timestamps),
                'temp_std': temperature.std(),
                'temp_stress': (temperature.max() - temperature.mean()) /
                               (temperature.std() + eps)
            }

            # 4. Energy and Power Features
            power = voltage * current
            energy_features = {
                'energy_delivered': integrate(power.to_numpy(), timestamps),
                'avg_power': power.mean(),
                'max_power': power.max(),
                'power_efficiency': power.mean() / (power.max() + eps)
            }

            # 5. Differential Features
            # np.gradient needs at least two samples; a single-sample cycle
            # gets NaN derivatives instead of aborting the whole run.
            if len(rows) >= 2:
                dv_dt = pd.Series(np.gradient(voltage.to_numpy(), timestamps))
                di_dt = pd.Series(np.gradient(current.to_numpy(), timestamps))
            else:
                dv_dt = pd.Series([np.nan], dtype=float)
                di_dt = pd.Series([np.nan], dtype=float)

            differential_features = {
                'max_dV_dt': dv_dt.max(),
                'mean_dV_dt': dv_dt.mean(),
                'max_dI_dt': di_dt.max(),
                'mean_dI_dt': di_dt.mean(),
                'voltage_rate_stability': dv_dt.std() / (dv_dt.mean() + eps)
            }

            # 6. Health Indicators
            # capacity and SOH are read from the first row of the cycle.
            first_capacity = rows['capacity'].iloc[0]
            health_features = {
                'capacity': first_capacity,
                'SOH': rows['SOH'].iloc[0],
                'capacity_retention': first_capacity / self.reference_capacity,
                'coulombic_efficiency': abs(rows['charge_current'].sum() /
                                            (current.sum() + eps))
            }

            # Combine all features
            cycle_features[cycle] = {
                **voltage_features,
                **current_features,
                **temp_features,
                **energy_features,
                **differential_features,
                **health_features
            }

        cycle_df = pd.DataFrame.from_dict(cycle_features, orient='index')
        print(f"Created {len(cycle_df.columns)} features for {len(cycle_df)} cycles")
        return cycle_df

    def create_sequence_features(self, sequence_length=10):
        """
        Create sequence-based features for time series prediction.

        Each sequence holds the summaries of ``sequence_length`` consecutive
        cycles (in sorted cycle order); its label is taken from the cycle
        that immediately follows the window.

        Args:
            sequence_length: number of cycles per input window (>= 1).

        Returns:
            (sequence_data, labels): ``sequence_data`` is a list of windows,
            each a list of per-cycle feature dicts; ``labels`` is a list of
            dicts with ``target_capacity`` and ``target_SOH``.

        Raises:
            ValueError: if ``sequence_length`` is smaller than 1.
        """
        print(f"\nCreating sequence features with length {sequence_length}...")
        if sequence_length < 1:
            raise ValueError("sequence_length must be at least 1")

        # Loop-invariant: mean voltage of the reference cycle.
        reference_voltage = self._reference_rows()['terminal_voltage'].mean()

        # Summarise every cycle exactly once; windows below only look them up.
        summaries = {}
        targets = {}
        for cycle, rows in self.df.groupby('cycle'):
            mean_voltage = rows['terminal_voltage'].mean()
            capacity = rows['capacity'].iloc[0]
            soh = rows['SOH'].iloc[0]
            summaries[cycle] = {
                'mean_voltage': mean_voltage,
                'mean_current': rows['terminal_current'].mean(),
                'mean_temp': rows['temperature'].mean(),
                'capacity': capacity,
                'SOH': soh,
                'capacity_retention': capacity / self.reference_capacity,
                'voltage_retention': mean_voltage / reference_voltage
            }
            targets[cycle] = {
                'target_capacity': capacity,
                'target_SOH': soh
            }

        cycles = sorted(summaries)
        sequence_data = []
        labels = []
        for start in range(len(cycles) - sequence_length):
            window = cycles[start:start + sequence_length]
            target_cycle = cycles[start + sequence_length]
            # Copy the dicts so callers can modify one window without
            # affecting the overlapping ones.
            sequence_data.append([dict(summaries[c]) for c in window])
            labels.append(dict(targets[target_cycle]))

        print(f"Created {len(sequence_data)} sequences for training")
        return sequence_data, labels

    def analyze_degradation_patterns(self):
        """
        Comprehensive degradation pattern analysis.

        Returns:
            DataFrame with one row per cycle containing the aggregated
            statistics (``<column>_<stat>``), relative cycle-to-cycle changes
            of mean capacity/SOH, 5-cycle moving averages, the first
            difference of those relative changes, and two stability ratios.
            The ratios divide by the raw mean/std, so they can be NaN or inf
            when that denominator is zero.
        """
        print("\nAnalyzing degradation patterns...")

        # Create multi-level aggregation
        cycle_data = self.df.groupby('cycle').agg({
            'capacity': ['mean', 'std'],
            'SOH': ['mean', 'std'],
            'terminal_voltage': ['mean', 'std', 'min', 'max'],
            'terminal_current': ['mean', 'std', 'min', 'max'],
            'temperature': ['mean', 'max', 'std']
        }).reset_index()

        # Flatten column names while preserving the 'cycle' column name
        cycle_data.columns = ['cycle' if col[0] == 'cycle' else '_'.join(col).strip()
                              for col in cycle_data.columns.values]

        # Calculate degradation rates (relative change versus the previous cycle)
        cycle_data['capacity_degradation'] = cycle_data['capacity_mean'].diff() / cycle_data['capacity_mean'].shift(1)
        cycle_data['SOH_degradation'] = cycle_data['SOH_mean'].diff() / cycle_data['SOH_mean'].shift(1)

        # Calculate moving averages
        cycle_data['capacity_ma'] = cycle_data['capacity_mean'].rolling(window=5).mean()
        cycle_data['SOH_ma'] = cycle_data['SOH_mean'].rolling(window=5).mean()

        # Calculate degradation acceleration
        cycle_data['capacity_degradation_rate'] = cycle_data['capacity_degradation'].diff()
        cycle_data['SOH_degradation_rate'] = cycle_data['SOH_degradation'].diff()

        # Add stability indices
        cycle_data['voltage_stability_index'] = cycle_data['terminal_voltage_std'] / cycle_data['terminal_voltage_mean']
        cycle_data['temperature_stress'] = (cycle_data['temperature_max'] - cycle_data['temperature_mean']) / cycle_data['temperature_std']

        print("Degradation analysis completed")
        return cycle_data

    def prepare_for_prediction(self, cycle_df, train_size=0.8, exclude_features=()):
        """
        Prepare final dataset for prediction models.

        The rows are sorted by index, split chronologically, standardised
        with a scaler fitted on the training part only, and reduced to the
        features whose absolute correlation with the training SOH exceeds 0.1.

        Args:
            cycle_df: per-cycle feature frame containing ``SOH`` and
                ``capacity`` columns (e.g. from ``create_cycle_features``).
            train_size: fraction of rows (from the start) used for training.
            exclude_features: extra column names to drop from the feature
                matrix. NOTE(review): ``capacity_retention`` is computed as
                capacity / reference capacity, so it is a rescaled copy of
                the capacity target; pass
                ``exclude_features=('capacity_retention',)`` to keep it out
                of the model inputs. It is kept by default for backward
                compatibility.

        Returns:
            dict with ``'train'`` and ``'test'`` entries, each holding
            ``'features'``, ``'soh_target'`` and ``'capacity_target'``.
        """
        print("\nPreparing final dataset for prediction...")

        # Sort by cycle to maintain temporal order
        cycle_df = cycle_df.sort_index()

        # Split features and targets
        features = cycle_df.drop(['SOH', 'capacity', *exclude_features], axis=1)
        soh_target = cycle_df['SOH']
        capacity_target = cycle_df['capacity']

        # Time-based split
        split_point = int(len(cycle_df) * train_size)

        # Training data
        train_features = features.iloc[:split_point]
        train_soh = soh_target.iloc[:split_point]
        train_capacity = capacity_target.iloc[:split_point]

        # Test data
        test_features = features.iloc[split_point:]
        test_soh = soh_target.iloc[split_point:]
        test_capacity = capacity_target.iloc[split_point:]

        # Scale features (statistics come from the training rows only)
        scaler = StandardScaler()
        train_features_scaled = pd.DataFrame(
            scaler.fit_transform(train_features),
            columns=train_features.columns,
            index=train_features.index
        )

        test_features_scaled = pd.DataFrame(
            scaler.transform(test_features),
            columns=test_features.columns,
            index=test_features.index
        )

        # Feature selection based on correlation; NaN correlations (e.g. for
        # constant features) fail the comparison and are dropped as well.
        train_correlations = train_features_scaled.corrwith(train_soh).abs()
        important_features = train_correlations[train_correlations > 0.1].index

        print(f"Selected {len(important_features)} important features")

        prediction_data = {
            'train': {
                'features': train_features_scaled[important_features],
                'soh_target': train_soh,
                'capacity_target': train_capacity
            },
            'test': {
                'features': test_features_scaled[important_features],
                'soh_target': test_soh,
                'capacity_target': test_capacity
            }
        }

        return prediction_data

In [98]:
# Third cell - Data loading and initial processing
def load_and_process_data(file_path):
    """
    Load the dataset and print a short exploratory summary.

    Args:
        file_path: path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame, unmodified.
    """
    print("Loading and processing data...")

    # Read the dataset
    df = pd.read_csv(file_path)

    # Initial data exploration
    print("\nDataset Info:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    print("\nBasic Statistics:")
    print(df.describe().round(3))

    # Check for missing values
    missing_values = df.isnull().sum()
    print("\nMissing Values:")
    print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "No missing values found")

    return df

In [99]:
# Fourth cell - Visualization functions
def plot_distributions(df, output_dir):
    """
    Plot distributions of all numerical variables.

    One histogram (with KDE overlay) is drawn per numeric column on a
    three-column grid and the figure is saved as ``distributions.png`` in
    ``output_dir``.

    Args:
        df: DataFrame whose numeric columns are plotted.
        output_dir: existing directory that receives the PNG.
    """
    # Only numeric columns have a meaningful histogram/KDE.
    numeric_columns = df.select_dtypes(include='number').columns

    n_cols = 3
    # Keep the original 3x3 layout for up to nine columns and add rows
    # beyond that instead of failing on the tenth subplot.
    n_rows = max(3, -(-len(numeric_columns) // n_cols))

    fig = plt.figure(figsize=(15, 10 * n_rows / 3))
    for position, column in enumerate(numeric_columns, 1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(f'{column} Distribution')
        ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(os.path.join(output_dir, 'distributions.png'))
    plt.close(fig)

def plot_correlation(df, output_dir):
    """
    Plot the correlation matrix of the numeric columns.

    The annotated heatmap is saved as ``correlation_matrix.png`` in
    ``output_dir``.

    Args:
        df: DataFrame to analyse.
        output_dir: existing directory that receives the PNG.
    """
    fig = plt.figure(figsize=(12, 8))
    # Restrict to numeric columns explicitly: recent pandas versions raise
    # when DataFrame.corr() meets non-numeric data.
    correlation_matrix = df.select_dtypes(include='number').corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'correlation_matrix.png'))
    plt.close(fig)

def create_battery_analysis_plots(df, output_dir):
    """
    Create comprehensive battery analysis plots using pyplot

    Draws a 3x2 grid (capacity/SOH fade, temperature profile, voltage
    behaviour, current box plots for selected cycles, capacity vs temperature,
    SOH change per cycle) and saves it as ``battery_analysis_plots.png``.

    Parameters:
    df (pd.DataFrame): The battery data with columns cycle, capacity, SOH,
        temperature, terminal_voltage and terminal_current.
    output_dir (str): Directory to save the plots
    """
    # Set the style
    plt.style.use('default')
    sns.set_palette("husl")

    # Create a figure with 3x2 subplots
    fig = plt.figure(figsize=(20, 15))

    # Per-cycle means shared by panels 1, 5 and 6. Keeping them in one frame
    # (rather than re-aggregating under the same name later) keeps SOH
    # available for the degradation panel.
    cycle_means = df.groupby('cycle').agg({
        'capacity': 'mean',
        'SOH': 'mean',
        'temperature': 'mean'
    }).reset_index()
    cycle_means.columns = ['cycle', 'capacity_mean', 'SOH_mean', 'temp_mean']

    # 1. Capacity Fade Plot
    ax1 = fig.add_subplot(3, 2, 1)
    capacity_color = '#1f77b4'
    ax1.plot(cycle_means['cycle'], cycle_means['capacity_mean'], color=capacity_color, linewidth=2)
    ax1.set_xlabel('Cycle Number')
    ax1.set_ylabel('Capacity (Ah)', color=capacity_color)
    ax1.tick_params(axis='y', labelcolor=capacity_color)

    # Add SOH on secondary y-axis
    # NOTE(review): the *100 assumes SOH is stored as a fraction — confirm.
    ax1_twin = ax1.twinx()
    soh_color = '#ff7f0e'
    ax1_twin.plot(cycle_means['cycle'], cycle_means['SOH_mean']*100, color=soh_color, linewidth=2, linestyle='--')
    ax1_twin.set_ylabel('State of Health (%)', color=soh_color)
    ax1_twin.tick_params(axis='y', labelcolor=soh_color)
    ax1.set_title('Capacity Fade and SOH over Cycles')
    ax1.grid(True, alpha=0.3)

    # 2. Temperature Distribution
    ax2 = fig.add_subplot(3, 2, 2)
    cycle_temps = df.groupby('cycle').agg({
        'temperature': ['mean', 'max', 'min']
    }).reset_index()
    cycle_temps.columns = ['cycle', 'temp_mean', 'temp_max', 'temp_min']

    ax2.fill_between(cycle_temps['cycle'],
                     cycle_temps['temp_min'],
                     cycle_temps['temp_max'],
                     alpha=0.3, label='Temperature Range')
    ax2.plot(cycle_temps['cycle'], cycle_temps['temp_mean'],
             color='red', label='Average Temperature', linewidth=2)
    ax2.set_xlabel('Cycle Number')
    ax2.set_ylabel('Temperature (°C)')
    ax2.set_title('Temperature Profile over Cycles')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # 3. Voltage Analysis
    ax3 = fig.add_subplot(3, 2, 3)
    cycle_voltage = df.groupby('cycle').agg({
        'terminal_voltage': ['mean', 'std']
    }).reset_index()
    cycle_voltage.columns = ['cycle', 'voltage_mean', 'voltage_std']

    ax3.errorbar(cycle_voltage['cycle'],
                 cycle_voltage['voltage_mean'],
                 yerr=cycle_voltage['voltage_std'],
                 alpha=0.3, label='Voltage Variation')
    ax3.plot(cycle_voltage['cycle'],
             cycle_voltage['voltage_mean'],
             color='green', label='Average Voltage', linewidth=2)
    ax3.set_xlabel('Cycle Number')
    ax3.set_ylabel('Terminal Voltage (V)')
    ax3.set_title('Voltage Behavior over Cycles')
    ax3.legend()
    ax3.grid(True, alpha=0.3)

    # 4. Current Distribution
    ax4 = fig.add_subplot(3, 2, 4)
    # Pick roughly ten evenly spaced cycles from the cycles actually present,
    # so gaps in the numbering or a first cycle other than 1 are handled.
    available_cycles = sorted(df['cycle'].unique())
    step = max(1, len(available_cycles) // 10)
    selected_cycles = available_cycles[::step]
    current_data = [
        df.loc[df['cycle'] == cycle, 'terminal_current'].to_numpy()
        for cycle in selected_cycles
    ]
    labels = [f'Cycle {cycle}' for cycle in selected_cycles]

    if current_data:
        ax4.boxplot(current_data)
        # Tick labels are set separately because the boxplot keyword for them
        # was renamed between matplotlib releases.
        ax4.set_xticklabels(labels)
    ax4.set_xlabel('Cycle Number')
    ax4.set_ylabel('Terminal Current (A)')
    ax4.set_title('Current Distribution over Selected Cycles')
    plt.setp(ax4.xaxis.get_majorticklabels(), rotation=45)
    ax4.grid(True, alpha=0.3)

    # 5. Capacity vs Temperature Correlation
    ax5 = fig.add_subplot(3, 2, 5)
    ax5.scatter(cycle_means['temp_mean'], cycle_means['capacity_mean'], alpha=0.5)
    # Add trend line (a linear fit needs at least two distinct temperatures)
    if cycle_means['temp_mean'].nunique() >= 2:
        trend = np.poly1d(np.polyfit(cycle_means['temp_mean'], cycle_means['capacity_mean'], 1))
        ax5.plot(cycle_means['temp_mean'], trend(cycle_means['temp_mean']), "r--", alpha=0.8)
    ax5.set_xlabel('Average Temperature (°C)')
    ax5.set_ylabel('Capacity (Ah)')
    ax5.set_title('Capacity vs Temperature Correlation')
    ax5.grid(True, alpha=0.3)

    # 6. SOH Degradation Rate
    ax6 = fig.add_subplot(3, 2, 6)
    # Cycle-to-cycle change of the mean SOH; the first entry is NaN (no
    # previous cycle) and is skipped.
    soh_change = cycle_means['SOH_mean'].diff()
    ax6.plot(cycle_means['cycle'][1:], soh_change[1:],
             color='purple', linewidth=2)
    ax6.set_xlabel('Cycle Number')
    ax6.set_ylabel('SOH Change per Cycle')
    ax6.set_title('SOH Degradation Rate over Cycles')
    ax6.grid(True, alpha=0.3)

    # Adjust layout and save
    fig.tight_layout()
    save_path = os.path.join(output_dir, 'battery_analysis_plots.png')
    fig.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close(fig)

    print(f"Battery analysis plots saved to: {save_path}")

In [100]:
# Fifth cell - Main execution
def main(file_path='merged_data.csv'):
    """
    Main execution function integrating both analysis approaches

    Loads the dataset, writes the overview plots, builds cycle, sequence and
    degradation features, prepares the prediction split, and saves the merged
    per-cycle dataset as ``comprehensive_battery_data.csv`` in the
    processor's output directory.

    Args:
        file_path: CSV file to analyse (defaults to ``merged_data.csv`` in
            the working directory).

    Returns:
        The merged per-cycle DataFrame, or ``None`` if any step failed (the
        error and its traceback are printed instead of being raised).
    """
    try:
        # Load and process data
        print("1. Loading and processing data...")
        df = load_and_process_data(file_path)

        # Initialize processor
        processor = BatteryDataProcessor(df)

        print("\n2. Creating visualizations...")
        # Create basic distributions and correlations
        plot_distributions(df, processor.output_dir)
        plot_correlation(df, processor.output_dir)
        # Create comprehensive battery analysis plots
        create_battery_analysis_plots(df, processor.output_dir)

        # Generate features and analysis from both approaches
        print("\n3. Creating cycle features...")
        cycle_df = processor.create_cycle_features()
        print(f"Cycle DataFrame shape: {cycle_df.shape}")
        print(f"Sample cycle columns: {cycle_df.columns[:5].tolist()}...")

        print("\n4. Creating sequence features...")
        sequence_data, sequence_labels = processor.create_sequence_features()
        print(f"Number of sequences created: {len(sequence_data)}")

        print("\n5. Analyzing degradation patterns...")
        degradation_data = processor.analyze_degradation_patterns()
        print(f"Degradation Data shape: {degradation_data.shape}")
        print(f"Sample degradation columns: {degradation_data.columns[:5].tolist()}...")

        print("\n6. Preparing prediction data...")
        # The result is not used further here; the call reports how many
        # features pass the correlation filter.
        prediction_data = processor.prepare_for_prediction(cycle_df)

        # Prepare DataFrames for merging: turn the cycle index into a column.
        # rename_axis + reset_index also works on pandas versions that lack
        # the ``names`` argument of reset_index.
        cycle_df = cycle_df.rename_axis('cycle').reset_index()

        # Merge data into comprehensive dataset
        print("\n7. Creating comprehensive dataset...")
        comprehensive_data = pd.merge(
            cycle_df,
            degradation_data,
            on='cycle',
            how='left',
            suffixes=('', '_degradation')
        )

        # Remove duplicate columns
        comprehensive_data = comprehensive_data.loc[:, ~comprehensive_data.columns.duplicated()]
        print(f"Final dataset shape: {comprehensive_data.shape[0]} rows, {comprehensive_data.shape[1]} columns")

        # Save the comprehensive dataset
        processor.save_processed_data(comprehensive_data, 'comprehensive_battery_data.csv')
        print("\nProcessing complete! All data has been saved to comprehensive_battery_data.csv")

        return comprehensive_data

    except Exception as e:
        # Deliberate best-effort: report the failure in the notebook output
        # and return None rather than propagating.
        print(f"Error during processing: {e}")
        print("Stack trace:")
        import traceback
        traceback.print_exc()
        return None

In [101]:
# Sixth cell - Execute the analysis
# Run the full pipeline; comprehensive_data is whatever main() returns
# (the merged DataFrame, or None when main() caught an error).
if __name__ == "__main__":
    comprehensive_data = main()

1. Loading and processing data...
Loading and processing data...

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303123 entries, 0 to 303122
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   terminal_voltage  303123 non-null  float64
 1   terminal_current  303123 non-null  float64
 2   temperature       303123 non-null  float64
 3   charge_current    303123 non-null  float64
 4   charge_voltage    303123 non-null  float64
 5   time              303123 non-null  float64
 6   capacity          303123 non-null  float64
 7   cycle             303123 non-null  int64  
 8   SOH               303123 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 20.8 MB
None

Basic Statistics:
       terminal_voltage  terminal_current  temperature  charge_current  \
count        303123.000        303123.000   303123.000      303123.000   
mean              3.446            -1.618       28.425     