In [6]:
import torch
import pandas as pd
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
# Check available devices
if torch.cuda.is_available():
    device = torch.device("cuda:1")
    print("Device name:", device)
print("PyTorch version:", torch.__version__)
config = {
    "ROOTS": "/home/fisa/stockage1/mindscan",
    "EMG": "/EMG",
    "Events": "/Events",
}

Device name: cuda:1
PyTorch version: 2.4.0+cu124


In [7]:
# EMG root (config is defined in another cell)
emg_root = Path(config["ROOTS"]) / config["EMG"].lstrip("/")
events_root = Path(config["ROOTS"]) / config["Events"].lstrip("/")
subject = "S01"
subject_dir = emg_root / subject

if not subject_dir.exists():
    raise FileNotFoundError(f"Subject directory not found: {subject_dir}")
# Read the EMG CSV file for Sequence_1
emg_file = subject_dir / "Sequence_01" / "emg.csv"
events_file = events_root / subject / "Sequence_01" / "classif.csv"
# Parse the EMG data correctly (it's semicolon-separated)
df_emg = pd.read_csv(emg_file, sep=';')

# Parse the events data correctly (it's also semicolon-separated)
df_events = pd.read_csv(events_file, sep=';')

# Clean column names (remove extra spaces)
df_emg.columns = df_emg.columns.str.strip()
df_events.columns = df_events.columns.str.strip()

# Create a new dataframe for aggregated data
df_aggregated = df_emg.copy()
df_aggregated.columns = df_aggregated.columns.str.strip()

# Assign class labels based on timestamp ranges
df_aggregated['Class'] = None

for idx, row in df_events.iterrows():
    timestamp_start = row['Timestamp Start']
    timestamp_end = row['Timestamp End']
    class_label = row['Class']
    
    # Find EMG entries within this timestamp range
    mask = (df_aggregated['EMG TIME'] >= timestamp_start) & (df_aggregated['EMG TIME'] <= timestamp_end)
    df_aggregated.loc[mask, 'Class'] = class_label

# Remove rows with no assigned class
df_aggregated = df_aggregated.dropna(subset=['Class'])

# Reset index
df_aggregated = df_aggregated.reset_index(drop=True)

# Output statistics
print("Aggregated DataFrame Statistics:")
print(f"Total samples: {len(df_aggregated)}")
print(f"Number of classes: {df_aggregated['Class'].nunique()}")
print(f"\nClass distribution:\n{df_aggregated['Class'].value_counts().sort_index()}")
print(f"\nDataFrame shape: {df_aggregated.shape}")
print(f"\nFirst few rows:\n{df_aggregated.head()}")

Aggregated DataFrame Statistics:
Total samples: 182990
Number of classes: 31

Class distribution:
Class
1.0     11657
2.0      7252
3.0      3672
4.0      5950
5.0      7440
6.0      6233
7.0      4722
8.0      9948
9.0      7723
10.0     6013
11.0    10893
12.0     9791
13.0     9277
14.0     1963
15.0     4974
16.0     2634
17.0     2140
18.0     3537
19.0     4219
20.0     9087
21.0     5006
22.0     4386
23.0     5215
24.0     6538
25.0     5068
26.0     2540
27.0     4176
28.0     5593
29.0     4680
30.0     4891
31.0     5772
Name: count, dtype: int64

DataFrame shape: (182990, 18)

First few rows:
   EMG TIME  Arm_Left EMG (mV)  Arm_Right EMG (mV)  Forearm_Left EMG (mV)  \
0  8.192201          -0.000168           -0.002182               0.021149   
1  8.192995          -0.001511           -0.022995               0.020981   
2  8.193789          -0.001511           -0.020981               0.019471   
3  8.194583          -0.001175            0.003861               0.019638   
4  

In [None]:
# Initialize a list to store all dataframes
all_dataframes = []

# Iterate through all subjects (S01 to S24)
for subject_num in range(1, 25):
    subject = f"S{subject_num:02d}"
    subject_dir = emg_root / subject
    
    # Check if subject directory exists
    if not subject_dir.exists():
        print(f"Subject directory not found: {subject_dir}, skipping...")
        continue
    
    # Iterate through all sequences (Sequence_01 to Sequence_10)
    for seq_num in range(1, 11):
        sequence = f"Sequence_{seq_num:02d}"
        emg_file = subject_dir / sequence / "emg.csv"
        events_file = events_root / subject / sequence / "classif.csv"
        
        # Check if both files exist
        if not emg_file.exists() or not events_file.exists():
            print(f"Files not found for {subject}/{sequence}, skipping...")
            continue
        
        try:
            # Parse EMG data
            df_emg_temp = pd.read_csv(emg_file, sep=';')
            df_emg_temp.columns = df_emg_temp.columns.str.strip()
            
            # Parse events data
            df_events_temp = pd.read_csv(events_file, sep=';')
            df_events_temp.columns = df_events_temp.columns.str.strip()
            
            # Create aggregated dataframe
            df_agg_temp = df_emg_temp.copy()
            df_agg_temp['Class'] = None
            df_agg_temp['Subject'] = subject
            df_agg_temp['Sequence'] = sequence
            
            # Assign class labels based on timestamp ranges
            for idx, row in df_events_temp.iterrows():
                timestamp_start = row['Timestamp Start']
                timestamp_end = row['Timestamp End']
                class_label = row['Class']
                
                mask = (df_agg_temp['EMG TIME'] >= timestamp_start) & (df_agg_temp['EMG TIME'] <= timestamp_end)
                df_agg_temp.loc[mask, 'Class'] = class_label
            
            # Remove rows with no assigned class
            df_agg_temp = df_agg_temp.dropna(subset=['Class'])
            
            # Append to list
            all_dataframes.append(df_agg_temp)
            # print(f"Processed {subject}/{sequence}: {len(df_agg_temp)} samples")
            
        except Exception as e:
            print(f"Error processing {subject}/{sequence}: {e}")
            continue

# Concatenate all dataframes
df_massive = pd.concat(all_dataframes, ignore_index=True)

Files not found for S13/Sequence_10, skipping...


In [None]:
# Separate features and labels
X = df_massive.drop(columns=['Class']).values.astype('float64')  # Convert to float64
y = df_massive['Class'].astype(int).values  # Convert to int to ensure numeric type
del df_massive  # Free up memory

# 80/20 train/eval split
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train).float()
y_train_tensor = torch.from_numpy(y_train).float()
X_eval_tensor = torch.from_numpy(X_eval).float()
y_eval_tensor = torch.from_numpy(y_eval).float()

# Create datasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
eval_dataset = TensorDataset(X_eval_tensor, y_eval_tensor)

# Create dataloaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False)

print(f"Training samples: {len(train_loader.dataset)}")
print(f"Evaluation samples: {len(eval_loader.dataset)}")
print(f"Feature dimension: {X_train_tensor.shape[1]}")
print(f"Number of batches (train): {len(train_loader)}")
print(f"Number of batches (eval): {len(eval_loader)}")


NameError: name 'df_massive' is not defined