In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Helpers used when loading the dataset
# The dataset is a directory (e.g. 'series_train.parquet') holding one sub-directory per series, each with a 'part-0.parquet' file
from concurrent.futures import ThreadPoolExecutor
import os
from tqdm import tqdm

In [2]:
import os
import pandas as pd

def load_data(dirname):
    """Read every per-series parquet file found under ``dirname``.

    Each non-hidden entry of ``dirname`` is treated as a sub-directory that
    contains a ``part-0.parquet`` file.

    Parameters
    ----------
    dirname : str
        Directory whose entries are the per-series sub-directories.

    Returns
    -------
    list of tuple
        ``(entry_name, DataFrame)`` pairs, in the order ``os.listdir`` yields
        the entries.
    """
    # Skip hidden files (names starting with a dot).
    visible_entries = [
        entry for entry in os.listdir(dirname) if not entry.startswith('.')
    ]

    return [
        (entry, pd.read_parquet(os.path.join(dirname, entry, 'part-0.parquet')))
        for entry in visible_entries
    ]




In [13]:
def process_file(filename, dirname):
    """Summarise one series file as a flat vector of ``describe()`` statistics.

    Reads ``<dirname>/<filename>/part-0.parquet``, removes the ``'step'``
    column and flattens the ``DataFrame.describe()`` table of what remains.

    Parameters
    ----------
    filename : str
        Name of the series sub-directory; it must contain an ``'='``.
    dirname : str
        Directory that contains ``filename``.

    Returns
    -------
    tuple
        ``(stats, series_id)`` where ``stats`` is the 1-D array of flattened
        describe() values and ``series_id`` is the second ``'='``-separated
        piece of ``filename``.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')

    summary_table = series.describe()
    flat_stats = summary_table.values.reshape(-1)

    series_id = filename.split('=')[1]
    return flat_stats, series_id

def load_time_series(dirname) -> pd.DataFrame:
    """Build a table of flattened describe() statistics, one row per series.

    Every non-hidden entry of ``dirname`` is handed to ``process_file`` on a
    thread pool, with a tqdm progress bar tracking completion.

    Parameters
    ----------
    dirname : str
        Directory whose entries are the per-series sub-directories.

    Returns
    -------
    pandas.DataFrame
        Columns ``stat_0 .. stat_{k-1}`` (k = length of the first series'
        statistics vector) followed by an ``'id'`` column.

    Raises
    ------
    ValueError
        If ``dirname`` has no non-hidden entries, because the empty result
        list cannot be unpacked into statistics and ids.
    """
    # Skip hidden files (names starting with a dot).
    series_dirs = [
        entry for entry in os.listdir(dirname) if not entry.startswith('.')
    ]

    def _summarise(entry):
        return process_file(entry, dirname)

    with ThreadPoolExecutor() as pool:
        progress = tqdm(pool.map(_summarise, series_dirs), total=len(series_dirs))
        per_file = list(progress)

    # Transpose [(stats, id), ...] into (all stats, all ids).
    stats, indexes = zip(*per_file)

    column_names = [f"stat_{i}" for i in range(len(stats[0]))]
    table = pd.DataFrame(stats, columns=column_names)
    table['id'] = indexes
    return table

# Build the flattened describe() statistics table for the training and the test series directories.
train_ts = load_time_series("series_train.parquet")
test_ts = load_time_series("series_test.parquet")


100%|██████████| 996/996 [00:22<00:00, 44.66it/s]
100%|██████████| 2/2 [00:00<00:00, 29.08it/s]


In [18]:
# Save the flattened describe() statistics tables to CSV.
# NOTE(review): 'test_stas.csv' looks like a typo for 'test_stat.csv' — confirm that nothing
# downstream reads this exact name before renaming it.
train_ts.to_csv('train_stat.csv')
test_ts.to_csv('test_stas.csv')


In [20]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

def extract_features(df):
    """Summarise one accelerometer recording as a flat dict of scalar features.

    The input frame is only read: the hour of day used for the peak-activity
    feature is derived as a standalone Series instead of being written back
    into ``df`` as a new ``'hour'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        One series with the columns ``'X'``, ``'Y'``, ``'Z'``, ``'enmo'``,
        ``'anglez'``, ``'light'``, ``'battery_voltage'``, ``'non-wear_flag'``,
        ``'time_of_day'``, ``'weekday'`` and ``'quarter'``.

    Returns
    -------
    dict
        Feature name -> scalar. Keys are inserted in a fixed order (per-axis
        statistics, then enmo, anglez, light, battery, non-wear, time-based
        and quarter features), which becomes the column order when the dicts
        are stacked into a DataFrame.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    features = {}

    # Basic statistics for accelerometer data
    for axis in ['X', 'Y', 'Z']:
        values = df[axis]
        # The extremes feed the max, min and range features, so compute them once.
        axis_max = values.max()
        axis_min = values.min()

        features[f'{axis}_mean'] = values.mean()
        features[f'{axis}_std'] = values.std()
        features[f'{axis}_max'] = axis_max
        features[f'{axis}_min'] = axis_min
        features[f'{axis}_skew'] = skew(values)
        features[f'{axis}_kurtosis'] = kurtosis(values)
        features[f'{axis}_range'] = axis_max - axis_min

    # ENMO statistics
    enmo = df['enmo']
    features['enmo_mean'] = enmo.mean()
    features['enmo_std'] = enmo.std()
    features['enmo_zero_count'] = (enmo == 0).sum()  # Count periods of no motion

    # Angle-Z statistics
    features['anglez_mean'] = df['anglez'].mean()
    features['anglez_std'] = df['anglez'].std()

    # Light and battery statistics
    features['light_mean'] = df['light'].mean()
    features['light_std'] = df['light'].std()
    features['battery_voltage_mean'] = df['battery_voltage'].mean()
    features['battery_voltage_std'] = df['battery_voltage'].std()

    # Non-wear time proportion
    features['non_wear_ratio'] = df['non-wear_flag'].mean()

    # Time-based features.
    # The hour is kept in a local Series so the caller's frame is not modified.
    # NOTE(review): assumes pd.to_datetime parses 'time_of_day' into the intended
    # clock time (the notebook output shows large integer values) — confirm the unit.
    hour = pd.to_datetime(df['time_of_day']).dt.hour
    features['active_hours'] = enmo.groupby(hour).mean().idxmax()  # Peak activity hour
    features['weekday_mean'] = df['weekday'].mean()

    # Quarter activity stats: mean and spread of the per-quarter mean enmo.
    # The spread is NaN when the series covers a single quarter (sample std of one value).
    quarter_means = df.groupby('quarter')['enmo'].mean()
    features['quarter_mean_enmo'] = quarter_means.mean()
    features['quarter_std_enmo'] = quarter_means.std()

    return features

def process_all_series(dirname):
    """Extract summary features for every series stored under ``dirname``.

    Parameters
    ----------
    dirname : str
        Directory whose non-hidden entries each contain a ``part-0.parquet``
        file; each entry name must contain an ``'='``.

    Returns
    -------
    pandas.DataFrame
        One row per series: the features returned by ``extract_features``
        plus an ``'id'`` column holding the second ``'='``-separated piece of
        the entry name.
    """
    rows = []

    for entry in os.listdir(dirname):
        if entry.startswith('.'):
            continue  # Exclude hidden files

        series = pd.read_parquet(os.path.join(dirname, entry, 'part-0.parquet'))

        # The step counter is not used as a feature input.
        series.drop(columns=['step'], inplace=True)

        row = extract_features(series)
        row['id'] = entry.split('=')[1]  # Extract ID from the entry name
        rows.append(row)

    # One dict per series -> one DataFrame row per series.
    return pd.DataFrame(rows)

# Build the per-series summary feature tables for both the training and the test series directories.
train_features = process_all_series("series_train.parquet")
test_features = process_all_series("series_test.parquet")


In [21]:
# Display the training feature table (one row per series; the rendered output is truncated for long frames).
train_features

Unnamed: 0,X_mean,X_std,X_max,X_min,X_skew,X_kurtosis,X_range,Y_mean,Y_std,Y_max,...,light_mean,light_std,battery_voltage_mean,battery_voltage_std,non_wear_ratio,active_hours,weekday_mean,quarter_mean_enmo,quarter_std_enmo,id
0,-0.478973,0.429476,1.159667,-3.298790,1.160968,1.103128,4.458457,-0.037643,0.518888,2.525316,...,41.468441,180.046051,3876.515869,121.027321,0.000000,13,4.015780,0.061542,,0d01bbf2
1,0.047866,0.523529,1.859814,-1.777734,-0.123026,-0.819444,3.637547,0.003234,0.441043,1.518311,...,68.818016,278.520935,3841.463379,165.178589,0.216525,14,3.809581,0.023325,0.004031,cefdb7fe
2,-0.088861,0.300414,1.017271,-2.163437,-0.223284,1.579890,3.180708,0.045154,0.371415,1.381445,...,9.674905,47.171303,3838.082031,145.005997,0.611183,20,4.096849,0.004798,,58391429
3,-0.080044,0.601258,1.148359,-1.962057,0.244349,-1.261574,3.110417,0.058017,0.595227,3.186745,...,132.968567,434.872345,3874.709473,133.966049,0.000000,16,3.851570,0.045412,,2ca2206f
4,-0.067303,0.282437,2.427422,-3.150714,0.473134,3.860535,5.578136,0.187596,0.528473,2.343212,...,10.622702,39.840015,3829.777344,147.438263,0.657652,12,4.015813,0.018893,,19455336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,0.034996,0.278136,1.962949,-2.375542,1.225656,5.813970,4.338491,0.382648,0.594951,1.764815,...,8.340189,46.997589,3854.255615,167.211823,0.763487,17,4.146753,0.008409,,43a7386d
992,-0.227848,0.505671,1.163996,-2.023180,0.466306,-0.707714,3.187176,-0.117374,0.523569,2.271134,...,23.489414,47.289898,4024.496338,94.727821,0.000000,18,4.140736,0.070964,,2840643b
993,0.470508,0.451261,2.261910,-0.989845,-1.241211,0.931505,3.251755,0.069204,0.434032,1.023213,...,35.851643,154.012482,3852.268066,163.722565,0.003582,13,3.916291,0.044411,,1b329556
994,0.001327,0.509230,2.621233,-2.147763,0.092827,-0.496259,4.768996,-0.233908,0.547428,2.709097,...,28.920029,151.741760,3845.557129,164.792923,0.307449,16,3.917826,0.033604,0.020764,62b873a2


In [22]:
# Save the per-series summary feature tables to CSV.
train_features.to_csv('train_features.csv')
test_features.to_csv('test_features.csv')

In [23]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

def extract_time_series_features(df, window_size=12, activity_threshold=0.1):
    """Extract rolling, trend, frequency and activity features from one series.

    The input frame is only read: the hour of day used for the peak-activity
    features is derived as a standalone Series instead of being written back
    into ``df`` as a new ``'hour'`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        One series with the columns ``'X'``, ``'Y'``, ``'Z'``, ``'enmo'`` and
        ``'time_of_day'``.
    window_size : int, default 12
        Rolling-window length in samples. The default corresponds to one
        minute if the data is sampled every 5 seconds (an assumption carried
        over from the original code, not checked here).
    activity_threshold : float, default 0.1
        A sample counts as active when its ``enmo`` is strictly greater than
        this value.

    Returns
    -------
    dict
        Feature name -> scalar, inserted in a fixed order (rolling features,
        trend features, FFT features, ``active_periods``, peak-activity
        features).

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    features = {}

    # Rolling window features: mean and spread of the windowed mean / std.
    for axis in ['X', 'Y', 'Z']:
        windowed = df[axis].rolling(window=window_size)
        rolling_mean = windowed.mean()
        rolling_std = windowed.std()

        features[f'{axis}_rolling_mean_mean'] = rolling_mean.mean()
        features[f'{axis}_rolling_mean_std'] = rolling_mean.std()
        features[f'{axis}_rolling_std_mean'] = rolling_std.mean()
        features[f'{axis}_rolling_std_std'] = rolling_std.std()

    # Trend analysis (simple linear regression against the sample index).
    time_steps = np.arange(len(df))
    for axis in ['X', 'Y', 'Z', 'enmo']:
        # Missing values are replaced by 0 before fitting.
        slope, intercept = np.polyfit(time_steps, df[axis].fillna(0), 1)
        features[f'{axis}_trend_slope'] = slope
        features[f'{axis}_trend_intercept'] = intercept

    # Frequency domain features (Fourier Transform)
    for axis in ['X', 'Y', 'Z']:
        spectrum = np.fft.fft(df[axis].fillna(0))
        frequencies = np.fft.fftfreq(len(spectrum))

        # Keep only the non-negative frequencies, then drop the first bin (DC component).
        amplitude = np.abs(spectrum[frequencies >= 0])[1:]

        features[f'{axis}_fft_max_amplitude'] = amplitude.max()
        features[f'{axis}_fft_mean_amplitude'] = amplitude.mean()

    # Activity count (number of samples whose enmo exceeds the threshold)
    features['active_periods'] = (df['enmo'] > activity_threshold).sum()

    # Time-based aggregation.
    # The hour is kept in a local Series so the caller's frame is not modified.
    # NOTE(review): assumes pd.to_datetime parses 'time_of_day' into the intended
    # clock time (the notebook output shows large integer values) — confirm the unit.
    hour = pd.to_datetime(df['time_of_day']).dt.hour
    hourly_activity = df['enmo'].groupby(hour).mean()
    features['peak_activity_hour'] = hourly_activity.idxmax()
    features['peak_activity_enmo'] = hourly_activity.max()

    return features

def process_all_time_series(dirname):
    """Extract time-series features for every series stored under ``dirname``.

    Parameters
    ----------
    dirname : str
        Directory whose non-hidden entries each contain a ``part-0.parquet``
        file; each entry name must contain an ``'='``.

    Returns
    -------
    pandas.DataFrame
        One row per series: the features returned by
        ``extract_time_series_features`` plus an ``'id'`` column holding the
        second ``'='``-separated piece of the entry name.
    """
    rows = []

    for entry in os.listdir(dirname):
        if entry.startswith('.'):
            continue  # Exclude hidden files

        series = pd.read_parquet(os.path.join(dirname, entry, 'part-0.parquet'))

        # The step counter is not used as a feature input.
        series.drop(columns=['step'], inplace=True)

        row = extract_time_series_features(series)
        row['id'] = entry.split('=')[1]  # Extract ID from the entry name
        rows.append(row)

    # One dict per series -> one DataFrame row per series.
    return pd.DataFrame(rows)

# Build the time-series feature tables for both the training and the test series directories.
train_time_series_features = process_all_time_series("series_train.parquet")
test_time_series_features = process_all_time_series("series_test.parquet")


In [24]:
# Save the time-series feature tables to CSV.
train_time_series_features.to_csv('train_time_series_features.csv')
test_time_series_features.to_csv('test_time_series_features.csv')

In [None]:
# Load the training and test data as lists of (entry name, DataFrame) pairs.
# load_data keeps every series' full DataFrame in the returned list.
train_data = load_data("series_train.parquet")
test_data = load_data("series_test.parquet")


[('id=0d01bbf2',
            step         X         Y         Z      enmo     anglez  \
  0            0 -0.468869  0.412020 -0.236458  0.042506 -19.824650   
  1            1 -0.662526  0.533484  0.064034  0.052847   4.300246   
  2            2 -0.611384  0.227252 -0.150882  0.060734 -16.545208   
  3            3 -0.385799  0.552782 -0.500523  0.070440 -36.452175   
  4            4  0.016133  0.031981 -0.825109  0.081058 -67.488388   
  ...        ...       ...       ...       ...       ...        ...   
  269330  269330  0.004777  0.007798 -0.982809  0.002033 -89.554489   
  269331  269331  0.005717  0.005330 -0.981560  0.000000 -89.554489   
  269332  269332  0.005926  0.005908 -0.981014  0.000000 -89.554489   
  269333  269333 -0.767821  0.150101 -0.114455  0.061382 -10.840022   
  269334  269334 -0.636615 -0.005251 -0.005436  0.081237  25.589085   
  
          non-wear_flag      light  battery_voltage     time_of_day  weekday  \
  0                 0.0  27.666666      4179.000

In [10]:
# Inspect the fourth (entry name, DataFrame) pair of the loaded training data.
train_data[3]

('id=2ca2206f',
           step         X         Y         Z      enmo     anglez  \
 0            0  0.076068  0.493229  0.865182  0.038120  57.853584   
 1            1  0.362995  0.502630  0.746667  0.104827  48.439461   
 2            2 -0.270521  0.789609  0.880729  0.540166  40.238838   
 3            3 -0.180234  0.765130  0.705573  0.381530  38.357533   
 4            4 -0.051667  0.187370 -0.892109  0.014824 -76.692711   
 ...        ...       ...       ...       ...       ...        ...   
 311954  311954 -0.044097  0.033030  0.102604  0.074686   2.631743   
 311955  311955 -0.197370  0.418125 -0.236458  0.050760 -26.208960   
 311956  311956 -0.436589  0.719479  0.275573  0.067066  18.968863   
 311957  311957 -0.543333  0.165573  0.240443  0.077716  16.537407   
 311958  311958 -0.096615  0.322526 -0.875260  0.102087 -45.708782   
 
         non-wear_flag      light  battery_voltage     time_of_day  weekday  \
 0                 0.0  43.333332           4175.0  58260000000

Total missing values: 0
