In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import polars as pl
from tsfresh import extract_features
from tsfresh import select_features

from tqdm import tqdm

In [2]:
df = pl.read_parquet('data/train_dataset_feature.parquet').to_pandas()

In [3]:
df.shape

(17148960, 71)

In [4]:
X = df.drop(['series_id', 'timestamp','onset','wakeup','state'], axis=1)
y = df['state']
onset = df['onset']
wakeup = df['wakeup']

In [5]:
features_filtered = select_features(X, y)


In [6]:
features_filtered.shape

(17148960, 48)

In [10]:
features_filtered


Unnamed: 0,step,enmo_1v_30m_std,anglez_1v_30m_mean,anglez_1v_30m_max,anglez_1v_30m_std,enmo_1v_120m_max,enmo_1v_120m_std,anglez_1v_120m_max,enmo_1v_480m_max,anglez_1v_480m_max,...,anglez_cos,enmo_1v_5m_std,enmo_1v_5m_max,enmo_1v_120m_mean,anglez_5m_mean,year,anglez_5m_max,anglez_30m_mean,day,minute_cos
0,0,1,0,2,0,6,0,3,6,4,...,0.997529,0,0,0,0,2018,0,0,18,-1.836970e-16
1,1,1,0,2,0,6,0,3,6,4,...,0.977599,0,0,0,0,2018,0,0,18,-1.836970e-16
2,2,1,0,2,0,6,0,3,6,4,...,0.978810,0,0,0,0,2018,0,0,18,-1.836970e-16
3,3,1,0,3,0,6,0,3,6,4,...,0.976830,0,0,0,0,2018,0,0,18,-1.836970e-16
4,4,1,0,3,0,6,0,3,6,4,...,0.933248,0,0,0,0,2018,0,0,18,-1.836970e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17148955,17148955,0,0,2,0,0,0,2,2,3,...,0.848097,0,0,0,0,2018,0,0,23,9.945219e-01
17148956,17148956,0,0,2,0,0,0,2,2,3,...,0.848862,0,0,0,0,2018,0,0,23,9.945219e-01
17148957,17148957,0,0,2,0,0,0,2,2,3,...,0.821605,0,0,0,0,2018,0,0,23,9.945219e-01
17148958,17148958,0,0,2,0,0,0,2,2,3,...,0.819939,0,0,0,0,2018,0,0,23,9.945219e-01


In [7]:
X = pl.from_pandas(features_filtered)
y = pl.from_pandas(y)
onset = pl.from_pandas(onset)
wakeup = pl.from_pandas(wakeup)

In [15]:
X.columns

['step',
 'enmo_1v_30m_std',
 'anglez_1v_30m_mean',
 'anglez_1v_30m_max',
 'anglez_1v_30m_std',
 'enmo_1v_120m_max',
 'enmo_1v_120m_std',
 'anglez_1v_120m_max',
 'enmo_1v_480m_max',
 'anglez_1v_480m_max',
 'anglez_1v_480m_mean',
 'enmo_5m_max',
 'anglez_30m_max',
 'enmo_120m_max',
 'anglez_120m_max',
 'anglez_1v_120m_mean',
 'anglez_1v_120m_std',
 'enmo_480m_max',
 'enmo_1v_480m_mean',
 'enmo_1v_480m_std',
 'anglez_480m_max',
 'enmo_30m_max',
 'enmo_1v_30m_mean',
 'enmo_1v_30m_max',
 'anglez_1v_5m_max',
 'anglez',
 'enmo',
 'literal',
 'hour',
 'hour_sin',
 'anglez_1v_5m_std',
 'hour_cos',
 'month_sin',
 'month_cos',
 'anglez_1v_480m_std',
 'enmo_1v_5m_mean',
 'anglez_sin',
 'anglez_1v_5m_mean',
 'anglez_cos',
 'enmo_1v_5m_std',
 'enmo_1v_5m_max',
 'enmo_1v_120m_mean',
 'anglez_5m_mean',
 'year',
 'anglez_5m_max',
 'anglez_30m_mean',
 'day',
 'minute_cos']

In [14]:
#Concatenate the features and the labels
df = pl.concat([X, y])
df = pl.concat([df, onset])
df = pl.concat([df, wakeup])


AttributeError: 'Series' object has no attribute '_df'

In [12]:
export_df = pl.y_st([X, y, onset, wakeup])

AttributeError: module 'polars' has no attribute 'y_st'

In [None]:
export_df.to_parquet('data/train_dataset_feature_selected.parquet')

In [7]:
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform, randint




In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

In [9]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)


In [10]:

# Define parameters
params = {
    'objective': 'binary',  # or 'multiclass' if you have more than two classes
    'metric': 'binary_logloss',  # or 'multi_logloss' for multiclass
    'num_leaves': 20,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
    'num_threads': 23,  # Adjust based on your machine's capability
    "histogram_pool_size": 1024,  # Adjust based on your machine's capability
    "max_bin": 255,  # Adjust based on your machine's capability
}


In [11]:
try:
    # Train the model
    lgbm_model = lgb.train(params, train_data, valid_sets=[test_data], num_boost_round=1000)
except Exception as e:
    print(e)
    print('Error in training. Check your parameters.')



: 

In [None]:

# Predict on the test set
y_pred = lgbm_model.predict(X_test, num_iteration=lgbm_model.best_iteration)

# Convert probabilities to binary output
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_binary)
print(f'Accuracy: {accuracy}')
