In [None]:
import pandas as pd
import numpy as np
import torch
import joblib
import matplotlib.pyplot as plt
import pickle
import os

In [None]:
# !pip install scikit-learn==1.2.2

In [None]:
# NOTE(review): train_test_split is imported here but is not called in any of
# the cells visible in this notebook.
from sklearn.model_selection import train_test_split
import sklearn
# Print the installed scikit-learn version (presumably to confirm it matches the
# 1.2.2 pin mentioned in the install cell — TODO confirm that is the intent).
print(sklearn.__version__)

1.2.2


In [None]:
# Notebook-wide settings
model = "RCL"                    # name of the model being trained; also used to build paths below
folder = model + "/"             # directory that is scanned for this model's CSV files
output_folder = "output_files/"  # NOTE(review): not referenced by the cells visible here

# Function to read and filter CSV files from a folder based on a timestamp threshold
def read_and_filter_csv(folder_path, timestamp_threshold):
    """Load every ``*.csv`` file in a folder and keep rows at/after a timestamp.

    Parameters
    ----------
    folder_path : str or path-like
        Directory scanned (non-recursively) for files whose name ends in
        ``.csv``. Other entries are ignored.
    timestamp_threshold : scalar
        Rows whose ``Timestamp`` value is below this threshold are dropped.

    Returns
    -------
    list of pandas.DataFrame
        One filtered frame per CSV file, in sorted filename order. Each frame
        carries an extra ``Source`` column holding the file it came from.
        The list is empty when the folder contains no CSV files.

    Raises
    ------
    KeyError
        If a CSV file has no ``Timestamp`` column.
    """
    dataframes = []

    # os.listdir() yields entries in arbitrary, platform-dependent order.
    # Sorting makes the row order of the combined data (and everything derived
    # from it downstream) reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)

        # Take an explicit copy of the filtered rows so that adding a column
        # below writes to an independent frame (avoids SettingWithCopyWarning).
        df = df[df['Timestamp'] >= timestamp_threshold].copy()
        df['Source'] = filename
        dataframes.append(df)

    return dataframes

# Load every CSV found in `folder`, keeping only rows with Timestamp >= 7
# (change the second argument to use a different cut-off).
data = read_and_filter_csv(folder_path=folder, timestamp_threshold=7)

# Stack the per-file frames into one frame with a fresh 0..n-1 index
train = pd.concat(data, ignore_index=True)

In [None]:
# Target column.
# NOTE: despite the `_numpy` suffix, both objects below are pandas objects.
targets_numpy = train["Label"]

# Feature columns: drop the first and last column positionally, then keep only
# the two signals used for training.
features_numpy = train.iloc[:, 1:-1][["Data1", "Data2"]]


In [None]:
def create_Windowed_Dataset(Time, Dataset, Labelset, windowsize, stepsize,
                            label_lookback=8, model_name=None):
    """Cut a row-ordered dataset into flattened sliding windows with one label each.

    For every ``i`` in ``range(windowsize, len(Dataset), stepsize)`` the window
    is rows ``[i - windowsize, i)`` of ``Dataset``, flattened row-major into a
    1-D vector. The label and the "to" timestamp are both read at row ``i``
    (the first row after the window); the "from" timestamp is read at row
    ``i - windowsize``. ``Dataset`` is treated as one continuous sequence.

    Labelling rule: if the label at row ``i`` equals the label at row
    ``i - label_lookback`` that label is used; otherwise a fallback label is
    used (``0`` when the model name is ``"blink"``, ``1`` for any other name).

    Windows containing NaN or +/-Inf are dropped, together with their label
    and timestamps, so all four returned sequences stay aligned.

    Parameters
    ----------
    Time : pandas.Series
        Per-row timestamps, positionally aligned with ``Dataset``.
    Dataset : pandas.DataFrame
        Numeric feature columns, one row per sample.
    Labelset : pandas.Series
        Per-row labels, positionally aligned with ``Dataset``.
    windowsize : int
        Number of rows per window.
    stepsize : int
        Number of rows to advance between consecutive windows.
    label_lookback : int, optional
        Distance (in rows) between the two label positions that are compared.
        Defaults to 8, the value previously hard-coded.
    model_name : str, optional
        Selects the labelling rule. When omitted, the notebook-level global
        ``model`` is used, as before.

    Returns
    -------
    New_Dataset : numpy.ndarray
        One flattened window per row.
    New_Labelset : numpy.ndarray
        One label per kept window.
    from_timestamps, to_timestamps : list
        Timestamps for each kept window, same length and order as the arrays.

    Raises
    ------
    ValueError
        If ``label_lookback`` is negative or larger than ``windowsize``; the
        look-back index would otherwise run past the start of ``Labelset``
        and silently wrap around to its end.
    """
    if not 0 <= label_lookback <= windowsize:
        raise ValueError(
            "label_lookback must satisfy 0 <= label_lookback <= windowsize "
            f"(got label_lookback={label_lookback}, windowsize={windowsize})"
        )

    if model_name is None:
        # Backward-compatible default: read the notebook-level setting.
        model_name = model
    is_blink = model_name == "blink"
    fallback_label = 0 if is_blink else 1

    # Convert once instead of once per window; slicing the array is equivalent
    # to slicing the frame and converting each slice.
    values = Dataset.to_numpy()
    end = values.shape[0]

    New_Dataset = []
    New_Labelset = []
    from_timestamps = []
    to_timestamps = []

    for i in range(windowsize, end, stepsize):
        row = values[i - windowsize:i].flatten()

        # Reject the window BEFORE recording anything about it, so that the
        # timestamp lists cannot get out of step with the data and labels.
        if not np.isfinite(row).all():
            continue

        current = Labelset.iloc[i]
        previous = Labelset.iloc[i - label_lookback]
        if current == previous:
            label = previous if is_blink else current
        else:
            label = fallback_label

        New_Dataset.append(row)
        New_Labelset.append(label)
        from_timestamps.append(Time.iloc[i - windowsize])
        to_timestamps.append(Time.iloc[i])

    New_Dataset = np.array(New_Dataset)
    New_Labelset = np.array(New_Labelset)

    return New_Dataset, New_Labelset, from_timestamps, to_timestamps

# Build 50-row windows, advancing 2 rows between consecutive windows
(New_dataset,
 New_Labelset,
 from_timestamps,
 to_timestamps) = create_Windowed_Dataset(
    Time=train['Timestamp'],
    Dataset=features_numpy,
    Labelset=targets_numpy,
    windowsize=50,
    stepsize=2,
)

In [None]:
# Wrap the windowed feature matrix in a DataFrame (one row per window)
New_dataset = pd.DataFrame(New_dataset)

# Wrap the window labels in a single-column DataFrame named 'Label'
New_Labelset = pd.DataFrame(New_Labelset, columns=['Label'])

# Display the class distribution; the output recorded for this cell shows these
# counts, so the expression is included to make the cell reproduce it on re-run.
New_Labelset.value_counts()

Label
1        73160
0         3758
2         3589
Name: count, dtype: int64

In [None]:
# !pip install imbalanced-learn

In [None]:
# Resampling helpers from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# NOTE: despite the `rus` name this is a random OVER-sampler; the fixed seed
# makes the resampling repeatable.
rus = RandomOverSampler(random_state=42)

# Resample the windows and their labels together
x_rus, y_rus = rus.fit_resample(X=New_dataset, y=New_Labelset)

# Class counts before and after resampling
print('Original dataset shape:\n', New_Labelset.value_counts())
print('Resampled dataset shape:\n', y_rus.value_counts())


original dataset shape:
 Label
1        73160
0         3758
2         3589
Name: count, dtype: int64
Resample dataset shape
 Label
0        73160
1        73160
2        73160
Name: count, dtype: int64


In [None]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree forest using all cores; oob_score=True makes the out-of-bag accuracy
# available after fitting. The seed fixes the bootstrap/feature sampling.
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1,n_estimators=100, oob_score=True)
# Alternative hyper-parameter set, kept for reference:
# classifier_rf = RandomForestClassifier(bootstrap=True,max_depth=None,max_features='log2',min_samples_leaf=1,min_samples_split=2,random_state=42, n_jobs=-1,n_estimators=246, oob_score=True)

# y_rus is a single-column table; scikit-learn expects a 1-D target and emits a
# DataConversionWarning when given a column vector, so flatten it explicitly.
classifier_rf.fit(x_rus, y_rus.to_numpy().ravel())

# NOTE(review): x_rus/y_rus come from random over-sampling, which duplicates
# rows. A duplicate of an in-bag row can be out-of-bag for the same tree, so
# this OOB figure is likely optimistic — confirm on data held out BEFORE
# resampling.
print(f'OOB Accuracy: {classifier_rf.oob_score_}')

  classifier_rf.fit(x_rus, y_rus)


OOB Accuracy: 0.9959221796974668


In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint

# param_dist = {
#     'n_estimators': randint(100, 300),
#     'max_features': ['auto', 'log2'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': randint(2, 10),
#     'min_samples_leaf': randint(1, 4),
#     'bootstrap': [True, False]
# }

# random_search = RandomizedSearchCV(estimator=classifier_rf, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, verbose=2, random_state=42)
# random_search.fit(x_rus, y_rus)

# best_params = random_search.best_params_
# print(best_params)


In [None]:
# Persist the fitted classifier as "<model>.joblib", relative to the current
# working directory.
# NOTE(review): `output_folder` is defined earlier but is not used here.
filename = model + '.joblib'
joblib.dump(classifier_rf, filename)

['RCL_30.joblib']