In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt


# Signal denoising: read the CSV, drop the 'outlier_label' column and index
# the rows by the parsed 'epoch (ms)' timestamps.

df = (
    pd.read_csv('../../data/partially processed/resampled_data_no_outliers.csv')
    .drop(columns=['outlier_label'])
    .assign(**{'epoch (ms)': lambda frame: pd.to_datetime(frame['epoch (ms)'])})
    .set_index('epoch (ms)')
)

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Low-pass filter ``data`` with a digital Butterworth filter.

    The filter is designed with ``scipy.signal.butter`` and applied with
    ``scipy.signal.filtfilt`` (forward and backward pass).

    Args:
        data: Samples to filter, in any form ``filtfilt`` accepts.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Order of the Butterworth design (default 5).

    Returns:
        The filtered samples as returned by ``filtfilt``.
    """
    # butter() takes the cutoff as a fraction of the Nyquist frequency (fs / 2).
    nyquist = 0.5 * fs
    coefficients = butter(order, cutoff / nyquist, btype='low', analog=False)
    return filtfilt(*coefficients, data)



# Calculate each set's duration in seconds (last index entry minus first) and
# store it on every row of that set.
for set_id in df["set"].unique():
    in_set = (df["set"] == set_id).to_numpy()  # build the row mask once per set
    set_times = df.index[in_set]
    # total_seconds() keeps fractional seconds and whole days; Timedelta.seconds
    # is only the seconds *component* (an int, with days excluded).
    df.loc[in_set, "duration"] = (set_times[-1] - set_times[0]).total_seconds()

# Mean of the "duration" column over the rows of each category.
duration_data = df.groupby("category")["duration"].mean()
    




In [None]:
# Display the per-category mean of the "duration" column (duration_data).
duration_data

In [2]:
df_filtered = df.copy()
columns_to_check = ["acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z"]

# Divisor applied to the mean set duration of each category (presumably the
# number of repetitions per set — TODO confirm).
periods_per_set = {"heavy": 5, "medium": 10}

# NOTE(review): each category is filtered as one concatenated signal, i.e. the
# filter runs across set boundaries — confirm this is intended.
for cat, duration in duration_data.items():
    in_category = (df_filtered["category"] == cat).to_numpy()
    subset = df_filtered[in_category]
    if len(subset) == 0:
        continue

    # Categories other than "heavy"/"medium" previously left `fs` unassigned
    # (NameError on the first iteration) or silently reused the value of the
    # previous category. They now fall back to a divisor of 1. Since the cutoff
    # is fs / 5 and butter_lowpass_filter divides it by fs / 2, the normalized
    # cutoff is ~0.4 for any fs, so the fallback does not alter the filtering.
    fs = len(subset) / (duration / periods_per_set.get(cat, 1))

    cutoff = fs / 5.0
    for col in columns_to_check:
        filtered = butter_lowpass_filter(subset[col].values, cutoff, fs)
        # Write back through the boolean mask instead of subset.index so that
        # repeated timestamps in the index cannot select extra rows.
        df_filtered.loc[in_category, col] = filtered


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Overlay the original and the filtered acc_y trace of a single set.
set_to_plot = 20
osdf = df.loc[df["set"] == set_to_plot]
fsdf = df_filtered.loc[df_filtered["set"] == set_to_plot]

fig, ax = plt.subplots(figsize=(20, 10))
for frame, trace_label in ((osdf, "original"), (fsdf, "filtered")):
    ax.plot(frame["acc_y"], label=trace_label)
ax.legend()
plt.show()




In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Features and target for the unfiltered and the filtered frame
X, y = df[columns_to_check], df['label']
X_filtered, y_filtered = df_filtered[columns_to_check], df_filtered['label']

# Hold out 30% of the rows; both splits use the same seed
split_options = dict(test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train_filtered, X_test_filtered, y_train_filtered, y_test_filtered = train_test_split(
    X_filtered, y_filtered, **split_options
)

# One random forest per dataset, identical hyper-parameters
forest_options = dict(n_estimators=100, random_state=42)
clf = RandomForestClassifier(**forest_options).fit(X_train, y_train)
clf_filtered = RandomForestClassifier(**forest_options).fit(X_train_filtered, y_train_filtered)

# Predict on the held-out rows
y_pred = clf.predict(X_test)
y_pred_filtered = clf_filtered.predict(X_test_filtered)

# Print one classification report per dataset
for heading, truth, predicted in (
    ("Classification report for original data:", y_test, y_pred),
    ("Classification report for filtered data:", y_test_filtered, y_pred_filtered),
):
    print(heading)
    print(classification_report(truth, predicted))

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt


# Signal denoising: read the CSV, drop the 'outlier_label' column and index
# the rows by the parsed 'epoch (ms)' timestamps.

df = (
    pd.read_csv('../../data/partially processed/resampled_data_no_outliers.csv')
    .drop(columns=['outlier_label'])
    .assign(**{'epoch (ms)': lambda frame: pd.to_datetime(frame['epoch (ms)'])})
    .set_index('epoch (ms)')
)

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """Apply a digital Butterworth low-pass filter to ``data``.

    Args:
        data: Samples to filter, in any form ``scipy.signal.filtfilt`` accepts.
        cutoff: Cutoff frequency, in the same units as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Filter order passed to ``scipy.signal.butter`` (default 5).

    Returns:
        The result of running ``filtfilt`` with the designed coefficients.
    """
    # Express the cutoff relative to the Nyquist frequency, as butter() expects.
    relative_cutoff = cutoff / (0.5 * fs)
    numerator, denominator = butter(order, relative_cutoff, btype='low', analog=False)
    smoothed = filtfilt(numerator, denominator, data)
    return smoothed



def calculate_set_duration(df):
    for i in df["set"].unique():
        duration = df[df["set"] == i].index[-1] - df[df["set"] == i].index[0]
        df.loc[df["set"] == i, "duration"] = duration.seconds
    
    duration_data = df.groupby("category")["duration"].mean()
    return duration_data

def apply_low_pass_filter(df, duration_data):
    """Return a copy of ``df`` with the sensor columns low-pass filtered.

    For every category in ``duration_data`` the rows of that category are taken
    as one signal per column and passed through ``butter_lowpass_filter``. The
    input frame is not modified.

    Args:
        df: DataFrame with a ``"category"`` column and the columns ``acc_x``,
            ``acc_y``, ``acc_z``, ``gyro_x``, ``gyro_y`` and ``gyro_z``.
        duration_data: Mapping/Series of category -> duration, iterated with
            ``.items()`` (e.g. the result of ``calculate_set_duration``).

    Returns:
        The filtered copy of ``df``.
    """
    df_filtered = df.copy()
    columns_to_check = ["acc_x", "acc_y", "acc_z", "gyro_x", "gyro_y", "gyro_z"]

    # Divisor applied to the duration of each category (presumably the number
    # of repetitions per set — TODO confirm).
    periods_per_set = {"heavy": 5, "medium": 10}

    # NOTE(review): each category is filtered as one concatenated signal, i.e.
    # the filter runs across set boundaries — confirm this is intended.
    for cat, duration in duration_data.items():
        in_category = (df_filtered["category"] == cat).to_numpy()
        subset = df_filtered[in_category]
        if len(subset) == 0:
            continue

        # Categories other than "heavy"/"medium" previously left `fs` unassigned
        # (UnboundLocalError on the first iteration) or silently reused the
        # value of the previous category. They now fall back to a divisor of 1.
        # Since the cutoff is fs / 5 and butter_lowpass_filter divides it by
        # fs / 2, the normalized cutoff is ~0.4 for any fs, so the fallback
        # does not alter the filtering itself.
        fs = len(subset) / (duration / periods_per_set.get(cat, 1))

        cutoff = fs / 5.0
        for col in columns_to_check:
            filtered = butter_lowpass_filter(subset[col].values, cutoff, fs)
            # Write back through the boolean mask instead of subset.index so
            # that repeated index labels cannot select extra rows.
            df_filtered.loc[in_category, col] = filtered

    return df_filtered

# Example usage: compute the per-category mean set duration (this also adds a
# "duration" column to df), then build a low-pass-filtered copy of the frame.
duration_data = calculate_set_duration(df)
df_filtered = apply_low_pass_filter(df, duration_data)

    




In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# NOTE(review): the input and output locations below are absolute,
# machine-specific paths — consider deriving them from a shared data directory.
df = pd.read_csv("/Users/stanislaw/Docs/inzynierka/inzynierka/data/partially processed/filtered_data_sample.csv")

# Columns that are fed into the PCA
numeric_cols = ["acc_x","acc_y","acc_z","gyro_x","gyro_y","gyro_z"]
X = df[numeric_cols].values

# Standardize every column before the decomposition
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keep the first three principal components
n_components = 3
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

# Optional diagnostics, currently disabled:
# print("Explained Variance Ratio:", pca.explained_variance_ratio_)
# plt.bar(range(1, len(pca.explained_variance_ratio_)+1), pca.explained_variance_ratio_)
# plt.xlabel('Principal Component')
# plt.ylabel('Explained Variance Ratio')
# plt.title('PCA Scree Plot')
# plt.show()

# Wrap the components in a DataFrame (PC1..PC3), preview it and save it as CSV
pc_names = [f"PC{k}" for k in range(1, n_components + 1)]
df_pca = pd.DataFrame(X_pca, columns=pc_names)
print(df_pca.head())
df_pca.to_csv("/Users/stanislaw/Docs/inzynierka/inzynierka/data/partially processed/pca_transformed_data.csv", index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# NOTE(review): absolute, machine-specific input path.
df = pd.read_csv("/Users/stanislaw/Docs/inzynierka/inzynierka/data/partially processed/filtered_data_sample.csv")

# 1. Split the columns into PCA inputs and everything else
numeric_cols = ["acc_x","acc_y","acc_z","gyro_x","gyro_y","gyro_z"]
non_numeric_cols = [name for name in df.columns if name not in numeric_cols]

# 2. Standardize the inputs, then run a PCA that keeps all components
X = df[numeric_cols].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# 3. Put the remaining columns and the components (PC1..PCn) side by side
pc_frame = pd.DataFrame(
    X_pca,
    columns=[f"PC{k}" for k in range(1, len(numeric_cols) + 1)],
)
df_pca = pd.concat([df[non_numeric_cols].reset_index(drop=True), pc_frame], axis=1)

# Column to predict (assumes 'label' is the intended target)
target_col = "label"

def compare_random_forest(original_df, pca_df, numeric_cols, target_col):
    """Print the hold-out accuracy of a random forest on two feature sets.

    One forest is trained on ``numeric_cols`` of ``original_df``, the other on
    every column of ``pca_df`` whose name starts with ``"PC"``. Each uses an
    80/20 split and 50 trees, both seeded with 42.

    Args:
        original_df: Frame holding ``numeric_cols`` and ``target_col``.
        pca_df: Frame holding the ``PC*`` columns and ``target_col``.
        numeric_cols: Feature column names to use from ``original_df``.
        target_col: Name of the column to predict.

    Returns:
        None. The two accuracies are printed.
    """
    def _holdout_accuracy(features, target):
        # Split, fit a forest on the training part, score it on the test part.
        f_train, f_test, t_train, t_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )
        forest = RandomForestClassifier(n_estimators=50, random_state=42)
        forest.fit(f_train, t_train)
        return accuracy_score(t_test, forest.predict(f_test))

    # Features/target of the original dataset
    features_orig = original_df[numeric_cols]
    target_orig = original_df[target_col]

    # Features/target of the PCA dataset (only the PC* columns are features)
    pca_columns = [name for name in pca_df.columns if name.startswith("PC")]
    features_pca = pca_df[pca_columns]
    target_pca = pca_df[target_col]

    orig_accuracy = _holdout_accuracy(features_orig, target_orig)
    pca_accuracy = _holdout_accuracy(features_pca, target_pca)

    print("Original data accuracy:", orig_accuracy)
    print("PCA data accuracy:", pca_accuracy)

# 4. Compare both datasets: prints the test-split accuracy of a random forest
# trained on the original columns and of one trained on the PCA components.
compare_random_forest(df, df_pca, numeric_cols, target_col)