In [2]:
import pandas as pd
from scipy.signal import resample

In [3]:
def openarff(filepath):
    with open(filepath, 'r') as file:
        lines = file.readlines()

    # Find where the actual data starts
    data_start = next(i for i, line in enumerate(lines) if line.strip() == "@data") + 1

    # Read only the data part
    df_train = pd.read_csv(
        filepath,
        skiprows=data_start, 
        header=None
    )

    return df_train

In [4]:
df_test = openarff(r'C:..\5_Data\Epilepsy\Epilepsy_TEST.ts')

In [5]:
df_test.iloc[:, -1] = df_test.iloc[:, -1].str.split(":").str[1]  # Extract label
label_counts = df_test.iloc[:, -1].value_counts()  
print(label_counts)


615
WALKING     37
RUNNING     37
EPILEPSY    34
SAWING      30
Name: count, dtype: int64


In [6]:
df_train = openarff(r'..\5_Data\Epilepsy\Epilepsy_Train.ts')


In [7]:
df_train.iloc[:, -1] = df_train.iloc[:, -1].str.split(":").str[1]  # Extract label
label_counts = df_train.iloc[:, -1].value_counts()  
print(label_counts)


615
WALKING     37
RUNNING     36
EPILEPSY    34
SAWING      30
Name: count, dtype: int64


In [8]:
epi_dim1 = openarff('../5_Data/Epilepsy/EpilepsyDimension1_TEST.arff')
epi_dim2 = openarff('../5_Data/Epilepsy/EpilepsyDimension2_TEST.arff')
epi_dim3 = openarff('../5_Data/Epilepsy/EpilepsyDimension3_TEST.arff')

epi_dim4 = openarff('../5_Data/Epilepsy/EpilepsyDimension1_TRAIN.arff')
epi_dim5 = openarff('../5_Data/Epilepsy/EpilepsyDimension2_TRAIN.arff')
epi_dim6 = openarff('../5_Data/Epilepsy/EpilepsyDimension3_TRAIN.arff')


In [9]:
def combine_dfs(epi_dim1, epi_dim2, epi_dim3):
    epi_dim1.columns = [str(col) + 'x' for col in epi_dim1.columns]
    epi_dim2.columns = [str(col) + 'y' for col in epi_dim2.columns]
    epi_dim3.columns = [str(col) + 'z' for col in epi_dim3.columns]

    combined_epi = pd.concat([epi_dim1, epi_dim2, epi_dim3], axis=1)

    sorted_num = []
    for i in range(0, 207):
        for j in range(3):
            sorted_num.append(i)


    sorted_alp = ['x', 'y', 'z']*207	
    test = [str(sorted_num[i])+ sorted_alp[i] for i in range(0, 621)]

    combined_epi = combined_epi[test]

    combined_epi.columns = [col for col in combined_epi.columns if col!= "206x"] + ["206x"]

    combined_epi.drop(['206y','206z'], axis=1, inplace=True)

    return combined_epi


In [10]:
df1 = combine_dfs(epi_dim1, epi_dim2, epi_dim3)
df1.to_csv('../5_Data/Epilepsy/Epilepsy_TEST_wl.csv', index=False)

df2 = combine_dfs(epi_dim4, epi_dim5, epi_dim6)
df2.to_csv('../5_Data/Epilepsy/Epilepsy_TRAIN_wl.csv', index=False)

In [11]:
df1.head(5)

Unnamed: 0,0x,0y,0z,1x,1y,1z,2x,2y,2z,3x,...,203x,203y,203z,204x,204y,204z,205x,205y,205z,206x
0,0.6,-1.72,-0.47,0.6,-1.28,0.02,0.6,1.49,-0.17,0.61,...,-0.69,-0.73,-0.36,-0.7,-1.76,-0.51,-0.7,1.03,-0.57,EPILEPSY
1,-0.35,-0.99,0.05,-0.44,-2.3,0.05,-0.86,-0.57,0.05,-0.64,...,-0.18,-0.25,-0.22,-0.16,-0.32,0.55,-0.14,-0.34,0.83,EPILEPSY
2,-0.08,-1.02,-0.03,-0.01,-2.12,-0.48,-0.53,-0.95,0.45,-0.49,...,0.34,-0.54,-0.54,0.32,-0.62,-0.62,0.32,-0.67,-0.56,EPILEPSY
3,-0.43,0.78,-0.44,-0.44,0.78,-0.18,-0.47,0.78,-0.4,-0.49,...,-0.34,-1.18,-0.6,-0.31,0.34,-0.46,-0.31,1.08,-0.59,EPILEPSY
4,-0.3,-2.4,-0.31,-0.81,0.8,-0.26,-0.55,0.18,-0.19,-0.81,...,-0.87,-2.36,-0.02,-0.81,0.59,-0.03,-0.8,0.33,-0.02,EPILEPSY


In [12]:
def del_labels(df):
    for label in df['206x'].unique():
        if label!="EPILEPSY":
           df["206x"].replace(label, "NON-EPILEPTIC",inplace=True)
    return df

In [13]:
del_labels(df1).to_csv('../5_Data/Epilepsy/Epilepsy_TEST.csv')
del_labels(df2).to_csv('../5_Data/Epilepsy/Epilepsy_TRAIN.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["206x"].replace(label, "NON-EPILEPTIC",inplace=True)


In [14]:
df1["206x"].value_counts()

206x
NON-EPILEPTIC    104
EPILEPSY          34
Name: count, dtype: int64

In [15]:
openarff(r'..\5_Data\wisdm\gyro\data_1601_gyro_watch.arff')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
0,A,0.095,0.380,0.505,0.020,0.000,0.000,0.000,0.00,0,...,0.340372,0.335999,-0.237022,-0.433605,0.915957,-0.236944,-0.432076,0.916513,2.39657,1601
1,A,0.090,0.360,0.510,0.040,0.000,0.000,0.000,0.00,0,...,0.349002,0.344518,-0.100601,-0.116337,0.960587,-0.100595,-0.116347,0.960683,2.63362,1601
2,A,0.050,0.440,0.465,0.030,0.010,0.005,0.000,0.00,0,...,0.301484,0.297610,0.014996,0.125376,0.732290,0.016144,0.125561,0.734050,2.01819,1601
3,A,0.090,0.440,0.405,0.065,0.000,0.000,0.000,0.00,0,...,0.364366,0.359683,-0.184709,-0.189978,0.926732,-0.183948,-0.189934,0.927496,2.78928,1601
4,A,0.070,0.380,0.535,0.015,0.000,0.000,0.000,0.00,0,...,0.321420,0.317290,-0.123869,-0.234282,0.761902,-0.124094,-0.233997,0.762491,2.16045,1601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,S,0.090,0.405,0.375,0.065,0.050,0.010,0.005,0.00,0,...,0.349131,0.344645,0.313536,-0.261967,-0.302366,0.310792,-0.257439,-0.299839,2.47689,1601
320,S,0.105,0.380,0.405,0.080,0.020,0.010,0.000,0.00,0,...,0.325287,0.321107,-0.072679,-0.114063,0.077212,-0.078794,-0.112862,0.087838,2.28158,1601
321,S,0.120,0.475,0.295,0.075,0.015,0.010,0.000,0.01,0,...,0.282542,0.278912,-0.033405,-0.019351,-0.112663,-0.029089,-0.020538,-0.107500,2.41284,1601
322,S,0.055,0.395,0.415,0.110,0.015,0.010,0.000,0.00,0,...,0.318096,0.314009,0.109900,-0.159541,-0.317991,0.105951,-0.168420,-0.321577,2.30844,1601


In [16]:
def read_accelerometer_data(file_path):
    """
    Reads accelerometer data from a text file and returns a pandas DataFrame.

    Parameters:
        file_path (str): Path to the text file.

    Returns:
        pd.DataFrame: A DataFrame containing the accelerometer data.
    """
    # Define column names
    columns = ["Person_ID", "Activity", "Timestamp", "X", "Y", "Z"]
    
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep=",", header=None, names=columns)
    
    # Remove the trailing semicolon from the last column
    df["Z"] = df["Z"].str.rstrip(";")
    
    # Convert columns to appropriate data types
    df["Person_ID"] = df["Person_ID"].astype(int)
    df["Activity"] = df["Activity"].astype(str)
    df["Timestamp"] = df["Timestamp"].astype(int)
    df["X"] = df["X"].astype(float)
    df["Y"] = df["Y"].astype(float)
    df["Z"] = df["Z"].astype(float)
    
    return df

In [17]:
def resample_hz(df):
    resampled_df_list = []
    for act in df.Activity.unique():
        act_df = df[df['Activity'] == act]
        original_len = len(act_df)
        resampled_len = int(16/20*original_len)
        resampled_df = resample(act_df[["X","Y","Z"]],resampled_len)
        resampled_df = pd.DataFrame(resampled_df, columns=["X","Y","Z"])
        resampled_df["Activity"] = act
        resampled_df_list.append(resampled_df)

    resampled_df = pd.concat(resampled_df_list)

    resampled_df.columns = ['accX', 'accY', 'accZ', 'Activity']
    return resampled_df

In [18]:
path = r'..\5_Data\wisdm\wisdm-dataset\wisdm-dataset\raw\watch\accel\data_1608_accel_watch.txt'
df = read_accelerometer_data(path)
resampled_df = resample_hz(df)
resampled_df.to_csv(r'..\5_Data\nonepi\wisdm-dataset8.csv', index=False)

In [19]:
activity_mapping = {
    'A': 'Walking',
    'B': 'Jogging',
    'C': 'Stairs',
    'D': 'Sitting',
    'E': 'Standing',
    'F': 'Typing',
    'G': 'Brushing Teeth',
    'H': 'Eating Soup',
    'I': 'Eating Chips',
    'J': 'Eating Pasta',
    'K': 'Drinking from Cup',
    'L': 'Eating Sandwich',
    'M': 'Kicking (Soccer Ball)',
    'O': 'Playing Catch w/Tennis Ball',
    'P': 'Dribbling (Basketball)',
    'Q': 'Writing',
    'R': 'Clapping',
    'S': 'Folding Clothes'
}

resampled_df['Activity'] = resampled_df['Activity'].replace(activity_mapping)
resampled_df.to_csv(r'..\5_Data\nonepi\wisdm-dataset8_wl.csv', index=False)