In [1]:
import jupyter_black
from IPython.display import display


# Load the jupyter_black extension with a very large line_length (999);
# presumably so long lines are never wrapped when cells are formatted.
jupyter_black.load(line_length=999)

In [2]:
import os
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
class Person:
    """Reads one subject's raw sensor logs and exports them as CSV files.

    Raw files are looked up at ``<cwd>/data/<sid>/<date_folder>/<FEATURE>.txt``.
    Two on-disk layouts are supported, selected by the feature name:

    * ``features_type1`` - one ``|``-separated record per line.
    * ``features_type2`` - either complete four-field records, or a
      single-field line followed by three-field records that inherit it.
    """

    # Features parsed by ``_read_type1``.
    features_type1 = [
        "ACCELERATION",
        "GYROSCOPE",
        "LIGHT",
        "LOCATION",
        "ORIENTATION",
        "PROXIMITY",
        "STEPS",
    ]

    # Features parsed by ``_read_type2``.
    features_type2 = [
        "BLUETOOTH",
        "CALL_RECORDS",
    ]

    def __init__(self, sid: str):
        """Locate the subject's data directory and list its entries.

        Args:
            sid: Subject identifier; also the name of the folder under ``data``.

        Raises:
            OSError: If ``<cwd>/data/<sid>`` cannot be listed.
        """
        self.sid = sid
        self.path = os.path.join(os.getcwd(), "data", sid)
        self.date_folders = os.listdir(self.path)

    def load(self, date_folder: str, feature: str) -> pd.DataFrame:
        """Load one feature file for one date folder.

        Args:
            date_folder: Name of an entry inside the subject's directory.
            feature: Feature name; matched case-insensitively against
                ``features_type1`` / ``features_type2``.

        Returns:
            The parsed records, or an empty DataFrame when the file does not
            exist or the feature is not one of the known names.
        """
        # Normalise once and use the normalised name everywhere, so that
        # "light" and "LIGHT" resolve to the same file and reader.
        feature = feature.upper()
        file_path = os.path.join(self.path, date_folder, f"{feature}.txt")
        if not os.path.exists(file_path):
            return pd.DataFrame()
        if feature in self.features_type1:
            return self._read_type1(file_path)
        if feature in self.features_type2:
            return self._read_type2(file_path)
        # Unknown feature: always hand back a DataFrame so callers can rely
        # on ``.empty`` instead of having to guard against None.
        return pd.DataFrame()

    @staticmethod
    def _read_type1(file_path: str) -> pd.DataFrame:
        """Parse a file holding one ``|``-separated record per line.

        The number of fields on the first line defines the expected width;
        lines with a different field count are dropped. Commas are stripped
        from every line before splitting. The first field is named
        ``Timestamp`` and the rest ``Column1..N``; those remaining columns are
        converted to float when every value in the column allows it.

        Returns:
            The parsed records, or an empty DataFrame for an empty file.
        """
        with open(file_path, "r") as file:
            lines = file.readlines()

        # An empty file has no first line to infer the width from.
        if not lines:
            return pd.DataFrame()

        check = len(lines[0].strip().split("|"))
        columns = ["Timestamp"] + [f"Column{i}" for i in range(1, check)]

        data = []
        for line in lines:
            row = line.strip().replace(",", "").split("|")
            if len(row) == check:
                data.append(row)

        df = pd.DataFrame(data, columns=columns)
        for column in columns[1:]:
            try:
                df[column] = df[column].astype(float)
            except ValueError:
                # Deliberate best effort: non-numeric columns stay as text.
                pass
        return df

    @staticmethod
    def _read_type2(file_path: str) -> pd.DataFrame:
        """Parse a file whose records may be grouped under a header line.

        Commas are stripped from every line. A non-empty line that starts
        with a digit and contains no ``|`` becomes the current header value.
        A three-field line is kept with that header value prepended (and is
        dropped when no header has been seen yet); a four-field line is kept
        as is. All other lines are ignored.

        Returns:
            A DataFrame with columns ``Column1..Column4`` when at least one
            record was kept, otherwise an empty DataFrame whose only column
            is ``Timestamp``.
        """
        with open(file_path, "r") as file:
            lines = file.readlines()

        data = []
        current_timestamp = None

        for line in lines:
            line = line.strip().replace(",", "")
            if not line:
                continue
            features = line.split("|")
            if line[0].isdigit() and len(features) == 1:
                current_timestamp = line
            elif len(features) == 3 and current_timestamp is not None:
                data.append([current_timestamp] + features)
            elif len(features) == 4:
                data.append(features)

        # Every kept row is exactly four fields wide, so the column names
        # depend only on whether anything was kept at all.
        if data:
            columns = [f"Column{i}" for i in range(1, 5)]
        else:
            columns = ["Timestamp"]
        return pd.DataFrame(data, columns=columns)

    def dump(self, dir_path: str, bar: bool = False):
        """Export every known feature of every date folder to CSV.

        Files are written to ``<dir_path>/<FEATURE>/<date_folder>.csv`` with a
        leading ``SID`` column. When the target file already exists, the new
        rows are appended to its current contents, so calling this again for
        the same subject and output directory duplicates rows.

        Args:
            dir_path: Output root directory; created if missing.
            bar: Show a tqdm progress bar over the date folders.
        """
        os.makedirs(dir_path, exist_ok=True)

        date_folders = tqdm(self.date_folders, desc="Processing by dates") if bar else self.date_folders
        for date_folder in date_folders:
            for feature in self.features_type1 + self.features_type2:
                df = self.load(date_folder, feature)
                if df.empty:
                    continue

                df.insert(0, "SID", self.sid)
                feature_dir = os.path.join(dir_path, feature)
                os.makedirs(feature_dir, exist_ok=True)
                output_path = os.path.join(feature_dir, f"{date_folder}.csv")

                if os.path.exists(output_path):
                    # Concatenate rather than raw-append so that frames with
                    # differing columns are aligned by name.
                    existing_df = pd.read_csv(output_path)
                    df = pd.concat([existing_df, df], ignore_index=True)
                df.to_csv(output_path, index=False)

In [4]:
# Convert the raw logs of every listed subject into per-feature, per-date CSV
# files under "data_csv".
# NOTE: Person.dump appends to a CSV that already exists, so running this cell
# again without first clearing "data_csv" duplicates the rows already written.

sids = ["gzj", "hm"]
for sid in tqdm(sids, desc="Processing by persons"):
    person = Person(sid)
    person.dump("data_csv", bar=True)

Processing by persons:   0%|          | 0/2 [00:00<?, ?it/s]

Processing by dates:   0%|          | 0/15 [00:00<?, ?it/s]

Processing by dates:   0%|          | 0/9 [00:00<?, ?it/s]