#### 数据收集

In [1]:
import pandas as pd
import re
import subprocess

In [2]:
# Show which devices adb can see, so a missing/unauthorised device is
# noticed before the dump is attempted. The raw bytes are printed as-is.
command_devices = "adb devices"
devices_listing = subprocess.check_output(command_devices, shell=True)
print(devices_listing)

# Dump the device's usage statistics into usagestats.log. The `>` redirect
# is performed by the local shell, which is why shell=True is required;
# check=True raises CalledProcessError on a non-zero exit status.
command = "adb shell dumpsys usagestats > usagestats.log"
subprocess.run(command, shell=True, check=True)

b'List of devices attached\r\n143028c2\tdevice\r\n\r\n'


CompletedProcess(args='adb shell dumpsys usagestats > usagestats.log', returncode=0)

In [3]:
# Read the dumped log as UTF-8 text, keeping one list entry per line.
with open('usagestats.log', 'r', encoding='utf-8') as file:
    lines = file.readlines()


def find_marker_line(log_lines, marker, begin=0):
    """Return the index of the first line at or after `begin` containing `marker`.

    Args:
        log_lines: list of text lines to search.
        marker: substring that identifies the wanted line.
        begin: index at which the search starts (default: first line).

    Raises:
        ValueError: if no line from `begin` onwards contains `marker`.
    """
    for index in range(begin, len(log_lines)):
        if marker in log_lines[index]:
            return index
    raise ValueError(
        f"marker {marker!r} not found in usagestats.log at or after line index {begin}"
    )


# The wanted lines sit between the "Last 24 hour events" line (exclusive)
# and the following "In-memory daily stats" line (exclusive).
start_index = find_marker_line(lines, 'Last 24 hour events') + 1
# Search for the end marker only after the start marker, so an earlier
# occurrence of the text cannot produce an end index before the start.
end_index = find_marker_line(lines, 'In-memory daily stats', start_index)
selected_lines = lines[start_index:end_index]

In [4]:
# Captures, in order: the quoted timestamp, the event type and the package
# name. The pattern requires a space after the package value.
event_pattern = re.compile(r'time="(.*?)" type=(.*?) package=(.*?) ')

# Collect one record per matching line and build the DataFrame once at the
# end; enlarging a frame row by row gets slower as the frame grows.
records = []
for line in selected_lines:
    match = event_pattern.search(line)
    if match:
        records.append({'time': match.group(1), 'type': match.group(2), 'package': match.group(3)})

# Passing `columns` keeps the three-column layout even if no line matched.
df = pd.DataFrame(records, columns=['time', 'type', 'package'])

In [5]:
# Persist the parsed events as a UTF-8 encoded CSV file, without the row index.
df.to_csv('usagestats.csv', encoding='utf-8', index=False)

#### 数据清洗

In [10]:
import pandas as pd

# Load the events written by the collection step.
dataframe_temp = pd.read_csv('usagestats.csv')

# Add an empty 'status' label column and fix the column order.
dataframe_temp['status'] = ''
dataframe_temp = dataframe_temp.reindex(columns=['time', 'status', 'type', 'package'])

# Parse the timestamps so rows can be compared against the window bounds.
dataframe_temp['time'] = pd.to_datetime(dataframe_temp['time'])

# (start, end, status) windows; both bounds are inclusive.
TRAIN_WINDOWS = [
    ('2024-06-06 11:48:00', '2024-06-06 11:55:00', 'walk'),
    ('2024-06-06 11:59:00', '2024-06-06 12:05:00', 'bike'),
    ('2024-06-06 12:08:00', '2024-06-06 12:10:00', 'run'),
    ('2024-06-06 12:12:00', '2024-06-06 12:15:00', 'sit'),
]
# Consecutive test windows share a boundary timestamp (12:18:00, 12:19:00,
# 12:20:20); label_time_windows gives such a row to the earlier window only.
TEST_WINDOWS = [
    ('2024-06-06 12:17:00', '2024-06-06 12:18:00', 'sit'),
    ('2024-06-06 12:18:00', '2024-06-06 12:19:00', 'run'),
    ('2024-06-06 12:19:00', '2024-06-06 12:20:20', 'walk'),
    ('2024-06-06 12:20:20', '2024-06-06 12:24:00', 'bike'),
]


def label_time_windows(events, windows):
    """Return one labelled copy of `events` per time window.

    Args:
        events: DataFrame with a datetime 'time' column and a unique index.
        windows: iterable of (start, end, status) tuples; `start` and `end`
            are inclusive bounds compared against events['time'].

    Returns:
        A list with one DataFrame per window, in window order, holding the
        rows whose time falls inside that window with 'status' set to the
        window's status. A row matching several windows appears only in the
        first one, so no event is emitted twice with conflicting labels.
    """
    claimed = pd.Series(False, index=events.index)
    frames = []
    for start, end, status in windows:
        in_window = events['time'].between(start, end) & ~claimed
        frames.append(events[in_window].assign(status=status))
        claimed = claimed | in_window
    return frames


# Build the training set from the labelled training windows.
dataframe_train_list = label_time_windows(dataframe_temp, TRAIN_WINDOWS)
dataframe_train = pd.concat(dataframe_train_list)

# Build the test set from the labelled test windows.
dataframe_test_list = label_time_windows(dataframe_temp, TEST_WINDOWS)
dataframe_test = pd.concat(dataframe_test_list)

dataframe_train.to_csv('train_usagestats.csv', index=False, encoding='utf-8')
dataframe_test.to_csv('test_usagestats.csv', index=False, encoding='utf-8')