# Validation schema for the dataset, keyed by field / variable name.
# Every entry carries a human-readable "description" and a "type" label.
# "time" additionally carries a regex "pattern"; every other entry carries a
# "range" that is either an inclusive (min, max) tuple or None when no bounds
# are specified.
data_schema = {
    "time": {
        "description": "Timestamp in the format 'YYYY-MM-DD HH:MM:SS.FFF'",
        "type": "string",
        "pattern": "^\\d{4}-\\d{2}-\\d{2} \\d{2}:\\d{2}:\\d{2}\\.\\d{3}$",
    },
    "mood": {
        "description": "The mood scored by the user on a scale of 1-10",
        "type": "int",
        "range": (1, 10),
    },
    # The two circumplex dimensions share the same wording and bounds.
    **{
        f"circumplex.{dimension}": {
            "description": (
                f"The {dimension} scored by the user, "
                "on a scale between -2 to 2"
            ),
            "type": "int",
            "range": (-2, 2),
        }
        for dimension in ("arousal", "valence")
    },
    "activity": {
        "description": "Activity score of the user (number between 0 and 1)",
        "type": "float",
        "range": (0, 1),
    },
    "screen": {
        "description": "Duration of screen activity (time)",
        "type": "float",
        "range": None,
    },
    "call": {
        "description": "Call made (indicated by a 1)",
        "type": "int",
        "range": (0, 1),
    },
    "sms": {
        "description": "SMS sent (indicated by a 1)",
        "type": "int",
        "range": (0, 1),
    },
    # All app-category durations follow one template and have no bounds.
    **{
        f"appCat.{category}": {
            "description": f"Duration of usage of {category} apps (time)",
            "type": "float",
            "range": None,
        }
        for category in (
            "builtin",
            "communication",
            "entertainment",
            "finance",
            "game",
            "office",
            "other",
            "social",
            "travel",
            "unknown",
            "utilities",
            "weather",
        )
    },
}


import json
# Restructure the schema: "time" stays at the top level, and every other
# entry is nested under a single "variable" key.
new_data_schema = {
    "time": data_schema["time"],
    "variable": {
        name: spec for name, spec in data_schema.items() if name != "time"
    },
}

# Persist the restructured schema as JSON.
with open("../Assignment1/threshold.json", 'w') as schema_file:
    json.dump(new_data_schema, schema_file)

In [5]:
import pandas as pd

# Load the raw dataset.
data = pd.read_csv('../Assignment1//dataset_mood_smartphone.csv')

# Move the first column into the index and straight back out again, which
# leaves it as the first column with a fresh default index.
data = data.set_index(data.columns[0]).reset_index()

# Shift the row labels so that they start at 1 instead of 0.
data.index = data.index + 1

# Preview the first five rows.
data.head(5)

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
1,1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
2,2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
3,3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
4,4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
5,5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


In [7]:
import re

# Validate every row of ``data`` against the schema stored in threshold.json
# and collect the index labels of the rows that violate it.
#
# A row is invalid when
#   * its "time" string does not match the schema's timestamp pattern, or
#   * its "variable" has a bounded range and "value" lies outside
#     [min, max] (a NaN value fails the comparison and is flagged as well).
# Rows whose variable is not described by the schema, or whose range is
# None, are accepted.

# Index labels of the rows that violate the schema.
invalid_rows = []

# NOTE: this rebinds ``data_schema`` to a DataFrame with the columns "time"
# and "variable"; the name and type are kept as in the original cell.
data_schema = pd.read_json('../Assignment1//threshold.json')

# Look the pattern up and compile it once instead of once per row.
time_pattern = re.compile(data_schema['time']['pattern'])

# read_json aligns both top-level objects on one shared index, so the
# "variable" column also holds NaN placeholders for the keys of the "time"
# entry ("description", "type", "pattern"). Dropping them gives a plain
# {variable name: schema dict} mapping and prevents a TypeError should a
# row ever carry one of those names as its variable.
variable_schemas = data_schema['variable'].dropna().to_dict()

# Iterate only the needed columns; this avoids building a Series per row.
for index, timestamp, variable, value in zip(
    data.index, data['time'], data['variable'], data['value']
):
    # Check the timestamp format first; a malformed time invalidates the row.
    if not time_pattern.match(timestamp):
        invalid_rows.append(index)
        continue

    # Check the value against the variable's range, if it has one.
    var_schema = variable_schemas.get(variable)
    if var_schema is None or var_schema['range'] is None:
        continue
    min_value, max_value = var_schema['range']
    if not min_value <= value <= max_value:
        invalid_rows.append(index)

print(len(invalid_rows))
print("invalid row id:", invalid_rows)

202
invalid row id: [5709, 5731, 5773, 5797, 5836, 6325, 6379, 6434, 6668, 6793, 7037, 7256, 7262, 7320, 7348, 7450, 8193, 8202, 8350, 8357, 8362, 8383, 8404, 8461, 8467, 8643, 9332, 9390, 9394, 9399, 9443, 9478, 9503, 9519, 9646, 9919, 10189, 10241, 10248, 10262, 10283, 10292, 10293, 10329, 10334, 11256, 11300, 11352, 11379, 11380, 11382, 11415, 11416, 11419, 11479, 11480, 11488, 11497, 11968, 12022, 12067, 12077, 12311, 12324, 12436, 12680, 12774, 12899, 12924, 13037, 13047, 13051, 13052, 13061, 13179, 13183, 13187, 13203, 13749, 13799, 13820, 13823, 13826, 13843, 13845, 13853, 13860, 13887, 13897, 13928, 13945, 13959, 13961, 13965, 13974, 13977, 13992, 13993, 14000, 14003, 14008, 14019, 14022, 14026, 14028, 14035, 14038, 14044, 14047, 14054, 14055, 14056, 14063, 14070, 14071, 14085, 14089, 14097, 14101, 14104, 14105, 14106, 14110, 14113, 14286, 14312, 14313, 14315, 14324, 14329, 14330, 14336, 14339, 14935, 14939, 14972, 14975, 14982, 14984, 15025, 15030, 15032, 15033, 15037, 15039, 