In [1]:
import json
import os
import copy

### 自己設定的 json 排版

In [2]:
class CustomEncoder(json.JSONEncoder):
    """JSON encoder that lays out a top-level dict with one entry per line.

    Every key/value pair is serialised independently with ``json.dumps`` and
    placed on its own line behind a two-space indent, so nested values stay
    compact on a single line. Anything that is not a dict is handed to the
    stock ``JSONEncoder.encode``.
    """

    def encode(self, o):
        # Non-dict payloads keep the default encoding.
        if not isinstance(o, dict):
            return super().encode(o)

        entries = []
        for key, value in o.items():
            entries.append(json.dumps(key) + ':' + json.dumps(value))
        return '{\n  ' + ', \n  '.join(entries) + '\n}'

### 處理遺失資料的兩個函式

`missingFixing` 用來處理一般情形。

`missingFixingTailed` 用來處理特殊情形。

所謂特殊情形是指遺失資料一直延續到 23:59，這樣導致無法在單一檔案內進行資料修補，需要找下一天的資料進行修補

>為了加快處理速度，一般情形改用 multiprocessing 處理，因此另外寫了 `data_cooking.py` 來執行

>因為 jupyter notebook 跑 multiprocessing 會有 bug

> 10/11 跟 10/15 的資料遺失太多，所以我直接把這兩天的資料刪除

### Bug Fixed

dictionary 物件的賦值不會複製資料，只是讓兩個名稱指向同一個物件，所以 a = b 之後，如果 b 被修改了，a 也會跟著被修改

所以要用 `copy.deepcopy`

In [3]:
# dealing with missing data
# but 10/15 lost a lot of data, so I just delete data of that day
def missingFixing(path:str):
    """Back-fill empty records of one JSON file and write it back in place.

    The file must hold a JSON object whose values are records (dicts).
    Walking from the last key to the first, every empty record ``{}`` is
    replaced by a copy of the nearest later non-empty record and tagged
    ``"Loss": 1``. Records that carry data get ``"Loss": 0`` unless they
    already have a ``"Loss"`` key. Empty records at the very end of the file
    (nothing later to copy from) are left untouched.

    Args:
        path: Path of the JSON file to repair; the file is overwritten.
    """
    with open(path,'r') as F:
        data = json.load(fp=F)

    later_record = None  # nearest non-empty record seen so far (later in the file)
    for key in reversed(list(data.keys())):
        record = data[key]
        if record == {}:
            if later_record is None:
                # trailing gap: this file has no later data to borrow from
                continue
            # deep copy so the filled slot never shares state with its donor
            filled = copy.deepcopy(later_record)
            filled["Loss"] = 1
            data[key] = filled
        else:
            later_record = record
            if "Loss" not in record.keys():
                record["Loss"] = 0

    with open(path,'w') as F:
        # write() emits the text in one call; writelines() on a str would
        # iterate over it character by character
        F.write(json.dumps(data,cls=CustomEncoder))

In [4]:
def calculateNextday(day:str):
    """Return the calendar day that follows ``day``.

    Args:
        day: Date string ending in ``MMDD``, e.g. ``"20231223"`` or ``"1223"``.
            Digits in front of the ``MMDD`` part are read as the year (used
            only to decide whether February has 29 days); without them 2023
            is assumed.

    Returns:
        Tuple ``(next_date, next_month)`` -- day of month first, month second.
        The year is not returned; December 31 yields ``(1, 1)``.
    """
    # extract month and date from current day
    month = int(day[-4:-2])
    date = int(day[-2:])
    year_text = day[:-4]
    year = int(year_text) if year_text.isdigit() else 2023

    is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
    if month == 2:
        last_date = 29 if is_leap else 28
    elif month in (4, 6, 9, 11):
        last_date = 30
    else:
        last_date = 31

    if date < last_date:
        return date + 1, month
    # roll over to the first day of the next month, wrapping December to January
    return 1, (month % 12) + 1

In [9]:
# dealing with missing data that has no value from the end
def missingFixingTailed(day,sta,max_lookahead_days=366):
    """Fill a gap that runs to the end of a day's file using a later day.

    Reads ``./release/{day}/{sta}``. When its last record is empty, the
    following day folders are searched (same station file name) for the first
    non-empty record. That record is copied into the last slot with
    ``"Loss": 1``, and the contiguous run of empty records just before it is
    filled with the same value. The file is rewritten only when a fill
    happened.

    Args:
        day: Folder name of the day to repair, ending in ``MMDD``
            (e.g. ``"20231223"``). Leading digits are taken as the year;
            2023 is assumed when there are none.
        sta: Station file name, with or without the ``.json`` suffix.
        max_lookahead_days: How many following calendar days to try before
            giving up. Bounds the search so a missing station cannot loop
            forever.

    Returns:
        None. If no later data is found, a message is printed and the file is
        left unchanged.
    """
    if sta[-5:] != ".json":
        sta = sta + ".json"
    path = f"./release/{day}/{sta}"
    with open(path,'r') as F:
        data = json.load(fp=F)
    keys = list(data.keys())
    # nothing to repair: empty file, or the last record is present
    if not keys or data[keys[-1]] != {}:
        return

    year_text = day[:-4]
    year = int(year_text) if year_text.isdigit() else 2023
    current = day
    donor = None
    for _ in range(max_lookahead_days):
        previous_month = int(current[-4:-2])
        next_date,next_month = calculateNextday(current)
        if next_month < previous_month:
            # December -> January: move to the next year instead of wrapping
            # back into earlier folders of the same year
            year += 1
        current = f"{year}{next_month:02d}{next_date:02d}"
        try:
            with open(f"./release/{current}/{sta}",'r') as next_F:
                next_data = json.load(fp=next_F)
        except FileNotFoundError:
            # some station doesn't have data in the next day
            continue
        # the first record of a later day can be missing too; take the first
        # one that actually carries data
        donor = next((record for record in next_data.values() if record != {}), None)
        if donor is not None:
            break

    if donor is None:
        print(f"no later data found for {day} {sta}, file left unchanged")
        return

    data[keys[-1]] = copy.deepcopy(donor)
    data[keys[-1]]["Loss"] = 1
    # walk backwards over the trailing run of empty records, however long the file is
    for i in range(2,len(keys) + 1):
        if data[keys[-i]] == {}:
            # sharing the same dict object is fine: data is only serialised below
            data[keys[-i]] = data[keys[-i + 1]]
        else:
            break

    with open(path,'w') as F:
        F.write(json.dumps(data,cls=CustomEncoder))

##### 普通情況的處理程序

In [6]:
# os.listdir() returns names in arbitrary order; sort so the date-named
# folders are chronological and slices like many_days[-4:] mean "latest days"
many_days = sorted(os.listdir('./release'))

In [7]:
# display the last four entries of the folder listing
many_days[-4:]

['20231222', '20231223', '20231224', '20231225']

In [10]:
# Run the general-case repair on every station file found in the
# last 18 entries of many_days.
recent_days = many_days[-18:]
for day in recent_days:
    # carriage return keeps the progress indicator on a single line
    print(f"now:{day}", end='\r')
    day_dir = f"./release/{day}"
    stations = os.listdir(day_dir)
    for sta in stations:
        missingFixing(f"{day_dir}/{sta}")

now:20231225

##### 處理單一天的特殊遺失情形

In [10]:
# Repair tail gaps for every station file of one chosen day.
date = "20231223"  # plain literal: there is no placeholder, so no f-prefix
stations = os.listdir(f"./release/{date}")
for sta in stations:
    missingFixingTailed(day=date,sta=sta)

##### 處理 12 月 8 號到 25 號的特殊遺失情形

In [13]:
# Repair tail gaps for the day folders 20231208 .. 20231225, one folder at a time.
december_dates = [f"202312{day_of_month:02d}" for day_of_month in range(8, 26)]
for date in december_dates:
    stations = os.listdir(f"./release/{date}")
    for sta in stations:
        missingFixingTailed(day=date, sta=sta)

##### 檢查是否有最後幾筆是遺失的特別情形

In [38]:
# Report every station file whose last record is still empty.
# os.listdir() gives no ordering guarantee, so sort first: only then does
# [:-1] skip the latest day folder rather than an arbitrary one.
# NOTE(review): presumably the latest day is skipped because no following day
# exists to borrow data from -- confirm.
many_days = sorted(os.listdir('./release'))
for day in many_days[:-1]:
    stations = os.listdir(f"./release/{day}")
    for sta in stations:
        path = f"./release/{day}/{sta}"
        with open(path,'r') as F :
            data = json.load(fp=F)
        # the file is closed before its contents are inspected
        value = list(data.values())
        if value[-1] == {}:
            print(f"happen at {day} {sta}")

In [37]:
# Same tail-gap check, restricted to one station across all day folders.
STATION_ID = 500101181  # the station inspected by this cell
# sorted so the report is chronological and many_days stays ordered for any
# other cell that slices it afterwards
many_days = sorted(os.listdir('./release'))
for day in many_days:
    path = f"./release/{day}/{STATION_ID}.json"
    with open(path,'r') as F :
        data = json.load(fp=F)
    value = list(data.values())
    if value[-1] == {}:
        print(f"happen at {day}")
        

happen at 20231210
