## Target: OrangePi/BearPi enabled collection
## Function: 
1. Match log timestamps to the parsed .txt files from before 2024.2.16 (exclusive) and convert the result to a list of dicts
2. Write the data from before 2024.2.16 (exclusive) to merged_data.csv
3. Parse the .txt files from after 2024.2.16 into a list of dicts
4. Put all data together in merged_data.csv

In [71]:
import re
import io

# Read the whole log file into memory as a single string.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm that it matches the encoding the log was written with.
with io.open('datalog/data_collection_2024-02-02.log', 'r') as file:
    log_file_contents = file.read()

def parse_log_file(log_file_contents):
    """Extract (timestamp, serial data) pairs from the text of a log file.

    Only lines containing
    ``<YYYY-MM-DD HH:MM:SS,mmm>:INFO:Serial Data: <payload>`` contribute;
    every other line is ignored.

    Args:
        log_file_contents: Full text of the log file.

    Returns:
        A list of ``(timestamp, serial_data)`` string tuples, in the order
        the matching lines appear in the log.
    """
    entry_re = re.compile(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2},\d{3}):INFO:Serial Data: (.*)")

    # Try the pattern on each line, then keep the two captured groups of
    # every line that matched.
    hits = (entry_re.search(row) for row in log_file_contents.splitlines())
    return [(hit.group(1), hit.group(2)) for hit in hits if hit]

# Parse the log text into a list of (timestamp, serial data) tuples.
# The bare name on the last line makes the notebook display the result.
parsed_log_data = parse_log_file(log_file_contents)
parsed_log_data


[('2024-02-02 22:32:27,086', '0,141'),
 ('2024-02-02 23:32:28,506', '0,142'),
 ('2024-02-02 23:41:42,894', '0,2435'),
 ('2024-02-02 23:42:00,886', '0,2439'),
 ('2024-02-02 23:42:16,165', '0,2441'),
 ('2024-02-02 23:42:27,387', '0,2446'),
 ('2024-02-03 00:46:07,513', '0,4065'),
 ('2024-02-03 01:46:11,485', '0,5695'),
 ('2024-02-03 02:46:13,813', '0,7402'),
 ('2024-02-03 03:46:16,779', '0,9148'),
 ('2024-02-03 04:46:18,015', '0,10963'),
 ('2024-02-03 05:46:20,350', '0,12594'),
 ('2024-02-03 06:46:22,984', '0,14148'),
 ('2024-02-03 07:46:26,753', '0,15502'),
 ('2024-02-03 08:46:28,426', '0,16803'),
 ('2024-02-03 09:46:30,164', '0,17994'),
 ('2024-02-03 10:46:34,229', '0,19062'),
 ('2024-02-03 11:46:44,527', '0,20127'),
 ('2024-02-03 12:46:47,858', '0,21209'),
 ('2024-02-03 13:46:49,180', '0,22339'),
 ('2024-02-03 14:46:54,581', '0,23542'),
 ('2024-02-03 15:46:57,215', '0,24710'),
 ('2024-02-03 16:47:00,080', '0,25905'),
 ('2024-02-03 17:47:01,709', '0,27092'),
 ('2024-02-03 18:47:06,257',

In [72]:
import os
import io

def parse_environmental_data_file(directory, encoding=None):
    """Parse every environmental data file located directly in ``directory``.

    Sub-directories are skipped; they are not descended into.

    Args:
        directory: Path of the folder holding the raw data files.
        encoding: Text encoding used to read the files. ``None`` (the
            default) keeps the platform default; pass e.g. ``'utf-8'`` to
            make decoding independent of the machine's locale.

    Returns:
        One list holding the records that ``parse_environmental_data``
        returns for each file, concatenated in file-name order.
    """
    parsed_environmental_data = []

    # Sort the listing so the output order does not depend on the order in
    # which the file system happens to return the entries.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)

        # os.listdir also returns sub-directories, which cannot be opened
        # as files.
        if not os.path.isfile(file_path):
            continue

        # The handle gets its own name so the loop variable is not shadowed.
        with io.open(file_path, 'r', encoding=encoding) as handle:
            parsed_environmental_data.extend(parse_environmental_data(handle.read()))

    return parsed_environmental_data


def _field_value(line):
    """Return the stripped text after the first ':' of a ``Label: value`` line."""
    # Split on the first colon only, so a value that itself contains ':'
    # is kept whole instead of being cut at its own colon.
    return line.split(":", 1)[1].strip()


def parse_environmental_data(env_data_str):
    """Parse the text of one raw data file into a list of records.

    The text is cut at every ``"Serial Data: "`` marker; anything before the
    first marker is discarded. Inside each block the fields are read by line
    position: serial, city name, location id, temperature, weather, AQI and
    primary pollutant.

    Args:
        env_data_str: Full text of a data file.

    Returns:
        A list of dicts with the keys ``serial``, ``city``, ``location_id``,
        ``temperature``, ``weather``, ``aqi`` and ``pollutant``. ``aqi`` is
        an ``int``, or ``None`` when its text is not an integer; every other
        value is a stripped string.

    Raises:
        IndexError: If a block has fewer than seven lines, or one of its
            field lines contains no ``:`` separator.
    """
    data_blocks = env_data_str.strip().split("Serial Data: ")[1:]

    parsed_env_data = []
    for block in data_blocks:
        lines = block.strip().split("\n")

        # The first line of a block is what followed the "Serial Data: " marker.
        serial_data = lines[0].strip()
        city_name = _field_value(lines[1])
        location_id = _field_value(lines[2])
        temperature = _field_value(lines[3])
        weather = _field_value(lines[4])

        aqi_text = _field_value(lines[5])
        try:
            aqi = int(aqi_text)
        except ValueError:
            # A non-numeric AQI is recorded as missing.
            aqi = None

        primary_pollutant = _field_value(lines[6])

        parsed_env_data.append({
            "serial": serial_data,
            "city": city_name,
            "location_id": location_id,
            "temperature": temperature,
            "weather": weather,
            "aqi": aqi,
            "pollutant": primary_pollutant,
        })

    return parsed_env_data


# Usage example: parse every file in the folder below into one list of
# record dicts. The bare name on the last line makes the notebook display it.
directory = 'raw_data/before 2.15(inclusive)'
parsed_environmental_data = parse_environmental_data_file(directory)
parsed_environmental_data


[{'serial': '2,13796',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-1°C',
  'weather': '晴',
  'aqi': 24,
  'pollutant': 'NA'},
 {'serial': '2,15384',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-1°C',
  'weather': '晴',
  'aqi': 23,
  'pollutant': 'NA'},
 {'serial': '2,17305',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '晴',
  'aqi': 23,
  'pollutant': 'NA'},
 {'serial': '2,19231',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '晴',
  'aqi': 23,
  'pollutant': 'NA'},
 {'serial': '2,21141',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '晴',
  'aqi': 23,
  'pollutant': 'NA'},
 {'serial': '2,22920',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '晴',
  'aqi': 23,
  'pollutant': 'NA'},
 {'serial': '2,25079',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '晴',
  'aqi

In [73]:
# Function to match and merge data
def match_and_merge_data(parsed_data, environmental_data):
    """Attach environmental records to the log entries that share a serial.

    Args:
        parsed_data: Iterable of ``(timestamp, serial)`` pairs.
        environmental_data: Iterable of dicts, each with a ``"serial"`` key.

    Returns:
        A list with one dict per log entry whose serial has an environmental
        record: ``"timestamp"`` and ``"serial"`` first, followed by the
        fields of that record. Entries without a match are left out. When
        several environmental records share a serial, the last one is used.
    """
    # Index the environmental records by serial; a later duplicate replaces
    # an earlier one.
    lookup = {}
    for record in environmental_data:
        lookup[record["serial"]] = record

    merged = []
    for stamp, serial_key in parsed_data:
        if serial_key not in lookup:
            continue
        row = {"timestamp": stamp, "serial": serial_key}
        row.update(lookup[serial_key])
        merged.append(row)

    return merged

# Join the log entries with the environmental records that carry the same
# serial. The bare name on the last line makes the notebook display the result.
merged_data = match_and_merge_data(parsed_log_data, parsed_environmental_data)
merged_data



[{'timestamp': '2024-02-02 22:32:27,086',
  'serial': '0,141',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-5°C',
  'weather': '多云',
  'aqi': 95,
  'pollutant': 'PM2.5'},
 {'timestamp': '2024-02-02 23:32:28,506',
  'serial': '0,142',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-4°C',
  'weather': '多云',
  'aqi': 87,
  'pollutant': 'PM2.5'},
 {'timestamp': '2024-02-02 23:41:42,894',
  'serial': '0,2435',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-4°C',
  'weather': '多云',
  'aqi': 75,
  'pollutant': 'PM2.5'},
 {'timestamp': '2024-02-02 23:42:00,886',
  'serial': '0,2439',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-4°C',
  'weather': '多云',
  'aqi': 75,
  'pollutant': 'PM2.5'},
 {'timestamp': '2024-02-02 23:42:16,165',
  'serial': '0,2441',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-4°C',
  'weather': '多云',
  'aqi': 75,
  'pollutant': 'PM2.5'},
 {'timestamp': '2024-02-02 23:42:27,387',


In [74]:
import pandas as pd

# Build a table with one row per merged record; the dict keys become the columns.
df = pd.DataFrame(merged_data)

# Write the table to merged_data.csv (header row included, row index left out).
# The default write mode replaces any existing file of that name.
df.to_csv('merged_data.csv', index=False)


In [75]:
from datetime import datetime

# Folder that is scanned for data files (passed to parse_files below).
directory_path = 'raw_data/after 2.15'


# Function to parse the provided data structure within a file
def parse_data_structure(file_content):
    """Extract every complete record from the text of one data file.

    A record is eight consecutive ``Label: value`` lines in this order:
    ``Serial Data``, ``Collect Time``, ``City Name``, ``Location ID``,
    ``Temperature``, ``Weather``, ``AQI`` and ``Primary Pollutant``.
    Text that does not form such a record is skipped.

    Args:
        file_content: Full text of a data file.

    Returns:
        A list of dicts, one per record, with the keys ``serial``,
        ``timestamp``, ``city``, ``location_id``, ``temperature``,
        ``weather``, ``aqi`` and ``pollutant``. All values are strings.
    """
    # Every value is confined to its own line ([^\n]+). If a value were
    # allowed to cross line breaks, a record with a missing line would absorb
    # the following record. No newline is required after the last field, so
    # the final record of a file without a trailing newline is still found.
    data_pattern = re.compile(
        r'Serial Data: (?P<serial>[^\n]+)\n'
        r'Collect Time: (?P<timestamp>[^\n]+)\n'
        r'City Name: (?P<city>[^\n]+)\n'
        r'Location ID: (?P<location_id>[^\n]+)\n'
        r'Temperature: (?P<temperature>[^\n]+)\n'
        r'Weather: (?P<weather>[^\n]+)\n'
        r'AQI: (?P<aqi>[^\n]+)\n'
        r'Primary Pollutant: (?P<pollutant>[^\n]+)'
    )

    return [match.groupdict() for match in data_pattern.finditer(file_content)]

# Function to parse the contents of files in a directory
def parse_files(directory):
    """Collect the parsed records of every file found under ``directory``.

    The folder is walked with ``os.walk``, so files in sub-folders are
    included. Each file is read as UTF-8 text and handed to
    ``parse_data_structure``.

    Args:
        directory: Path of the folder to scan.

    Returns:
        A single list holding the records of all files, in the order the
        files were visited.
    """
    collected = []

    for folder, _subfolders, names in os.walk(directory):
        for name in names:
            with open(os.path.join(folder, name), 'r', encoding='utf-8') as handle:
                collected.extend(parse_data_structure(handle.read()))

    return collected

# Parse every file under directory_path. Despite its name, file_content holds
# the list of parsed record dicts; the bare name below displays it.
file_content = parse_files(directory_path)
file_content


[{'serial': '1,51657',
  'timestamp': '2024-02-20 00:12:06.480656',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-1°C',
  'weather': '阴',
  'aqi': '22',
  'pollutant': 'NA'},
 {'serial': '1,52666',
  'timestamp': '2024-02-20 01:12:08.777382',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-1°C',
  'weather': '多云',
  'aqi': '22',
  'pollutant': 'NA'},
 {'serial': '1,53602',
  'timestamp': '2024-02-20 02:12:25.365690',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '阴',
  'aqi': '22',
  'pollutant': 'NA'},
 {'serial': '1,54629',
  'timestamp': '2024-02-20 03:12:27.299127',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '多云',
  'aqi': '22',
  'pollutant': 'NA'},
 {'serial': '1,55644',
  'timestamp': '2024-02-20 04:12:28.871388',
  'city': '海淀',
  'location_id': '101010200',
  'temperature': '-2°C',
  'weather': '阴',
  'aqi': '22',
  'pollutant': 'NA'},
 {'serial': '1,56654',
  'ti

In [76]:
# Column order used for the CSV: timestamp first, then serial and the other fields.
column_order = ['timestamp', 'serial', 'city', 'location_id', 'temperature', 'weather', 'aqi', 'pollutant']

# Tabulate the parsed records and put the columns in that order.
df = pd.DataFrame(file_content)[column_order]

# Append the rows to the existing CSV file without writing a second header row.
df.to_csv('merged_data.csv', mode='a', header=False, index=False)