In [6]:
import pandas as pd
import re

def process_environmental_data(text):
    """Extract complete sensor readings from raw log text into a DataFrame.

    A reading is recognised only when all five labelled values appear back
    to back, in exactly this order, separated by optional whitespace::

        Humidity out: <n> %
        Temperature out: <n> *C
        Humidity IN: <n> %
        Temperature IN: <n> *C
        CO2: <n> ppm

    Labels are matched case-sensitively. Groups that are incomplete or out
    of order are skipped silently.

    Args:
        text: The full log contents as a single string.

    Returns:
        A pandas DataFrame with one row per complete reading and the float64
        columns ``Humidity_Out``, ``Temperature_Out``, ``Humidity_In``,
        ``Temperature_In`` and ``CO2``. When no reading is found the frame
        has zero rows but still carries those five columns.
    """
    columns = [
        'Humidity_Out',
        'Temperature_Out',
        'Humidity_In',
        'Temperature_In',
        'CO2',
    ]

    # Optionally signed number with an optional fractional part. Requiring a
    # decimal point and forbidding a sign would drop the whole reading as soon
    # as one value is negative (sub-zero temperature) or printed as an integer.
    number = r"([-+]?\d+(?:\.\d+)?)"

    # The capture-group order must match the order of `columns`.
    pattern = (
        r"Humidity out: " + number + r" %\s*"
        r"Temperature out: " + number + r" \*C\s*"
        r"Humidity IN: " + number + r" %\s*"
        r"Temperature IN: " + number + r" \*C\s*"
        r"CO2: " + number + r"\s+ppm"
    )

    # One dict per complete reading; the DataFrame is built once at the end.
    records = [
        dict(zip(columns, map(float, match.groups())))
        for match in re.finditer(pattern, text)
    ]

    # Passing `columns` and casting keeps the schema stable even when
    # `records` is empty, so downstream calls such as describe() still work.
    return pd.DataFrame(records, columns=columns).astype('float64')

# Read the raw log, trying progressively more permissive encodings.
#
# Order matters: latin-1 maps every possible byte to a character, so it never
# raises UnicodeDecodeError and any encoding listed after it would be
# unreachable. cp1252 is therefore tried first (it rejects a few undefined
# bytes), with latin-1 as the last resort that always succeeds.
# A missing or unreadable file still raises OSError as before.
for encoding in ('utf-8', 'cp1252', 'latin-1'):
    try:
        with open('data.txt', 'r', encoding=encoding) as file:
            data_text = file.read()
        break
    except UnicodeDecodeError:
        # Wrong guess for this file; fall through to the next candidate.
        continue

# Parse the raw log text into one row per complete reading.
df = process_environmental_data(data_text)

# Display a quick overview of the parsed data.
print("Data shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nBasic Statistics:")
if df.empty:
    # describe() raises on a frame without columns; report the situation
    # instead of crashing when the log contained no complete reading.
    print("No complete measurement sets found.")
else:
    print(df.describe())


Data shape: (667, 5)

First few rows:
   Humidity_Out  Temperature_Out  Humidity_In  Temperature_In      CO2
0          58.0             24.8         53.8            26.1  1081.16
1          58.0             24.8         53.8            26.1  1081.16
2          58.0             24.8         53.7            26.1  1081.16
3          58.0             24.8         53.7            26.1  1081.16
4          58.0             24.8         53.7            26.1  1081.16

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Humidity_Out     667 non-null    float64
 1   Temperature_Out  667 non-null    float64
 2   Humidity_In      667 non-null    float64
 3   Temperature_In   667 non-null    float64
 4   CO2              667 non-null    float64
dtypes: float64(5)
memory usage: 26.2 KB
None

Basic Statistics:
       Humidity_Out  Tempera

Unnamed: 0,Humidity_Out,Temperature_Out,Humidity_In,Temperature_In,CO2
0,58.0,24.8,53.8,26.1,1081.16
1,58.0,24.8,53.8,26.1,1081.16
2,58.0,24.8,53.7,26.1,1081.16
3,58.0,24.8,53.7,26.1,1081.16
4,58.0,24.8,53.7,26.1,1081.16
...,...,...,...,...,...
662,58.0,24.8,53.7,26.2,894.65
663,58.0,24.8,53.7,26.2,894.65
664,58.0,24.8,53.7,26.2,894.65
665,58.0,24.8,53.7,26.2,894.65
