In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

data = pd.read_csv(
    "data/data_co2/may/50000301_02.05.2023 00.00.00.csv",
    encoding="utf-8",
    sep=";",
    skiprows=lambda x: x < 5,
)
data = data[["Time", "1", "14"]]
data["Time"] = pd.to_datetime(data["Time"], errors="coerce")
data = data[(data["1"] != 0) & ~data["14"].isin([-100, 200, float("nan")])]
data.to_csv("data/data_co2/filtered_data.csv", index=False)

data


Unnamed: 0,Time,1,14
30,2023-02-05 00:00:31,1.0,437.744
31,2023-02-05 00:00:32,1.0,437.744
32,2023-02-05 00:00:33,1.0,441.808
33,2023-02-05 00:00:34,1.0,439.776
34,2023-02-05 00:00:35,1.0,445.872
...,...,...,...
35274,2023-02-05 23:24:24,8.0,512.928
35275,2023-02-05 23:24:25,8.0,514.960
35276,2023-02-05 23:24:26,8.0,512.928
35277,2023-02-05 23:24:27,8.0,512.928


In [27]:
first_file = pd.read_csv("data/data_co2/filtered_data.csv")
second_file = pd.read_csv(
    "data/data_co2/meteoMay/60000301_2023.05.02_07.05.00.csv", sep=";", parse_dates=["Time"]
)

first_file["Time"] = pd.to_datetime(first_file["Time"])
second_file["Time"] = pd.to_datetime(second_file["Time"], dayfirst=True)

result = pd.DataFrame(
    columns=[
        "Time",
        "1",
        "14",
        "700",
        "4001",
        "4002",
        "4003",
        "4004",
        "4005",
        "4006",
        "4007",
        "4008",
    ]
)

closest_time_idxs = np.abs(second_file["Time"].values[:, None] - first_file["Time"].values).argmin(
    axis=0
)

closest_values = second_file.iloc[closest_time_idxs][
    ["Time", "700", "4001", "4002", "4003", "4004", "4005", "4006", "4007", "4008"]
]
result = pd.concat([first_file, closest_values.reset_index(drop=True)], axis=1)

result.to_csv("data/data_co2/combined_data.csv", index=False)
result

Unnamed: 0,Time,1,14,Time.1,700,4001,4002,4003,4004,4005,4006,4007,4008
0,2023-02-05 00:00:31,1.0,437.744,2023-02-05 07:05:00,10107,-100,11187,957,11469,11351,10953,11061,11068
1,2023-02-05 00:00:32,1.0,437.744,2023-02-05 07:05:00,10107,-100,11187,957,11469,11351,10953,11061,11068
2,2023-02-05 00:00:33,1.0,441.808,2023-02-05 07:05:00,10107,-100,11187,957,11469,11351,10953,11061,11068
3,2023-02-05 00:00:34,1.0,439.776,2023-02-05 07:05:00,10107,-100,11187,957,11469,11351,10953,11061,11068
4,2023-02-05 00:00:35,1.0,445.872,2023-02-05 07:05:00,10107,-100,11187,957,11469,11351,10953,11061,11068
...,...,...,...,...,...,...,...,...,...,...,...,...,...
34554,2023-02-05 23:24:24,8.0,512.928,2023-02-05 23:25:00,102034,-4361,-4351,-4551,-3913,-4154,-4233,-3999,-4422
34555,2023-02-05 23:24:25,8.0,514.960,2023-02-05 23:25:00,102034,-4361,-4351,-4551,-3913,-4154,-4233,-3999,-4422
34556,2023-02-05 23:24:26,8.0,512.928,2023-02-05 23:25:00,102034,-4361,-4351,-4551,-3913,-4154,-4233,-3999,-4422
34557,2023-02-05 23:24:27,8.0,512.928,2023-02-05 23:25:00,102034,-4361,-4351,-4551,-3913,-4154,-4233,-3999,-4422


In [28]:
# Загрузка данных
data = pd.read_csv("data/data_co2/combined_data.csv")

# Разбиение данных на трехминутные интервалы
interval = pd.Timedelta(minutes=3)
data["Time"] = pd.to_datetime(data["Time"])
data["Interval"] = (data["Time"] - data["Time"].min()) // interval

# Группировка данных по интервалам
grouped = data.groupby("Interval")
for col in ["700", "4001", "4002", "4003", "4004", "4005", "4006", "4007", "4008"]:
    data[col] = data[col].str.replace(",", ".")

# Создание пустого DataFrame для результатов регрессии
regression_result = pd.DataFrame(
    columns=[
        "Interval",
        "700_slope",
        "4001_slope",
        "4002_slope",
        "4003_slope",
        "4004_slope",
        "4005_slope",
        "4006_slope",
        "4007_slope",
        "4008_slope",
    ]
)

# Проход по каждому трехминутному интервалу и применение линейной регрессии
for interval, group in grouped:
    x = np.arange(len(group)).reshape(-1, 1)
    for col in ["700", "4001", "4002", "4003", "4004", "4005", "4006", "4007", "4008"]:
        y = group[col].values.reshape(-1, 1)
        model = LinearRegression().fit(x, y)
        slope = model.coef_[0][0]
        new_row = pd.DataFrame({"Interval": [interval], f"{col}_slope": [slope]})
        regression_result = pd.concat([regression_result, new_row], ignore_index=True)

# Сохранение результата в файл
regression_result.to_csv("data/data_co2/regression_result.csv", index=False)

regression_result

  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)
  regression_result = pd.concat([regression_result, new_row], ignore_index=True)


Unnamed: 0,Interval,700_slope,4001_slope,4002_slope,4003_slope,4004_slope,4005_slope,4006_slope,4007_slope,4008_slope
0,0,0.0,,,,,,,,
1,0,,0.0,,,,,,,
2,0,,,0.0,,,,,,
3,0,,,,0.0,,,,,
4,0,,,,,-5.657949e-34,,,,
...,...,...,...,...,...,...,...,...,...,...
1930,467,,,,,-4.745465e-04,,,,
1931,467,,,,,,0.000475,,,
1932,467,,,,,,,0.000475,,
1933,467,,,,,,,,0.001424,
