In [256]:
import pandas as pd

In [257]:
# Read the raw data. The first data row (index label 0) holds units of measure rather than
# observations, so it is dropped right after reading. `index=0` alone selects that row;
# index labels are left as-is, so the frame starts at label 1.
df = pd.read_csv('weather.csv', na_values="", parse_dates=['Datetime']).drop(index=0)

# Normalise column names to snake_case: spaces -> underscores, lower-case.
df.columns = df.columns.str.replace(' ', '_').str.lower()

# Cast every column except the first (the timestamp) to float in a single call.
# Presumably needed because the units row makes pandas read these columns as text.
df = df.astype({col: float for col in df.columns[1:]})

#### Проверка на дубликаты

In [258]:
# Count fully duplicated rows and repeated values in the 'datetime' column.
df.duplicated().sum(), df['datetime'].duplicated().sum()

(np.int64(0), np.int64(0))

#### Работа с пропусками

In [259]:
df.isna().sum() # Count missing values per column

datetime                0
temperature            55
precipitation_total    30
wind_gust               0
wind_speed             16
cloud_cover_total      20
sunshine_duration      12
dtype: int64

Так как мы работаем с временными рядами, в заполнении пропусков может помочь **интерполяция**, которая отталкивается от значений соседних точек.

##### ДО

In [260]:
# Slice displayed before interpolation; the same slice is shown again afterwards for comparison.
df.iloc[1003:1015]

Unnamed: 0,datetime,temperature,precipitation_total,wind_gust,wind_speed,cloud_cover_total,sunshine_duration
1004,2023-05-12 19:00:00,14.070246,0.1,18.72,6.130579,28.0,42.074078
1005,2023-05-12 20:00:00,13.690246,0.0,14.4,5.506941,26.0,0.0
1006,2023-05-12 21:00:00,,0.0,12.959999,6.569383,20.0,0.0
1007,2023-05-12 22:00:00,,0.0,14.04,7.24486,38.0,0.0
1008,2023-05-12 23:00:00,,0.0,14.04,6.618519,38.0,0.0
1009,2023-05-13 00:00:00,,0.0,13.679999,7.729527,25.0,0.0
1010,2023-05-13 01:00:00,,0.0,12.959999,7.172949,23.7,0.0
1011,2023-05-13 02:00:00,,0.0,12.599999,8.557102,23.1,0.0
1012,2023-05-13 03:00:00,,0.0,11.159999,6.725354,59.0,0.0
1013,2023-05-13 04:00:00,,0.0,10.799999,6.638072,77.0,1.440741


##### После

In [261]:
# Fill gaps in every non-timestamp column at once using pandas' default interpolation,
# then display the same slice as before to compare.
value_cols = df.columns[1:]
df[value_cols] = df[value_cols].interpolate()

df.iloc[1003:1015]

Unnamed: 0,datetime,temperature,precipitation_total,wind_gust,wind_speed,cloud_cover_total,sunshine_duration
1004,2023-05-12 19:00:00,14.070246,0.1,18.72,6.130579,28.0,42.074078
1005,2023-05-12 20:00:00,13.690246,0.0,14.4,5.506941,26.0,0.0
1006,2023-05-12 21:00:00,13.488024,0.0,12.959999,6.569383,20.0,0.0
1007,2023-05-12 22:00:00,13.285801,0.0,14.04,7.24486,38.0,0.0
1008,2023-05-12 23:00:00,13.083579,0.0,14.04,6.618519,38.0,0.0
1009,2023-05-13 00:00:00,12.881357,0.0,13.679999,7.729527,25.0,0.0
1010,2023-05-13 01:00:00,12.679134,0.0,12.959999,7.172949,23.7,0.0
1011,2023-05-13 02:00:00,12.476912,0.0,12.599999,8.557102,23.1,0.0
1012,2023-05-13 03:00:00,12.27469,0.0,11.159999,6.725354,59.0,0.0
1013,2023-05-13 04:00:00,12.072467,0.0,10.799999,6.638072,77.0,1.440741


#### Работа со значениями

In [262]:
# Wind gust and wind speed: divide by 3.6 (presumably km/h -> m/s) and keep one decimal place.
# NOTE: the columns are overwritten in place, so re-running this cell divides them again.
KMH_PER_MS = 3.6
wind_cols = ['wind_gust', 'wind_speed']
df[wind_cols] = df[wind_cols].div(KMH_PER_MS).round(1)

# Sunshine duration, temperature and cloud cover are rounded to whole numbers.
whole_number_cols = ['sunshine_duration', 'temperature', 'cloud_cover_total']
df[whole_number_cols] = df[whole_number_cols].round(0)

df.iloc[1234:1240]

Unnamed: 0,datetime,temperature,precipitation_total,wind_gust,wind_speed,cloud_cover_total,sunshine_duration
1235,2023-05-22 10:00:00,23.0,0.0,4.6,0.9,8.0,55.0
1236,2023-05-22 11:00:00,24.0,0.0,5.1,1.1,4.0,58.0
1237,2023-05-22 12:00:00,25.0,0.0,5.0,1.4,4.0,58.0
1238,2023-05-22 13:00:00,25.0,0.1,5.0,1.9,9.0,55.0
1239,2023-05-22 14:00:00,26.0,0.6,6.0,1.0,18.0,49.0
1240,2023-05-22 15:00:00,25.0,0.5,6.4,1.1,19.0,48.0


#### Проверка на выбросы

In [263]:
# Summary statistics per column, used to eyeball min/max values for outliers.
df.describe()

Unnamed: 0,datetime,temperature,precipitation_total,wind_gust,wind_speed,cloud_cover_total,sunshine_duration
count,3672,3672.0,3672.0,3672.0,3672.0,3672.0,3672.0
mean,2023-06-16 11:30:00.000000256,18.087691,0.100218,5.801825,2.039434,40.980664,21.543573
min,2023-04-01 00:00:00,-2.0,0.0,0.8,0.0,0.0,0.0
25%,2023-05-09 05:45:00,13.0,0.0,3.7,1.2,15.0,0.0
50%,2023-06-16 11:30:00,18.0,0.0,5.1,1.7,32.0,15.0
75%,2023-07-24 17:15:00,23.0,0.0,7.3,2.6,67.0,43.0
max,2023-08-31 23:00:00,36.0,5.8,19.8,8.3,100.0,60.0
std,,6.838842,0.342384,2.899036,1.184241,31.313242,21.890226


Все значения находятся в пределах нормы, выбросов не наблюдается.

#### Выгрузка новой таблицы

In [None]:
# Units-of-measure row written as the first data row of the processed file, in column order.
# The two wind columns are labelled m/s.
new_units_of_measure = ["", "°C", "mm", "m/s", "m/s", "%", "min"]

# One-row frame with the same columns as `df`; raises if the number of units and columns differ.
first_row = pd.DataFrame([new_units_of_measure], columns=df.columns)

# Build the export frame under its own name. Rebinding `df` here would turn its columns into
# object dtype and prepend one more units row every time this cell is re-run.
df_export = pd.concat([first_row, df], ignore_index=True)

df_export.to_csv('processed_weather.csv', index=False)