In [1]:
import pandas as pd
from sklearn.cluster import KMeans

# Объединение по средним значениям за каждый день

In [2]:
# Load the cleaned air-quality table (df1) and the cleaned weather table (df2).
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/cleaned_air_data_with_aqi_and_coordinates.csv',
        '../data/processed/cleaned_weather_data.csv',
    )
)

In [3]:
# Reduce the `date` values to calendar days (datetime.date objects).
df1['date'] = pd.to_datetime(df1['date']).dt.date

In [4]:
# Derive a calendar-day `date` column from the `time` timestamps.
weather_timestamps = pd.to_datetime(df2['time'])
df2['date'] = weather_timestamps.dt.date

In [5]:
# Move `date` to the first column: take it out of the frame, then re-insert it
# at position 0.
date = df2.pop('date')
df2.insert(0, 'date', date)

In [6]:
# Remove the station name and coordinates so only `date` and the measurement
# columns remain for the per-day aggregation.
station_columns = ['station_name', 'latitude', 'longitude']
df1 = df1.drop(columns=station_columns)

In [7]:
# The raw timestamp is no longer needed once `date` has been derived from it.
df2 = df2.drop(columns=['time'])

In [8]:
# Per-day mean of every remaining df1 column, indexed by `date`.
air_by_day = df1.groupby(by='date')
df1_mean = air_by_day.mean()

In [9]:
# Per-day mean of every remaining df2 column, indexed by `date`.
weather_by_day = df2.groupby(by='date')
df2_mean = weather_by_day.mean()

In [10]:
# Inner-join the two per-day mean tables on their shared `date` index level,
# then turn `date` back into an ordinary column.
merged = df1_mean.merge(df2_mean, on='date')
merged = merged.reset_index()

In [11]:
# Per-day min / max / mean of every df1 column, laid out as
#   date, <col>_min ..., <col>_max ..., <col>_mean ...
# The suffix is attached to each aggregate directly, so the naming does not
# depend on a hard-coded count of trailing columns, and df1 is grouped only once.
df1_daily = df1.groupby('date')
df1_statistics = pd.concat(
    [
        df1_daily.min().add_suffix('_min'),
        df1_daily.max().add_suffix('_max'),
        df1_daily.mean().add_suffix('_mean'),
    ],
    axis=1,
).reset_index()

In [12]:
# Per-day min / max / mean of every df2 column, laid out as
#   date, <col>_min ..., <col>_max ..., <col>_mean ...
# The suffix is attached to each aggregate directly, so the naming does not
# depend on a hard-coded count of trailing columns, and df2 is grouped only once.
df2_daily = df2.groupby('date')
df2_statistics = pd.concat(
    [
        df2_daily.min().add_suffix('_min'),
        df2_daily.max().add_suffix('_max'),
        df2_daily.mean().add_suffix('_mean'),
    ],
    axis=1,
).reset_index()

In [13]:
# Inner-join the two per-day statistics tables on `date`.
# Both inputs carry `date` as a regular column, so the merge result already has
# a plain default index; the former trailing reset_index() only added a
# meaningless `index` column that was then written to the exported CSV.
merged_full_statistics = pd.merge(df1_statistics, df2_statistics, on='date')

In [14]:
# Write the per-day means to disk without the positional index.
merged.to_csv(path_or_buf='../data/extracted/merged_by_date.csv', index=False)

In [15]:
# Write the per-day min/max/mean statistics to disk without the positional index.
merged_full_statistics.to_csv(
    path_or_buf='../data/extracted/merged_by_date_full.csv',
    index=False,
)

# Кластеризация по расстоянию до центра

In [16]:
# Read both processed tables again so this section works on the full data
# (the earlier cells dropped the station and coordinate columns from df1).
air_csv = '../data/processed/cleaned_air_data_with_aqi_and_coordinates.csv'
weather_csv = '../data/processed/cleaned_weather_data.csv'
df1 = pd.read_csv(air_csv)
df2 = pd.read_csv(weather_csv)

In [17]:
# Reference point for the distance feature, ordered [latitude, longitude] to
# match the coordinate columns it is subtracted from.
center_latitude, center_longitude = 55.7522, 37.6156
moscow_center = [center_latitude, center_longitude]

In [18]:
# Distance of every row's station to the centre point, computed as
# |latitude - centre latitude| + |longitude - centre longitude|.
# The coordinate columns are selected by name instead of "the last two columns",
# so re-running this cell after `distance_to_center` (or any other column) has
# been appended to df1 still uses latitude and longitude.
# NOTE(review): this sums raw coordinate differences; latitude and longitude
# steps do not cover the same ground distance, so treat it as a rough proxy.
coordinates = df1[['latitude', 'longitude']]
df1['distance_to_center'] = (coordinates - moscow_center).abs().sum(axis=1)

In [19]:
# Show the smallest and largest distance as a (min, max) pair.
distance = df1['distance_to_center']
distance.min(), distance.max()

(0.02716999999999814, 0.6907520000000034)

In [20]:
# Display df1 to inspect the newly added `distance_to_center` column.
df1

Unnamed: 0,date,pm25,pm10,o3,no2,co,station_name,aqi,latitude,longitude,distance_to_center
0,2024-12-04,23.0,12.0,0.008000,0.014,1.0,"Mgu, Moscow, Moscow, Russia",73.92,55.707241,37.522410,0.138149
1,2024-12-04,13.0,12.0,0.013267,0.017,2.0,"Novokosino, Moscow, Moscow, Russia",52.89,55.734445,37.857779,0.259934
2,2024-12-04,26.0,22.0,0.006000,0.019,4.0,"Zelen_15, Moscow, Moscow, Russia",80.23,55.976936,37.149584,0.690752
3,2024-12-04,12.0,11.0,0.017667,0.021,2.0,"Narod_op, Moscow, Moscow, Russia",50.00,55.776064,37.475878,0.163586
4,2024-12-04,7.0,46.0,0.019411,0.025,3.0,"Suhar, Moscow, Moscow, Russia",42.59,55.773757,37.627445,0.033402
...,...,...,...,...,...,...,...,...,...,...,...
1786,2024-06-06,39.0,16.0,0.014315,0.017,1.0,"Veshnyaki, Moscow, Moscow, Russia",109.62,55.719959,37.795549,0.212190
1787,2024-06-06,54.0,29.0,0.026000,0.012,1.0,"bazovskaya, Moscow, Moscow, Russia",146.55,55.877509,37.508548,0.232361
1788,2024-06-06,46.0,23.0,0.013267,0.020,2.0,"Novokosino, Moscow, Moscow, Russia",126.85,55.734445,37.857779,0.259934
1789,2024-06-06,31.0,11.0,0.020000,0.016,1.0,"Shabol, Moscow, Moscow, Russia",90.75,55.715698,37.605238,0.046864


In [21]:
# k-means with 12 clusters. The algorithm starts from randomly chosen centroids,
# so the seed is fixed to make the resulting cluster labels reproducible
# between runs of the notebook.
model = KMeans(n_clusters=12, random_state=42)

In [22]:
# Fit on explicitly named feature columns. Picking "the last three columns" by
# position silently selects different features once `cluster` has been appended
# to df1 and this cell is run again.
cluster_features = ['latitude', 'longitude', 'distance_to_center']
model.fit(df1[cluster_features].values)

In [23]:
# Predict a cluster label for every row from the same named features the model
# is fitted on. Selecting by name (not by position) keeps the inputs correct
# even after `cluster` has been appended to df1.
cluster_features = ['latitude', 'longitude', 'distance_to_center']
clusters = model.predict(df1[cluster_features].values)

In [24]:
# Attach the predicted cluster label of each row as a new `cluster` column.
df1 = df1.assign(cluster=clusters)

In [25]:
# Write the clustered table to disk without the positional index.
df1.to_csv(path_or_buf='../data/extracted/df1_with_clusters.csv', index=False)

In [26]:
# Experimental feature: ratio of the per-day pm25 mean to the per-day pm10 mean.
merged['test'] = merged['pm25'].div(merged['pm10'])

In [27]:
# Pairwise correlations of every column after the leading `date` column,
# rendered as a colour-graded table.
correlations = merged.iloc[:, 1:].corr()
correlations.style.background_gradient(cmap='coolwarm')

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,pm25,pm10,o3,no2,co,aqi,temp,dwpt,rhum,prcp,snow,wdir,wspd,pres,coco,test
pm25,1.0,0.647347,0.244876,0.491791,0.464284,0.987029,0.493813,0.41562,-0.394532,,,-0.536992,-0.553995,0.249098,-0.398848,0.12366
pm10,0.647347,1.0,0.353119,0.77134,0.687541,0.637912,0.420491,0.239548,-0.630884,,,-0.483563,-0.417718,0.420864,-0.653168,-0.564791
o3,0.244876,0.353119,1.0,0.197536,0.244068,0.260634,0.621833,0.525316,-0.513196,,,-0.155618,-0.228373,0.02948,-0.448866,-0.281269
no2,0.491791,0.77134,0.197536,1.0,0.764142,0.466707,0.143237,-0.007369,-0.427353,,,-0.419184,-0.458655,0.506232,-0.548589,-0.411131
co,0.464284,0.687541,0.244068,0.764142,1.0,0.444221,0.219672,0.144366,-0.268667,,,-0.323536,-0.423269,0.360887,-0.477781,-0.345705
aqi,0.987029,0.637912,0.260634,0.466707,0.444221,1.0,0.534521,0.458762,-0.408911,,,-0.532676,-0.551862,0.220869,-0.399121,0.134514
temp,0.493813,0.420491,0.621833,0.143237,0.219672,0.534521,1.0,0.945698,-0.584224,,,-0.288115,-0.335814,-0.213604,-0.451203,-0.185166
dwpt,0.41562,0.239548,0.525316,-0.007369,0.144366,0.458762,0.945698,1.0,-0.291548,,,-0.213018,-0.322979,-0.341245,-0.232065,-0.048536
rhum,-0.394532,-0.630884,-0.513196,-0.427353,-0.268667,-0.408911,-0.584224,-0.291548,1.0,,,0.293669,0.145971,-0.217002,0.752246,0.446633
prcp,,,,,,,,,,,,,,,,
