In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.cluster import KMeans
import haversine as hs
from haversine import Unit
import numpy as np
import pickle


In [None]:
# Folder holding the pickled inputs, given relative to the current working
# directory (i.e. wherever the notebook kernel was started).
input_data_folder_base = Path("../results/")
input_data_folder = Path.cwd().joinpath(input_data_folder_base)

# File names used inside that folder.
overall_collection_file_name = "overall_collection_2023.p"
key_parameter_file_name = "key_parameter_2023.p"
cluster_positions_file_name = "cluster_positions_2023.p"

print(f"Reading raw pandas files from \n    {input_data_folder}")

# Results are written back into the same folder the inputs are read from.
output_data_folder = input_data_folder
print(f"Writing merged pandas file to \n    {output_data_folder}")

In [None]:
# Load the collected results frame.
# NOTE: unpickling can execute arbitrary code -- only read files you trust.
df_results = pd.read_pickle(
    input_data_folder.joinpath(overall_collection_file_name)
)
df_results.head(3)

In [None]:
# Discard every row that has a missing value in any column (modifies the
# frame in place; the original index labels of the remaining rows are kept).
df_results.dropna(axis=0, how="any", inplace=True)

In [None]:
# Number of k-means clusters to fit for each experiment location.
cluster_count = dict(
    Balbina=3,
    Jandira=3,
    Caldeirao=6,
    Iranduba=4,
)

# Per-location centroid storage; every entry starts as a NaN placeholder and
# is overwritten once the clustering for that location has been run.
cluster_position = dict.fromkeys(cluster_count, np.nan)

def run_k_means(location_name, df_sep_pos, n_clusters, random_state=0):
    """Fit k-means on the estimated positions and return the cluster centres.

    Parameters
    ----------
    location_name : str
        Name of the experiment location. Not used by the computation; kept so
        existing call sites keep working.
    df_sep_pos : pandas.DataFrame
        Frame providing the columns ``lat_est`` and ``lon_est``.
    n_clusters : int
        Number of clusters to fit.
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. A fixed default makes the centroid
        order -- and therefore the cluster ids derived from it -- repeatable
        across runs. Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        ``cluster_centers_`` of the fitted model: one row per cluster with
        the columns in the order (``lat_est``, ``lon_est``).
    """
    positions = df_sep_pos[['lat_est', 'lon_est']]
    kmeans = KMeans(
        n_clusters=n_clusters,
        n_init=10,
        random_state=random_state,
    ).fit(positions)
    return kmeans.cluster_centers_

def dist(lat_0, lon_0, lat_1, lon_1):
    """Return the haversine distance in metres between two (lat, lon) points."""
    start_point = (lat_0, lon_0)
    end_point = (lat_1, lon_1)
    return hs.haversine(start_point, end_point, unit=Unit.METERS)

def _nearest_centroid_id(lat, lon, centroids):
    """Return the index of the centroid with the smallest haversine distance."""
    distances = [dist(lat, lon, centroid[0], centroid[1])
                 for centroid in centroids]
    # argmin returns the first minimum, i.e. ties resolve to the lowest id.
    return int(np.argmin(distances))


# Rows that belong to no configured location, or that were not recorded while
# a measurement was running, keep NaN as their position id.
df_results["position"] = np.nan
for location_name, n_clusters in cluster_count.items():
    print(f"{location_name} ...")
    print(f"   ... running cluster analysis")
    df_sep_pos = df_results[(df_results.experiment_location == location_name) &
                            (df_results.meas_running)]
    centroids = run_k_means(location_name, df_sep_pos, n_clusters)
    print(centroids)
    cluster_position[location_name] = centroids

    print(f"   ... calculating distances")
    # Only the rows that took part in the clustering need an id, so walk the
    # filtered subset instead of re-scanning the whole frame per location.
    for id_row, row in df_sep_pos.iterrows():
        df_results.at[id_row, "position"] = _nearest_centroid_id(
            row['lat_est'], row['lon_est'], centroids)

In [None]:
# Manual correction: the rows of one specific measurement file get their own
# position id, and their estimated coordinates are appended to the Jandira
# centroids as an additional entry.
# NOTE(review): id 3.0 is the next free id only while Jandira is clustered
# into 3 groups (ids 0-2) -- confirm if the cluster count changes. Re-running
# this cell appends the extra centroid again.
manual_meas_file = "040323-151128-ADC.p"
manual_position_id = 3.0

is_manual_row = df_results["corresponding_meas_file"] == manual_meas_file
if not is_manual_row.any():
    raise ValueError(f"No row found for measurement file {manual_meas_file!r}")

df_results.loc[is_manual_row, "position"] = manual_position_id

# Select the first matching row by position (.iloc) so the lookup does not
# depend on which index labels survived the earlier filtering.
manual_lat_lon = (
    df_results.loc[is_manual_row, ["lat_est", "lon_est"]].iloc[0].to_numpy()
)
cluster_position['Jandira'] = np.append(
    cluster_position['Jandira'], [manual_lat_lon], axis=0)
cluster_position
           

In [None]:
# Store the frame (now including the "position" column) under the overall
# collection file name.
# NOTE(review): with the output folder set to the input folder this replaces
# the file that was read at the top of the notebook -- confirm this is intended.
overall_collection_path = Path(overall_collection_file_name)
result_file = output_data_folder.joinpath(overall_collection_path)
print(f"... saving common results to {overall_collection_path}")
df_results.to_pickle(result_file)

In [None]:
# Pickle the per-location centroid dictionary into the output folder.
result_file = output_data_folder.joinpath(cluster_positions_file_name)
with result_file.open('wb') as cluster_file:
    pickle.dump(cluster_position, cluster_file)