In [23]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error
import os

In [24]:
# Root folder of the automated-labeling experiments (machine-specific absolute path).
exp_folder = "/Users/alexandra/Nextcloud-HTW/SHARED/SurfaceAI/data/mapillary_images/automated_labeling_experiments"

# Sub-folders of the experiment root; kept as plain strings.
gpt_folder, embedding_folder, annotations_folder = (
    os.path.join(exp_folder, sub_folder)
    for sub_folder in ("gpt_experiments", "embedding_experiments", "annotations_combined")
)

In [25]:
# Combine all annotation exports of V101 (files starting with "exp1") and V200
# (files starting with "exp2") into one de-duplicated CSV per dataset version.

# Folders holding the per-experiment annotation CSVs; identical for both experiments.
folder_paths = [os.path.join(gpt_folder, "annotations"), os.path.join(embedding_folder, "annotations")]

# NOTE: `output_folder` is the name of the output *file* inside annotations_combined.
for exp_id, output_folder in [["exp1", "V101.csv"], ["exp2", "V200.csv"]]:

    # Full paths of all matching CSV files. Each path is built exactly once, from the
    # folder the file was listed in. Names are sorted because os.listdir returns them
    # in arbitrary order, which would make the row order of the output non-deterministic.
    csv_files = [
        os.path.join(folder_path, file)
        for folder_path in folder_paths
        for file in sorted(os.listdir(folder_path))
        if file.startswith(exp_id) and file.endswith('.csv')
    ]

    # Read everything first and concatenate once instead of growing a frame per file.
    frames = [pd.read_csv(file_path) for file_path in csv_files]
    if frames:
        combined_data = pd.concat(frames, ignore_index=True).drop_duplicates()
    else:
        # Keep the previous behaviour (an empty file is written) but make it visible.
        print(f"No annotation files found for {exp_id}; writing an empty {output_folder}")
        combined_data = pd.DataFrame()

    # TODO: append yes / no file for V101

    # Save the combined data to a new CSV file
    combined_data.to_csv(os.path.join(exp_folder, "annotations_combined", output_folder), index=False)

In [35]:
def smoothness_to_int(smoothness):
    """Translate a smoothness label into its ordinal rank.

    "excellent" maps to 1, "good" to 2, "intermediate" to 3, "bad" to 4 and
    "very_bad" to 5. Every other value (unknown label, None, NaN, ...) gives None.
    """
    # Labels listed from best to worst; the rank is the 1-based position.
    ranked_labels = ("excellent", "good", "intermediate", "bad", "very_bad")
    for rank, label in enumerate(ranked_labels, start=1):
        if smoothness == label:
            return rank
    return None
    
def exp_results(exp_id, annotations_folder, folder, class_selection, exp_type):
    """Count how often predicted (surface, smoothness) classes agree with the annotations.

    For every entry of ``class_selection`` the matching results CSV is read from
    ``<folder>/results``. The rows predicted as that class are joined with the
    annotation file of the dataset version (``V101.csv`` for experiment 1,
    ``V200.csv`` for experiment 2) on ``image_id`` and counted.

    Parameters
    ----------
    exp_id : int
        Experiment number, 1 or 2.
    annotations_folder : str
        Folder containing ``V101.csv`` / ``V200.csv``. The file needs the columns
        ``image``, ``surface``, ``smoothness`` and ``nostreet``.
    folder : str
        Experiment folder with a ``results`` sub-folder. For experiment 1 the file
        name pattern ``experiment_1_V101_<surface>_<smoothness>.csv`` is used when the
        substring "gpt" occurs in this path, otherwise
        ``V101_effnet_<surface>_<smoothness>_gpt_combined.csv``.
    class_selection : iterable of sequences
        Entries ``(surface, smoothness)`` or ``(surface, smoothness, batch_id)``.
        Experiment 2 reads ``experiment_2_V200_<surface>_batch_<batch_id>.csv``.
    exp_type : str
        Label copied into the ``exp_type`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``class_selection`` with the columns ``exp_type``,
        ``exp``, ``batch_id``, ``surface``, ``smoothness``, ``total_n`` (rows in the
        results file), ``gpt_pred_n`` (rows predicted as the class),
        ``exclusions (nostreet)`` (predicted rows annotated as nostreet), ``excl. %``,
        ``tp`` (predicted rows whose annotation matches surface and smoothness),
        ``tp/n_pred``, ``gpt_precision(only valid)`` and ``hits`` (``tp / total_n``).
        All ratios are percentages rounded to two decimals. A ratio with a zero
        denominator is NaN instead of raising.

    Raises
    ------
    ValueError
        If ``exp_id`` is neither 1 nor 2.
    """
    if exp_id not in (1, 2):
        # Fail early with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"exp_id must be 1 or 2, got {exp_id!r}")
    ds_version = "101" if exp_id == 1 else "200"

    # Annotations file of the entire dataset.
    annot = pd.read_csv(f"{annotations_folder}/V{ds_version}.csv")
    # Images flagged as "nostreet" get that marker as surface and smoothness label.
    is_nostreet = annot.nostreet.notna()
    annot.loc[is_nostreet, "surface"] = "nostreet"
    annot.loc[is_nostreet, "smoothness"] = "nostreet"
    # The image id is the file name without directory and ".jpg" extension.
    annot["image_id"] = annot.image.apply(lambda path: int(path.split("/")[-1].split(".jpg")[0]))
    # Loop-invariant lookup table: image_id -> annotated surface / smoothness.
    annot_labels = annot[["image_id", "surface", "smoothness"]].set_index("image_id")

    pred_folder = os.path.join(folder, "results")
    count_columns = ["exp_type", "exp", "batch_id", "surface", "smoothness", "total_n",
                     "gpt_pred_n", "exclusions (nostreet)", "excl. %", "tp"]
    rows = []

    for cl in class_selection:
        surface = cl[0]
        smoothness = cl[1]
        batch_id = cl[2] if len(cl) > 2 else None

        if exp_id == 1:
            if "gpt" in folder:
                results_file = f"experiment_{exp_id}_V{ds_version}_{surface}_{smoothness}.csv"
            else:
                results_file = f"V101_effnet_{surface}_{smoothness}_gpt_combined.csv"
        else:
            # Experiment 2 stores one file per surface and batch (all smoothness levels).
            results_file = f"experiment_{exp_id}_V{ds_version}_{surface}_batch_{batch_id}.csv"

        results = pd.read_csv(f"{pred_folder}/{results_file}")
        total_n = len(results)

        predicted = results[(results.preselection_type == surface) & (results.prediction == smoothness)]
        pred_n = len(predicted)
        predicted = predicted.set_index("image_id").join(annot_labels, how="left").reset_index()

        exclusions_n = int((predicted.surface == "nostreet").sum())
        # A class without any prediction has no defined exclusion rate.
        exclusions_perc = round(exclusions_n / pred_n * 100, 2) if pred_n else float("nan")
        tp = int(((predicted.preselection_type == predicted.surface)
                  & (predicted.prediction == predicted.smoothness)).sum())

        rows.append({
            "exp_type": exp_type,
            "exp": exp_id,
            "batch_id": batch_id,
            "surface": surface,
            "smoothness": smoothness,
            "total_n": total_n,
            "gpt_pred_n": pred_n,
            "exclusions (nostreet)": exclusions_n,
            "excl. %": exclusions_perc,
            "tp": tp,
        })

    # TODO: a per-class RMSE between predicted and annotated smoothness was drafted here.
    # It only looked at the last class and its result was never returned; if it is
    # needed, compute it per class inside the loop and add it as a column.

    # Building the frame from records keeps numeric dtypes, so a zero denominator below
    # yields NaN instead of raising ZeroDivisionError.
    counts = pd.DataFrame(rows, columns=count_columns)

    counts["tp/n_pred"] = (counts.tp / counts.gpt_pred_n * 100).astype(float).round(2)
    counts["gpt_precision(only valid)"] = (
        counts.tp / (counts.gpt_pred_n - counts["exclusions (nostreet)"]) * 100
    ).astype(float).round(2)
    counts["hits"] = (counts.tp / counts.total_n * 100).astype(float).round(2)
    return counts

In [40]:
# Experiment 1: one (surface, smoothness) entry per class, evaluated for both approaches.
class_selection = (
    ["paving_stones", "bad"],
    ["paving_stones", "intermediate"],
    ["asphalt", "bad"],
)
gpt_exp1 = exp_results(1, annotations_folder, gpt_folder, class_selection, exp_type="GPT-4o")
embedding_exp1 = exp_results(1, annotations_folder, embedding_folder, class_selection, exp_type="SimS")

# Experiment 2: the same three classes, each with batch 1 and batch 2.
class_selection = tuple(
    [surface, smoothness, batch_id]
    for surface, smoothness in (("paving_stones", "bad"), ("paving_stones", "intermediate"), ("asphalt", "bad"))
    for batch_id in (1, 2)
)
gpt_exp2 = exp_results(2, annotations_folder, gpt_folder, class_selection, exp_type="GPT-4o")

In [41]:
# Stack the three result tables. ignore_index=True gives every row a unique label;
# without it the row labels of the individual tables repeat, which makes label-based
# selection ambiguous.
results = pd.concat([gpt_exp1, gpt_exp2, embedding_exp1], axis=0, ignore_index=True)
results

Unnamed: 0,exp_type,exp,batch_id,surface,smoothness,total_n,gpt_pred_n,exclusions (nostreet),excl. %,tp,tp/n_pred,gpt_precision(only valid),hits
0,GPT-4o,1,,paving_stones,bad,208,17,4,23.53,11,64.71,84.62,5.29
0,GPT-4o,1,,paving_stones,intermediate,1000,340,100,29.41,137,40.29,57.08,13.7
0,GPT-4o,1,,asphalt,bad,1014,185,45,24.32,73,39.46,52.14,7.2
0,GPT-4o,2,1.0,paving_stones,bad,712,54,48,88.89,2,3.7,33.33,0.28
0,GPT-4o,2,2.0,paving_stones,bad,712,11,8,72.73,2,18.18,66.67,0.28
0,GPT-4o,2,1.0,paving_stones,intermediate,712,162,63,38.89,46,28.4,46.46,6.46
0,GPT-4o,2,2.0,paving_stones,intermediate,712,175,52,29.71,54,30.86,43.9,7.58
0,GPT-4o,2,1.0,asphalt,bad,1998,55,39,70.91,5,9.09,31.25,0.25
0,GPT-4o,2,2.0,asphalt,bad,1995,33,14,42.42,7,21.21,36.84,0.35
0,SimS,1,,paving_stones,bad,208,204,74,36.27,21,10.29,16.15,10.1


In [118]:
# One label per class, e.g. "asphalt-bad".
results["class"] = results["surface"] + "-" + results["smoothness"]

# Keep rows without a batch id and rows of batch 2, then spread both metrics over
# (exp_type, exp) columns with one row per class.
paper_table = results.loc[results["batch_id"].isna() | results["batch_id"].eq(2)].pivot(
    index=["class"],
    columns=["exp_type", "exp"],
    values=["tp/n_pred", "hits"],
)

paper_table

Unnamed: 0_level_0,tp/n_pred,tp/n_pred,tp/n_pred,hits,hits,hits
exp_type,GPT-4o,GPT-4o,SimS,GPT-4o,GPT-4o,SimS
exp,1,2,1,1,2,1
class,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
asphalt-bad,39.46,21.21,20.38,7.2,0.35,4.24
paving_stones-bad,64.71,18.18,10.29,5.29,0.28,10.1
paving_stones-intermediate,40.29,30.86,30.74,13.7,7.58,7.9


In [120]:
# One label per class, e.g. "asphalt-bad".
results["class"] = results["surface"] + "-" + results["smoothness"]

# Rows without a batch id plus rows of batch 2, pivoted to one row per class.
paper_table = results.loc[results["batch_id"].isna() | results["batch_id"].eq(2)].pivot(
    index=["class"],
    columns=["exp_type", "exp"],
    values=["tp/n_pred", "hits"],
)

# Remove the metric level first, then the experiment number (position 1 of the two
# remaining levels); only the exp_type level is left afterwards.
paper_table.columns = paper_table.columns.droplevel(0).droplevel(1)
paper_table = paper_table.reset_index()

# Positional reorder: "class" first, then for each configuration its tp/n_pred column
# followed by its hits column. This relies on the column order produced by the pivot.
paper_table = paper_table.iloc[:, [0, 1, 4, 2, 5, 3, 6]]
paper_table.columns = [
    "class",
    "OSMT+CM+GPT-4o", "",
    "OSMT+GPT-4o", "",
    "SimS+CM+GPT-4o", "",
]
paper_table

Unnamed: 0,class,OSMT+CM+GPT-4o,Unnamed: 3,OSMT+GPT-4o,Unnamed: 5,SimS+CM+GPT-4o,Unnamed: 7
0,asphalt-bad,39.46,7.2,21.21,0.35,20.38,4.24
1,paving_stones-bad,64.71,5.29,18.18,0.28,10.29,10.1
2,paving_stones-intermediate,40.29,13.7,30.86,7.58,30.74,7.9


In [122]:
# Emit the table as LaTeX source: floats with two decimals, row index omitted.
print(paper_table.to_latex(index=False, float_format="%.2f"))

\begin{tabular}{lrrrrrr}
\toprule
class & OSMT+CM+GPT-4o &  & OSMT+GPT-4o &  & SimS+CM+GPT-4o &  \\
\midrule
asphalt-bad & 39.46 & 7.20 & 21.21 & 0.35 & 20.38 & 4.24 \\
paving_stones-bad & 64.71 & 5.29 & 18.18 & 0.28 & 10.29 & 10.10 \\
paving_stones-intermediate & 40.29 & 13.70 & 30.86 & 7.58 & 30.74 & 7.90 \\
\bottomrule
\end{tabular}



## Compute OSMT for V101

In [167]:
# Annotations of V101: derive the numeric image id from the file name and mark
# images flagged as nostreet in both label columns.
annot_v101 = pd.read_csv(f"{annotations_folder}/V101.csv")
annot_v101.loc[annot_v101["nostreet"].notna(), ["surface", "smoothness"]] = "nostreet"
annot_v101["image_id"] = annot_v101["image"].apply(
    lambda path: int(str.split(str.split(path, "/")[-1], ".jpg")[0])
)

# TODO: add - yes / no

annot_v101 = annot_v101.loc[:, ["image_id", "surface", "smoothness"]]

# Image selection metadata of V101, reduced to the id and the two *_clean label columns.
osm_tag_v101 = pd.read_csv(
    "/Users/alexandra/Nextcloud-HTW/SHARED/SurfaceAI/data/mapillary_images/training/V101/metadata/train_image_selection_metadata.csv"
)[["id", "surface_clean", "smoothness_clean"]]

# Left join: every annotated image keeps its row and gains the metadata labels.
annot_v101 = annot_v101.set_index("image_id").join(osm_tag_v101.set_index("id"), how="left")

# n: annotated images per (surface_clean, smoothness_clean) group.
n = annot_v101.groupby(["surface_clean", "smoothness_clean"]).size()
# tp: images of each group whose annotation equals both *_clean labels.
tp = (
    annot_v101[
        annot_v101["surface"].eq(annot_v101["surface_clean"])
        & annot_v101["smoothness"].eq(annot_v101["smoothness_clean"])
    ]
    .groupby(["surface_clean", "smoothness_clean"])
    .size()
)

In [168]:
# Per (surface_clean, smoothness_clean) group: matching annotations (tp), group size (n)
# and their ratio in percent. A group without a single match is absent from `tp`;
# reindexing on the groups of `n` fills it with 0 so its ratio is 0.0 instead of NaN.
osmt = pd.DataFrame({"tp": tp.reindex(n.index, fill_value=0), "n": n})
osmt["tp/n"] = (osmt["tp"] / osmt["n"] * 100).round(2)
osmt

Unnamed: 0_level_0,Unnamed: 1_level_0,tp,n,tp/n
surface_clean,smoothness_clean,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
asphalt,bad,88,354,24.86
paving_stones,bad,21,207,10.14
paving_stones,intermediate,165,510,32.35


In [15]:
# def plot_error_hist(df_no_na, surface, smoothness):
#     print(f"Class {surface} - {smoothness}")
#     temp = df_no_na[(df_no_na.preselection_type == surface) & (df_no_na.prediction == smoothness)]
#     plt.hist(temp.squared_error)

In [16]:
#plot_error_hist(df_no_na, "paving_stones", "bad")

In [17]:
#plot_error_hist(df_no_na, "paving_stones", "intermediate")

In [18]:
#plot_error_hist(df_no_na, "asphalt", "bad")

In [19]:
#df_old = pd.read_csv("/Users/alexandra/Nextcloud-HTW/SHARED/SurfaceAI/data/mapillary_images/automated_labeling_experiments/gpt_experiments/results/experiment_2_V200_paving_stones_batch_1.csv")
#df_old = df_old[df_old.prediction == "intermediate"]
#df_new = pd.read_csv("/Users/alexandra/Nextcloud-HTW/SHARED/SurfaceAI/data/mapillary_images/automated_labeling_experiments/gpt_experiments/results/experiment_2_V200_paving_stones_batch_2.csv")
#df_new = df_new[df_new.prediction == "intermediate"]
#missings = df_new[~df_new.image_id.isin(df_old.image_id)]
#missings.to_csv("missing_paving_stones_intermediate.csv")