In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import ast

# Paths to the raw benchmark result files. The loaders below read them line by line
# and parse every line with ast.literal_eval.
spark_file = r'./results/pyspark_results.txt'
python_file = r'./results/python_results.txt'

In [None]:
def load_spark_data(spark_file):
    """Load the PySpark benchmark results into a flat DataFrame.

    Every non-blank line of ``spark_file`` is parsed with ``ast.literal_eval``.
    The nested ``results`` entry of each record is expanded into its own
    columns, exact duplicate rows are dropped, and the Spark-specific score
    names are renamed to ``micro_f1`` / ``macro_f1``.

    A unified ``f1`` column is added: ``modified_f1`` where it is available,
    otherwise ``macro_f1``.

    Parameters
    ----------
    spark_file : str
        Path to the text file holding one Python literal per line.

    Returns
    -------
    tuple
        ``(df, 'spark')`` - the flattened DataFrame and a tag identifying the
        result set.
    """
    with open(spark_file, 'r') as file:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped: ast.literal_eval('') raises SyntaxError.
        data = [ast.literal_eval(line.strip()) for line in file if line.strip()]

    # Get results in a pandas df
    df = pd.DataFrame(data)

    # Expand the results column
    expanded_df = pd.json_normalize(df['results'])

    # Concatenate the expanded columns with the original DataFrame
    df = (pd.concat([df, expanded_df], axis=1)
        .drop('results', axis=1)
        .drop_duplicates())

    # For spark only
    rename_cols = {
        'Micro F1 Score': 'micro_f1',
        'Macro F1 Score': 'macro_f1'
    }
    df = df.rename(columns = rename_cols)

    # Prefer modified_f1; fall back to macro_f1 where it is missing, or for
    # every row when no record reported modified_f1 at all.
    if 'modified_f1' in df.columns:
        df['f1'] = df['modified_f1'].fillna(df['macro_f1'])
    else:
        df['f1'] = df['macro_f1']

    return df, 'spark'

def load_python_data(python_file):
    """Load the pure-Python benchmark results into a flat DataFrame.

    Every non-blank line of ``python_file`` is parsed with
    ``ast.literal_eval``. Records wrapped as ``{<method>: {...}}``, where
    ``<method>`` is ``BinaryRelevance``, ``LabelPowerset`` or
    ``ClassifierChain``, are unwrapped and the method name is prepended to the
    classifier name (``"<method>-<classifier>"``). All other records are used
    as they are.

    The nested ``results`` entry is expanded into columns, exact duplicate
    rows are dropped, a unified ``f1`` column (``modified_f1`` falling back to
    ``macro_f1``, rounded to 4 decimals) is added, and the misspelled
    ``frequncy_threshold`` column is renamed to ``frequency_threshold``.

    Parameters
    ----------
    python_file : str
        Path to the text file holding one Python literal per line.

    Returns
    -------
    tuple
        ``(df, 'python')`` - the flattened DataFrame and a tag identifying the
        result set.
    """
    wrapper_methods = ('BinaryRelevance', 'LabelPowerset', 'ClassifierChain')

    with open(python_file, 'r') as file:
        # Blank lines (e.g. a trailing newline at the end of the file) are
        # skipped: ast.literal_eval('') raises SyntaxError.
        data = [ast.literal_eval(line.strip()) for line in file if line.strip()]

    transformed_data = []
    for item in data:
        # Look up the key that actually matched instead of assuming that the
        # method name is the first key of the record.
        method = next((name for name in wrapper_methods if name in item), None)
        if method is not None:
            # Flatten {'<method>': {...}} into {'method': '<method>', ...}
            new_dict = {
                'method': method,
                **item[method]
            }
        else:
            new_dict = item

        transformed_data.append(new_dict)

    # Get results in a pandas df
    df = pd.DataFrame(transformed_data)

    # Expand the results column
    expanded_df = pd.json_normalize(df['results'])

    # Concatenate the expanded columns with the original DataFrame
    df = (pd.concat([df, expanded_df], axis=1)
        .drop('results', axis=1)
        .drop_duplicates())

    # Prefer modified_f1; fall back to macro_f1 where it is missing, or for
    # every row when no record reported modified_f1 at all.
    if 'modified_f1' in df.columns:
        f1 = df['modified_f1'].fillna(df['macro_f1'])
    else:
        f1 = df['macro_f1']
    df['f1'] = f1.round(4)

    # The 'method' column only exists when at least one record was wrapped.
    if 'method' in df.columns:
        has_method = df['method'].notna()
        prefixed = df['method'].astype(str) + '-' + df['classifier'].astype(str)
        # Keep the plain classifier name for rows without a method.
        df['classifier'] = df['classifier'].where(~has_method, prefixed)
        df = df.drop('method', axis=1)

    df = df.rename(columns = {'frequncy_threshold': 'frequency_threshold'})

    return df, 'python'

In [None]:
# Get results data: load exactly one of the two result sets.
# `which` ('spark' or 'python') is checked by later cells to switch plotting behaviour.
# To analyse the Spark results instead, swap the two lines below:
# df, which = load_spark_data(spark_file)
df, which = load_python_data(python_file)
df

First, we plot the performance (F1) of each method against the other methods using standard parameter values: 0.5 for the frequency threshold, 5 for k in kNN and 10 for k (the number of clusters) in k-Means. We use the largest number of training points that all methods have in common (i.e. if method X has 5k train points as its max and method Y has 1k points as its max, we take 1k).

Then, for each method we want to have the following plots:

1. Train and prediction time vs number of points.

2. F1 vs number of points.

3. F1 vs hyperparameters (frequency threshold, k in kNN or k-Means)

## F1 with standard parameter values for all methods

In [None]:
# Standard hyperparameter values used when comparing the methods.
# Compared against the 'frequency_threshold' column.
STD_FREQ_THRESHOLD = 0.5
# Compared against the 'k' column of the NearestNeighbors rows.
STD_K = 5
# Compared against the 'num_clusters' column of the KMeansFreq rows.
STD_NUM_CLUSTERS = 10

def get_freq_mask(df):
    """Return a boolean mask of the rows run with the standard frequency threshold."""
    # STD_FREQ_THRESHOLD is only read, so no `global` declaration is required.
    uses_std_threshold = df['frequency_threshold'] == STD_FREQ_THRESHOLD
    return uses_std_threshold

def get_null_freq_mask(df):
    """Return a boolean mask of the rows that have no frequency threshold set."""
    thresholds = df['frequency_threshold']
    return thresholds.isna()

def get_kmeans_mask(df):
    """Return a boolean mask of the KMeansFreq rows run with the standard settings.

    A row is selected when its classifier is 'KMeansFreq', its number of
    clusters equals STD_NUM_CLUSTERS and its frequency threshold is the
    standard one.
    """
    std_threshold = get_freq_mask(df)
    is_kmeans = df['classifier'] == 'KMeansFreq'
    has_std_clusters = df['num_clusters'] == STD_NUM_CLUSTERS
    return is_kmeans & has_std_clusters & std_threshold

def get_knn_mask(df):
    """Return a boolean mask of the NearestNeighbors rows run with the standard settings.

    A row is selected when its classifier is 'NearestNeighbors', its k equals
    STD_K and its frequency threshold is the standard one.
    """
    std_threshold = get_freq_mask(df)
    is_knn = df['classifier'] == 'NearestNeighbors'
    has_std_k = df['k'] == STD_K
    return is_knn & has_std_k & std_threshold

def get_max_common_train_points(df):
    """Return the largest number of train points that every classifier reached.

    This is the smallest of the per-classifier maxima of 'train_points'.
    """
    largest_per_classifier = df.groupby('classifier')['train_points'].max()
    return largest_per_classifier.min()

def get_train_points_mask(df):
    """Return a boolean mask of the rows trained on the maximum common number of points."""
    return df['train_points'] == get_max_common_train_points(df)


def get_overall_performance_df(df):
    """Build the (classifier, f1) table used for the overall comparison plot.

    Only rows trained on the maximum common number of points are considered.
    From those, the KMeansFreq and NearestNeighbors rows with the standard
    parameter values and all rows without a frequency threshold are stacked
    in that order, reduced to the 'classifier' and 'f1' columns and sorted by
    ascending f1.
    """
    common_df = df[get_train_points_mask(df)]

    # One mask per family of methods, evaluated on the already-filtered frame.
    selection_masks = (
        get_kmeans_mask(common_df),
        get_knn_mask(common_df),
        get_null_freq_mask(common_df),
    )

    stacked = pd.concat([common_df[mask] for mask in selection_masks]).reset_index()
    return stacked[['classifier', 'f1']].sort_values(by = 'f1')


In [None]:
# Table of (classifier, f1) for the standard parameter values
plot_df = get_overall_performance_df(df)

# Horizontal bar chart of the overall performance
import plotly.graph_objects as go

# One bar per classifier; the F1 value is printed inside each bar.
f1_bars = go.Bar(
    x=plot_df['f1'],
    y=plot_df['classifier'],
    orientation='h',
    text=plot_df['f1'],
    textposition='inside',
)

fig = go.Figure()
fig.add_trace(f1_bars)
fig.update_layout(
    height=500,
    title='Overall performance for standard parameter values',
    xaxis_title='F1 Score',
    yaxis_title='Classifier',
)

fig.show()


## Train/Prediction time vs Number of points

In [None]:
def runtime_vs_num_points_plot(df, group_col, time_col):
    """Return the mean of ``time_col`` per number of train points and group.

    The result is a wide DataFrame indexed by 'train_points' with one column
    per distinct value of ``group_col``; each cell holds the average
    ``time_col`` of the matching rows.
    """
    # Average the timing for every (train_points, group) pair ...
    mean_times = df.groupby(['train_points', group_col])[time_col].mean()

    # ... and move the group level from the row index into the columns.
    return mean_times.unstack(group_col)

### Label Powerset

In [None]:
# Train and prediction time vs number of points for the base methods,
# i.e. the rows that have no frequency threshold.
base_methods_mask = get_null_freq_mask(df)
base_methods_df = df[base_methods_mask]
group_col = 'classifier'

# (timing column, label used in the axis label and title)
for time_col, time_label in [('train_time', 'Train'), ('prediction_time', 'Prediction')]:
    plot_df = runtime_vs_num_points_plot(base_methods_df, group_col, time_col)

    # Grouped bar plot: one group per number of points, one bar per classifier
    ax = plot_df.plot(kind='bar')
    plt.xlabel('Number of points')
    plt.ylabel(f'{time_label} Time')
    # The bars are grouped by classifier here, not by frequency threshold.
    plt.title(f'{time_label} time vs Number of points and Classifier')

    # Keep the x-axis tick labels horizontal
    plt.xticks(rotation=0)

    plt.show()


### k-Means

In [None]:
# Train and prediction time vs number of points for KMeansFreq with the
# standard number of clusters, one bar per frequency threshold.
group_col = 'frequency_threshold'
kmeans_df = df[(df['classifier'] == 'KMeansFreq') & (df['num_clusters'] == STD_NUM_CLUSTERS)]

# (timing column, label used in the axis label and title)
for time_col, time_label in [('train_time', 'Train'), ('prediction_time', 'Prediction')]:
    plot_df = runtime_vs_num_points_plot(kmeans_df, group_col, time_col)

    # Grouped bar plot
    ax = plot_df.plot(kind='bar')
    plt.xlabel('Number of points')
    plt.ylabel(f'{time_label} Time')
    plt.title(f'{time_label} time vs Number of points and Frequency threshold')

    # Keep the x-axis tick labels horizontal
    plt.xticks(rotation=0)

    plt.show()


### kNN

In [None]:
# Train and prediction time vs number of points for NearestNeighbors with the
# standard k, one bar per frequency threshold.
knn_df = df[(df['classifier'] == 'NearestNeighbors') & (df['k'] == STD_K)]
group_col = 'frequency_threshold'

# Maps each timing column to the label used in the axis label and title
time_labels = {'train_time': 'Train', 'prediction_time': 'Prediction'}

for time_col, time_label in time_labels.items():
    plot_df = runtime_vs_num_points_plot(knn_df, group_col, time_col)

    # Grouped bar plot
    ax = plot_df.plot(kind='bar')
    plt.title(f'{time_label} time vs Number of points and Frequency threshold')
    plt.xlabel('Number of points')
    plt.ylabel(f'{time_label} Time')

    # Keep the x-axis tick labels horizontal
    plt.xticks(rotation=0)

    plt.show()

## f1 vs Number of points

In [None]:
# Base methods: the rows without a frequency threshold
base_methods_mask = get_null_freq_mask(df)
base_methods_df = df[base_methods_mask]
base_methods_df = base_methods_df[['classifier', 'train_points', 'f1']].reset_index(drop = True)
if which == 'python':
    # Average f1 per (classifier, train_points) for the python results
    base_methods_df = base_methods_df.groupby(['classifier', 'train_points']).agg({'f1': 'mean'}).reset_index()

# KMeansFreq and NearestNeighbors with the standard parameter values
kmeans_mask = get_kmeans_mask(df)
kmeans_df = df[kmeans_mask]
kmeans_df = kmeans_df[['classifier', 'train_points', 'f1']].reset_index(drop = True)

knn_mask = get_knn_mask(df)
knn_df = df[knn_mask]
knn_df = knn_df[['classifier', 'train_points', 'f1']].reset_index(drop = True)

plot_df = pd.concat([
        base_methods_df,
        kmeans_df,
        knn_df
    ])

# One line per classifier
grouped = plot_df.groupby('classifier')

# Plotting
plt.figure(figsize=(10, 6))

for classifier, group in grouped:
    # Sort whole rows by train_points so that every f1 value stays paired with
    # its own number of points and the line is drawn from left to right.
    # Sorting only the x column would pair f1 values with the wrong x.
    group = group.sort_values('train_points')
    plt.plot(group['train_points'], group['f1'], marker='o', linestyle='-', label=classifier)

plt.xlabel('Train Points')
plt.ylabel('F1 Score')
plt.title('F1 Score vs Train Points')
plt.legend()
plt.grid(True)

# Set x-axis tick positions and labels
if which == 'python':
    # One tick per distinct number of train points
    plt.xticks(list(plot_df['train_points'].value_counts().index))
else:
    # One tick every 1000 points
    plt.xticks(range(plot_df['train_points'].min(), plot_df['train_points'].max() + 1000, 1000))


plt.show()

## f1 vs hyperparameters

### kMeans

In [None]:
# KMeansFreq rows with the standard number of clusters
kmeans_mask = (df['classifier'] == 'KMeansFreq') & (df['num_clusters'] == STD_NUM_CLUSTERS)
kmeans_df = df[kmeans_mask]
kmeans_df = kmeans_df[['train_points', 'f1', 'frequency_threshold']].reset_index(drop = True)

# One line per frequency threshold
grouped = kmeans_df.groupby('frequency_threshold')

# Plotting
plt.figure(figsize=(10, 6))

for freq, group in grouped:
    # Sort whole rows by train_points so the line is drawn from left to right
    # instead of following the row order of the results file.
    group = group.sort_values('train_points')
    plt.plot(group['train_points'], group['f1'], marker='o', linestyle='-', label=freq)

plt.xlabel('Train Points')
plt.ylabel('F1 Score')
plt.title('kMeans: F1 Score vs Train Points and Frequency threshold')
plt.legend(title = 'Frequency Threshold')
plt.grid(True)

# Set x-axis tick positions and labels
if which == 'python':
    # One tick per distinct number of train points
    plt.xticks(list(kmeans_df['train_points'].value_counts().index))
else:
    # One tick every 1000 points
    plt.xticks(range(kmeans_df['train_points'].min(), kmeans_df['train_points'].max() + 1000, 1000))

plt.show()

In [None]:
# Mean F1 per number of clusters for KMeansFreq at the standard frequency threshold
kmeans_mask = (df['classifier'] == 'KMeansFreq') & (df['frequency_threshold'] == STD_FREQ_THRESHOLD)
kmeans_df = (
    df[kmeans_mask]
    .groupby('num_clusters')['f1']
    .mean()
    .reset_index()
    .loc[:, ['f1', 'num_clusters']]
    .reset_index(drop = True)
)

# Plotting
plt.figure(figsize=(10, 6))

cluster_counts = kmeans_df['num_clusters']
mean_f1 = kmeans_df['f1']
plt.plot(cluster_counts, mean_f1, marker='o', linestyle='-')

plt.title('kMeans: F1 Score vs Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('F1 Score')
plt.grid(True)

plt.show()

### kNN

In [None]:
# NearestNeighbors rows with the standard k
knn_mask = (df['classifier'] == 'NearestNeighbors') & (df['k'] == STD_K)
knn_df = df[knn_mask]
knn_df = knn_df[['train_points', 'f1', 'frequency_threshold']].reset_index(drop = True)

# One line per frequency threshold
grouped = knn_df.groupby('frequency_threshold')

# Plotting
plt.figure(figsize=(10, 6))

for freq, group in grouped:
    # Sort whole rows by train_points so that every f1 value stays paired with
    # its own number of points and the line is drawn from left to right.
    # Sorting only the x column would pair f1 values with the wrong x.
    group = group.sort_values('train_points')
    plt.plot(group['train_points'], group['f1'], marker='o', linestyle='-', label=freq)

plt.xlabel('Train Points')
plt.ylabel('F1 Score')
plt.title('kNN: F1 Score vs Train Points and Frequency threshold')
plt.legend(title = 'Frequency Threshold')
plt.grid(True)

# Set x-axis tick positions and labels
if which == 'python':
    # One tick per distinct number of train points
    plt.xticks(list(knn_df['train_points'].value_counts().index))
else:
    # One tick every 1000 points
    plt.xticks(range(knn_df['train_points'].min(), knn_df['train_points'].max() + 1000, 1000))

plt.show()