In [5]:
import os

import matplotlib.pyplot as plt
import numpy as np

`scores_csvs` is the list of screening CSV files produced by `screen_nani.py`. <br>
The output of this notebook is written to the same directory as the input CSVs.

In [6]:
# Screening csv files to plot; each one is passed to `plot_scores` in turn.
scores_csvs = ['outputs/10comp_sim_summary.csv']

##### Potential Errors
- If an error occurs, remember to remove any row containing `None,None` from the screening CSV.
- Another cause of errors is having one or fewer optimal 2<sup>nd</sup>-derivative numbers of clusters. <br>
    - If there is only one optimal 2nd-derivative number of clusters, comment out lines #$17, 23, 45$. <br>
    - If there is no optimal 2nd-derivative number of clusters, comment out the above lines and lines #$16, 22, 44$. <br>

In [7]:
def _second_derivative_minima(n_clus, db):
    """Find interior local minima of ``db`` and rank them by second derivative.

    Parameters
    ----------
    n_clus : np.ndarray
        1-D array of cluster counts, aligned element-wise with ``db``.
    db : np.ndarray
        1-D array of Davies-Bouldin scores.

    Returns
    -------
    np.ndarray
        Array of shape ``(k, 2)`` whose columns are ``(n_clusters, second
        derivative)``, sorted by descending second derivative. ``k`` is 0
        when ``db`` has no interior local minimum (or fewer than 3 points).
    """
    before, current, after = db[:-2], db[1:-1], db[2:]
    # Discrete second derivative (before + after - 2*current)
    curvature = before + after - 2 * current
    # Keep only points that are not above either neighbour.
    is_minimum = (current <= before) & (current <= after)
    minima = np.column_stack((n_clus[1:-1][is_minimum], curvature[is_minimum]))
    return minima[np.argsort(minima[:, 1])[::-1]]


def plot_scores(scores_csv):
    """Plot Davies-Bouldin score against number of clusters and save it as a png.

    Column 0 of the csv is read as the number of clusters and column 3 as the
    Davies-Bouldin score. Dashed vertical lines mark:

    * the two cluster counts with the lowest Davies-Bouldin score, and
    * the two local minima with the largest second derivative.

    When fewer than two candidates of either kind exist, only the available
    ones are drawn instead of raising ``IndexError``.

    The figure is saved as ``<csv name without extension>_db.png`` in the same
    directory as ``scores_csv``.

    Parameters
    ----------
    scores_csv : str
        Path to a comma-separated screening csv.
    """
    n_clus, db = np.loadtxt(scores_csv, unpack=True, delimiter=',', usecols=(0, 3))
    # A single-row csv makes loadtxt return scalars; normalise to 1-D arrays.
    n_clus, db = np.atleast_1d(n_clus), np.atleast_1d(db)

    # Up to two cluster counts with the lowest Davies-Bouldin score.
    best_by_score = n_clus[np.argsort(db)[:2]]
    # Up to two local minima with the largest second derivative.
    best_by_deriv = _second_derivative_minima(n_clus, db)[:2, 0]

    score_styles = (
        ('#de005c', 'Optimal Number of Clusters'),
        ('#00ab64', 'Second Optimal Number of Clusters'),
    )
    deriv_styles = (
        ('#de8200', 'Optimal 2nd deriv Number of Clusters'),
        ('#6400ab', 'Second Optimal 2nd deriv Number of Clusters'),
    )

    # os.path handles the platform's separators, so the title is the bare file
    # name and the png lands next to the input csv.
    root, _ = os.path.splitext(scores_csv)
    base_name = os.path.basename(root)

    fig, ax = plt.subplots()
    try:
        ax.plot(n_clus, db, color='#005cde', label='Davies-Bouldin')
        ax.set_xlabel('Number of Clusters')
        ax.set_ylabel('Davies-Bouldin Score')

        # zip() stops at the shorter input, so missing candidates are skipped.
        for candidates, styles in ((best_by_score, score_styles), (best_by_deriv, deriv_styles)):
            for value, (color, label) in zip(candidates, styles):
                ax.axvline(x=value, color=color, linestyle='--', label=f'{label}: {int(value)}')

        ax.set_title(base_name)
        ax.legend()
        fig.savefig(f'{root}_db.png', dpi=500, bbox_inches='tight', pad_inches=0.1, transparent=True)
    finally:
        # Always release the figure so looping over many csvs does not leak memory.
        plt.close(fig)
    
# Generate and save the Davies-Bouldin figure for every csv listed in `scores_csvs`.
if __name__ == '__main__':
    for scores_csv in scores_csvs:
        plot_scores(scores_csv)