In [1]:
import itertools
import os
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import seaborn as sns

# Shared pretty-printer instance (4-space indent) for ad-hoc inspection.
pp = pprint.PrettyPrinter(indent=4)

# Print the version of each key library next to the notebook output.
for _message, _library in (
    ("Using pandas %s version", pd),
    ("Using seaborn %s version", sns),
    ("Using scipy %s version", scipy),
):
    print(_message % _library.__version__)

Using pandas 1.2.0 version
Using seaborn 0.11.1 version
Using scipy 1.6.0 version


In [2]:
# Names of the cross-validation split strategies; each one is also the name
# of the results sub-directory that the plotting cells read from.
splits = [
    "RandomKFold",
    "GroupShuffleSplit",
    "ShuffleSplit",
    "StratifiedShuffleSplit",
    "StratifiedKFold",
    "KFold",
    "TimeSeriesSplit",
]

# Same strategies, in the same order, with "GroupShuffleSplit" left out.
splitsWithoutGroupShuffleSplit = [
    name for name in splits if name != "GroupShuffleSplit"
]

# Directory the generated PNG files are written to.
data_dir = r"C:\Projects\RecSys2020\results\new\\"

# --- Colour palette (hex RGB), one named colour per plotted series ---
imfFull = "#548235"
imfTest = "#99C979"

knnFullTest = "#C55A11"
normalizedKnnFull = "#4A76C6"
normalizedKnnTest = "#19C3FF"

average = "#FFC000"
popularity = "#F18F8F"
random = "#BFBFBF"

# Colours in plotting order: the i-th plotted series gets the i-th colour.
rec_colors = (
    imfFull, imfTest,
    knnFullTest,
    normalizedKnnFull, normalizedKnnTest,
    average, popularity, random,
)


def split_list(a_list):
    """Cut a sequence into two halves.

    Args:
        a_list: Any sliceable sequence.

    Returns:
        Tuple ``(first, second)``. For an odd length the extra element goes
        to ``second``.
    """
    midpoint = len(a_list) // 2
    first_half = a_list[:midpoint]
    second_half = a_list[midpoint:]
    return first_half, second_half

In [3]:
def plot_figure1(split, ax1, ax2):
    """Draw the two figure-1 panels for a single split strategy.

    Loads ``figure1.txt`` from the results sub-directory named after
    ``split`` (tab-separated, column names taken from row index 4), then:

    * draws a bar chart on ``ax1`` with one bar group for "Full" and one
      for "Test", one bar per recommender;
    * draws one line per recommender on ``ax2`` connecting its rank in the
      ``Full`` column (x=1) with its rank in the ``Test`` column (x=2),
      where rank 1 is the highest value.

    Args:
        split: Split strategy name; used as directory name and as the title
            of both axes. It is also printed.
        ax1: Matplotlib axes that receives the bar chart.
        ax2: Matplotlib axes that receives the ranking lines.

    Returns:
        Tuple ``(artists, recommenders)``: the ``Line2D`` objects added to
        ``ax2`` and the ``Recommender`` column of the loaded table.
    """
    print(split)
    for panel in (ax1, ax2):
        panel.set_title(split)

    table = pd.read_table(
        r"C:\Projects\RecSys2020\results\\" + split + r"\figure1.txt",
        sep="\t",
        header=4,
    )

    # Transpose by hand: one column per recommender, rows "Full" / "Test".
    raw_rows = np.array(table)
    bars = pd.DataFrame(index=["Full", "Test"])
    for recommender, raw_row in zip(table.Recommender, raw_rows):
        bars[recommender] = raw_row[1:]

    bar_axes = bars.plot(kind="bar", color=rec_colors, legend=None, ax=ax1)
    bar_axes.set_ylabel("P@10")
    bar_axes.xaxis.set_tick_params(rotation=0)

    # Rank 1 = best (largest) value in the respective column.
    full_ranks = table.Full.rank(ascending=False)
    test_ranks = table.Test.rank(ascending=False)

    artists = []
    for full_rank, test_rank, color in zip(full_ranks, test_ranks, rec_colors):
        line = plt.Line2D(
            xdata=[1, 2],
            ydata=[[full_rank, test_rank]],
            lw=1,
            color=color,
            marker="o",
        )
        ax2.add_artist(line)
        artists.append(line)

    # add_artist does not autoscale, so the view limits are set explicitly.
    ax2.set_ybound([0.5, 8.3])
    ax2.set_xbound([0.94, 2.06])
    for side in ("top", "right", "bottom", "left"):
        ax2.spines[side].set_visible(False)
    ax2.invert_yaxis()  # put rank 1 at the top
    ax2.set_xticks([1, 2])
    ax2.set_xticklabels(["Full", "Test"])

    ax2.set_ylabel("System ranking")

    return (artists, table.Recommender)

In [15]:
print("figure1.cross-validation.png\n")

# One grid row per pair of splits. Every split occupies two neighbouring
# axes (bar chart + ranking plot), so the grid is four axes wide.
n_rows = len(splitsWithoutGroupShuffleSplit) // 2
fig, ax = plt.subplots(n_rows, 2 * 2)
firstColumn, secondColumn = split_list(splitsWithoutGroupShuffleSplit)

# The first half of the splits fills axes columns 0-1, the second half
# fills axes columns 2-3.
for first_axes_col, column_splits in ((0, firstColumn), (2, secondColumn)):
    for i, split in enumerate(column_splits):
        ax1 = ax[i, first_axes_col]
        ax2 = ax[i, first_axes_col + 1]
        # The handles of the last panel drawn are reused for the legend.
        artists, recommenders = plot_figure1(split, ax1, ax2)

fig.set_size_inches(20, 15)
legend_options = dict(
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    title="Recommender",
    title_fontsize="xx-large",
)
plt.legend(artists, recommenders, **legend_options)
fig.tight_layout()

png = os.path.join(data_dir, "figure1.cross-validation.png")
fig.savefig(png, format="png", dpi=300)
plt.close("all")

figure1.cross-validation.png

RandomKFold
ShuffleSplit
StratifiedShuffleSplit
StratifiedKFold
KFold
TimeSeriesSplit


In [16]:
print("figure1.cross-validation.b.png\n")
fig1splits = [name for name in splits if name != "GroupShuffleSplit"]

# split_list is defined once in the configuration cell; the identical
# duplicate definition that used to live in this cell has been removed so
# there is a single source of truth.
firstColumn, secondColumn = split_list(fig1splits)
fig = plt.figure(figsize=(8, 6))

# Layout grid of 3 rows x 9 columns. Each split uses a two-column-wide bar
# chart followed by a one-column ranking plot: grid columns 0-2 for the
# first half of the splits and 4-6 for the second half. Grid columns 3, 7
# and 8 stay empty.
shape = (3, 9)
for i, split in enumerate(firstColumn):
    ax1 = plt.subplot2grid(shape, (i, 0), colspan=2)
    ax2 = plt.subplot2grid(shape, (i, 2), colspan=1)
    plot_figure1(split, ax1, ax2)

for i, split in enumerate(secondColumn):
    ax1 = plt.subplot2grid(shape, (i, 4), colspan=2)
    ax2 = plt.subplot2grid(shape, (i, 6))
    # The handles of the last panel drawn are reused for the legend.
    artists, recommenders = plot_figure1(split, ax1, ax2)

fig.set_size_inches(20, 15)
plt.legend(
    artists,
    recommenders,
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    title="Recommender",
    title_fontsize="xx-large",
)
fig.tight_layout()

png = os.path.join(data_dir, "figure1.cross-validation.b.png")
# NOTE(review): writing this variant to disk was already disabled in the
# original cell; uncomment the next line to save it.
# fig.savefig(png, format="png", dpi=300)
plt.close("all")

figure1.cross-validation.b.png

RandomKFold
ShuffleSplit
StratifiedShuffleSplit
StratifiedKFold
KFold
TimeSeriesSplit


In [17]:
# fig3
def rank(data):
    """Rank the systems against each other for every target size.

    For each distinct value of the ``"Target size"`` column the first row
    with that value is taken, and the remaining columns of that row are
    ranked against each other (rank 1 = largest value, ties share the
    average rank).

    Args:
        data: DataFrame containing a ``"Target size"`` column. The labels of
            the returned rows are taken from ``data.keys()[1:]``, so
            ``"Target size"`` is expected to be the first column.

    Returns:
        A list with one entry per system column, each of the form
        ``[column_name, rank_0, rank_1, ...]``. The ranks follow the
        ``groupby`` order, i.e. ascending ``"Target size"``, which is not
        necessarily the row order of ``data``.

    Note:
        ``data`` itself is not modified.
    """
    ranks_per_size = []
    # groupby sorts the groups by key; only the first row of each group is
    # used, exactly as before.
    for _, group in data.groupby("Target size"):
        scores = group.drop(columns="Target size")
        row_ranks = scores.rank(axis=1, ascending=False)
        ranks_per_size.append(np.array(row_ranks)[0])

    # Shape (number of target sizes, number of systems); the column slices
    # below turn it into one rank series per system.
    rank_matrix = np.array(ranks_per_size)
    return [
        [system] + rank_matrix[:, position].tolist()
        for position, system in enumerate(data.keys()[1:])
    ]


def plot_system_rankings(data, ax, xlabel="|N_u|", ylabel="", tick_labels=None):
    """Plot one ranking line per system on ``ax``.

    Args:
        data: List of rows of the form ``[label, rank_0, rank_1, ...]``;
            the label is skipped and the ranks are plotted against their
            position ``0, 1, 2, ...``. All rows are expected to have the
            length of the first row.
        ax: Matplotlib axes to draw on.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        tick_labels: Optional sequence of x tick labels, one per rank
            position. When omitted, the labels are taken from the
            module-level ``precision`` table: its ``"Target size"`` values
            with the last one replaced by ``"Full"``. That fallback keeps
            existing calls working but requires ``precision`` to be defined
            before this function is called.

    Returns:
        List of the ``Line2D`` artists added to ``ax``, in row order. Rows
        beyond the number of colours in ``rec_colors`` are not drawn.
    """
    # The x positions are the same for every row, so build them once.
    positions = list(range(len(data[0]) - 1))

    artists = []
    for row, color in zip(data, rec_colors):
        artist = plt.Line2D(
            xdata=positions, ydata=row[1:], lw=1, color=color, marker="o"
        )
        ax.add_artist(artist)
        artists.append(artist)

    # add_artist does not autoscale, so the view limits are set explicitly.
    ax.set_ybound([0.8, 9])
    ax.set_xbound([-0.2, 13.06])

    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)

    ax.set_xticks(positions)
    if tick_labels is None:
        # Backward-compatible fallback to the global table (see docstring).
        tick_labels = itertools.chain(
            precision["Target size"].array[:-1], ["Full"]
        )
    ax.set_xticklabels(list(tick_labels))

    ax.set_yticks(list(range(1, 9)))
    ax.invert_yaxis()  # rank 1 at the top
    ax.invert_xaxis()

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    return artists

In [18]:
print("plot_metrics")


def plot_metrics(
    data, ax, xlabel="|Nu|", ylabel="", twinx=None, title=None, colors=None
):
    """Plot every metric column of ``data`` as a line on ``ax``.

    Args:
        data: DataFrame with a ``"Target size"`` column; every column after
            the first one is drawn as one line against the row position
            ``0, 1, 2, ...``. The x ticks are labelled with the
            ``"Target size"`` values, the last one being replaced by
            ``"Full"``.
        ax: Matplotlib axes to draw on.
        xlabel: Label of the x axis. The default is ``"|Nu|"``, the label
            this function has always rendered: a hard-coded
            ``set_xlabel("|Nu|")`` at the end of the function used to
            overwrite whatever was passed here. That override is gone, so
            an explicitly passed label now takes effect.
        ylabel: Label of the y axis.
        twinx: Optional series drawn on a secondary y axis (black squares,
            labelled "Sum of p-values"). It is skipped when ``None`` or when
            its maximum is not greater than zero.
        title: Optional axes title.
        colors: Optional sequence of line colours; defaults to the
            module-level ``rec_colors``. Columns beyond the number of
            colours are not drawn.

    Returns:
        List of the ``Line2D`` artists in column order, followed by the
        secondary-axis line when one was drawn.
    """
    if colors is None:
        colors = rec_colors

    artists = []
    for column_title, color in zip(data.columns[1:], colors):
        values = data[column_title]
        artist = plt.Line2D(
            xdata=list(range(len(values))),
            ydata=values.to_numpy(),
            lw=1,
            color=color,
            marker="o",
        )
        ax.add_artist(artist)
        artists.append(artist)

    # add_artist does not autoscale: leave 10% headroom above the largest
    # plotted value (the tiny epsilon keeps the range non-empty at zero).
    peak = np.array((data.max()[1:])).max()
    ax.set_ybound([0, peak + peak * 0.1 + 0.0000001])
    ax.set_xbound([-0.1, 13.06])

    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)
    ax.set_xticks(list(range(len(data["Target size"]))))
    ax.set_xticklabels(list(itertools.chain(data["Target size"].array[:-1], ["Full"])))
    ax.invert_xaxis()

    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    if twinx is not None and twinx.max() > 0:
        label = "Sum of p-values"
        secondary = ax.twinx()
        (secondary_line,) = secondary.plot(
            list(range(len(twinx))),
            twinx,
            label=label,
            lw=1,
            color="black",
            fillstyle="none",
            markeredgecolor="black",
            marker="s",
        )
        secondary.set_ylabel(label)
        secondary_peak = twinx.max()
        secondary.set_ylim(0, secondary_peak + 0.1 * secondary_peak + 0.01)

        for side in ("top", "right", "bottom", "left"):
            secondary.spines[side].set_visible(False)
        artists.append(secondary_line)

    if title is not None:
        ax.set_title(title)

    return artists

plot_metrics


In [None]:
print("figure3\n")
for split in splitsWithoutGroupShuffleSplit:
    print(split)

    # figure3.txt stacks three tab-separated tables; each read picks one of
    # them by its header row and takes the 13 rows that follow.
    figure3_path = r"C:\Projects\RecSys2020\results\\" + split + r"\figure3.txt"
    ndcg = pd.read_table(figure3_path, sep="\t", header=4, nrows=13)
    # NOTE: plot_system_rankings reads the module-level name `precision`
    # for its tick labels, so this variable must keep this name.
    precision = pd.read_table(figure3_path, sep="\t", header=19, nrows=13)
    recall = pd.read_table(figure3_path, sep="\t", header=34, nrows=13)

    ndcg_rank = rank(ndcg)
    precision_rank = rank(precision)
    recall_rank = rank(recall)

    rows = ["Row {}".format(row) for row in ["Precision@10", "Recall@10", "nDCG@10"]]

    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 8))

    # The label text set here is replaced by plot_metrics below; this call
    # is kept because it also sets the rotation and font size of the label.
    for ax, row in zip(axes[:, 0], rows):
        ax.set_ylabel(row, rotation=90, size="large")

    # Left column: metric values.
    plot_metrics(precision, axes[0][0], ylabel="Precision@10")
    plot_metrics(recall, axes[1][0], ylabel="Recall@10")
    plot_metrics(ndcg, axes[2][0], ylabel="nDCG@10")

    # Right column: system rankings; the last panel's handles feed the legend.
    plot_system_rankings(precision_rank, axes[0][1])
    plot_system_rankings(recall_rank, axes[1][1])
    artists = plot_system_rankings(ndcg_rank, axes[2][1])

    plt.legend(
        artists,
        ndcg.columns[1:],
        bbox_to_anchor=(1.05, 1),
        loc="upper left",
        title="Recommender",
        title_fontsize="x-large",
    )
    fig.suptitle(split)

    fig.tight_layout()
    png = os.path.join(data_dir, "figure3." + split + ".png")
    fig.savefig(png, format="png", dpi=300)
    # Release the figure as soon as it is saved instead of keeping one open
    # figure per split until the loop ends.
    plt.close(fig)
plt.close("all")

figure3

RandomKFold
ShuffleSplit
StratifiedShuffleSplit
StratifiedKFold


In [None]:
def fill_correlation(p, r, n):
    """Attach the hard-coded correlation series to the three metric tables.

    Each table is processed the same way: the rows with index labels 10 and
    11 are dropped, a "Correlation with unbiased evaluation" column holding
    eleven fixed values is added, and that new column is moved to the
    second position (right after the first column).

    The column label previously read "Correlation with unbiased evaluatio";
    the misspelling is corrected here because the label is shown in the
    figure legend.

    Args:
        p: Precision table (DataFrame).
        r: Recall table (DataFrame).
        n: nDCG table (DataFrame).

    Returns:
        Tuple ``(p, r, n)`` of new DataFrames; the inputs are not modified.

    Raises:
        KeyError: If a table has no row labelled 10 or 11.
        ValueError: If a table does not have exactly eleven rows left after
            the drop (raised by pandas on the column assignment).
    """
    correlation_column = "Correlation with unbiased evaluation"
    dropped_rows = [10, 11]

    correlation_ndcg = [
        0.642857143,
        0.714285714,
        0.785714286,
        0.857142857,
        0.857142857,
        0.857142857,
        0.928571429,
        0.857142857,
        0.857142857,
        0.785714286,
        0.714285714,
    ]
    correlation_precision = [
        0.642857143,
        0.642857143,
        0.642857143,
        0.714285714,
        0.714285714,
        0.785714286,
        0.857142857,
        0.928571429,
        0.785714286,
        0.785714286,
        0.714285714,
    ]
    correlation_recall = [
        0.642857143,
        0.642857143,
        0.642857143,
        0.642857143,
        0.714285714,
        0.714285714,
        0.857142857,
        0.928571429,
        0.785714286,
        0.857142857,
        0.785714286,
    ]

    def _with_correlation(table, correlation):
        """Drop the fixed rows, append the correlation, move it to slot 2."""
        table = table.drop(index=dropped_rows)
        table[correlation_column] = correlation
        columns = table.columns.tolist()
        return table[[columns[0]] + columns[-1:] + columns[1:-1]]

    return (
        _with_correlation(p, correlation_precision),
        _with_correlation(r, correlation_recall),
        _with_correlation(n, correlation_ndcg),
    )

In [None]:
print("figure4\n")

# Settings shared by every split; defined once instead of on each iteration.
twinx = "Sum of p-values"  # column drawn on the secondary y axis
CorrelationWithUnbiasedEvaluation = "#C00000"
ExpectedIntersectionRatioInTopN = "#FFC000"
RatioOfTies = "#3333FF"
RatioOfTiesAtZero = "#006600"
fig4colors = (
    CorrelationWithUnbiasedEvaluation,
    ExpectedIntersectionRatioInTopN,
    RatioOfTies,
    RatioOfTiesAtZero,
)
rows = ["Row {}".format(row) for row in ["Precision@10", "Recall@10", "nDCG@10"]]

for split in splitsWithoutGroupShuffleSplit:
    print(split)

    # The file stacks three tab-separated tables; each read picks one of
    # them by its header row and takes the 13 rows that follow.
    figure4_path = (
        r"C:\Projects\RecSys2020\results\\" + split + r"\figure4.txt.ml1m.txt"
    )
    ndcg = pd.read_table(figure4_path, sep="\t", header=4, nrows=13)
    precision = pd.read_table(figure4_path, sep="\t", header=19, nrows=13)
    recall = pd.read_table(figure4_path, sep="\t", header=34, nrows=13)

    fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(14, 8))

    # The label text set here is replaced by plot_metrics below; this call
    # is kept because it also sets the rotation and font size of the label.
    for ax, row in zip(axes, rows):
        ax.set_ylabel(row, rotation=90, size="large")

    precision, recall, ndcg = fill_correlation(precision, recall, ndcg)

    # The last column is excluded from the primary axis of each panel and
    # the `twinx` column is passed separately for the secondary axis.
    plot_metrics(
        precision.iloc[:, :-1],
        axes[0],
        ylabel="Precision@10",
        twinx=precision[twinx],
        colors=fig4colors,
    )
    plot_metrics(
        recall.iloc[:, :-1],
        axes[1],
        ylabel="Recall@10",
        twinx=recall[twinx],
        colors=fig4colors,
    )
    artists = plot_metrics(
        ndcg.iloc[:, :-1],
        axes[2],
        ylabel="nDCG@10",
        twinx=ndcg[twinx],
        colors=fig4colors,
    )

    plt.legend(artists, precision.columns[1:], bbox_to_anchor=(1.05, 1), loc="best")

    fig.suptitle(split)
    png = os.path.join(data_dir, "figure4.txt.ml1m." + split + ".png")
    fig.set_size_inches(8, 6)
    fig.tight_layout()
    fig.savefig(png, format="png", dpi=300)
    # Release the figure as soon as it is saved instead of keeping one open
    # figure per split until the loop ends.
    plt.close(fig)
plt.close("all")

In [None]:
print("figure5.cross-validation.png\n")

# One panel per split strategy, stacked vertically.
fig, axes = plt.subplots(nrows=len(splits), ncols=1, figsize=(8, 10))
for split, ax in zip(splits, axes):
    print(split)
    coverage = pd.read_table(
        r"C:\Projects\RecSys2020\results\\" + split + r"\figure5.txt",
        sep="\t",
        header=4,
        nrows=13,
    )

    # Keep the line handles: the legend below must describe the lines of
    # THIS figure. Previously `artists` was never assigned in this cell, so
    # the legend silently reused whatever an earlier cell had left in the
    # kernel (or raised NameError on a fresh kernel).
    artists = plot_metrics(coverage, ax, ylabel="Coverage@10", title=split)


plt.legend(
    artists,
    coverage.columns[1:],
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    title="Recommender",
    title_fontsize="x-large",
)

png = os.path.join(data_dir, "figure5.cross-validation.png")
fig.set_size_inches(10, 20)
fig.tight_layout()
fig.savefig(png, format="png", dpi=300)
plt.close("all")

In [None]:
print("figure5.cross-validation.b.png\n")

# Two-column grid: first half of the splits on the left, second half on
# the right, one row per pair.
n_rows = len(splitsWithoutGroupShuffleSplit) // 2
fig, axs = plt.subplots(n_rows, 2)
firstColumn, secondColumn = split_list(splitsWithoutGroupShuffleSplit)

for grid_col, column_splits in enumerate((firstColumn, secondColumn)):
    for i, split in enumerate(column_splits):
        coverage = pd.read_table(
            r"C:\Projects\RecSys2020\results\\" + split + r"\figure5.txt",
            sep="\t",
            header=4,
            nrows=13,
        )
        print(split, i)
        ax = axs[i, grid_col]
        # The handles of the last panel drawn are reused for the legend.
        artists = plot_metrics(coverage, ax, ylabel="Coverage@10", title=split)

fig.set_size_inches(14, 8)

legend_options = dict(
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    title="Recommender",
    title_fontsize="xx-large",
)
plt.legend(artists, coverage.columns[1:], **legend_options)

fig.tight_layout()
png = os.path.join(data_dir, "figure5.cross-validation.b.png")
fig.savefig(png, format="png", dpi=300)
plt.close("all")

In [None]:
# Final cleanup: close every matplotlib figure that is still open.
plt.close("all")