In [49]:
import numpy as np
import pandas as pd
import os
import plotly
import plotly.express as px
import re
import sys
import tensorflow as tf
import time

from collections import Counter
from itertools import combinations, product
from MulticoreTSNE import MulticoreTSNE as TSNE
from pathlib import Path
from sklearn.decomposition import PCA
from typing import List

# ---------- Import own python modules ----------
project_dir = Path(os.path.abspath("")).parents[0]
sys.path.append(str(project_dir))

import helper.variables as vars

from database.genre import Genres
from helper.file_system_interaction import load_object_from_file, save_object_in_file

In [43]:
# Define constants like seed etc.
SEED = 1234  # Seed for NumPy (see below); also passed as "random_state" to PCA and t-SNE
cpu_kernel_for_tsne = 16  # Passed as "n_jobs" to MulticoreTSNE
# nan_movies = []

# Define column names for DataFrames
col_colours = "colour"  # Column with the colour value of each data point
col_colour_labels = "colour_labels"  # Column with the legend text belonging to each colour value

# Set seed
np.random.seed(SEED)  # Seeds only NumPy's global random number generator

# Define variables
# Each list holds the values of one parameter of a model execution. All combinations of these values
# are formatted into the directory name prefixes, which are searched below "model/results".
max_data_range = [100000]
train_data_relationship_range = [0.85]
history_len_range = [30, 100]
min_history_len_range = [20, 50]
number_of_predicted_movies_range = [1, 2, 10]
fill_history_len_with_zero_movies_range = [False, True]
fine_grained_extracting_range = [False]

In [3]:
# Read data from database
all_genres = Genres().get_all()  # Presumably a mapping whose values are genre records with a "name" entry - TODO confirm
genre_names = np.array([genre["name"] for genre in all_genres.values()])  # Names of all genres

In [51]:
def remove_duplicates(arr: np.ndarray) -> List[tuple]:
    """
        Keeps only the first (colour value, colour label) pair of each
        colour label and returns these pairs in order of first appearance.

        Parameters
        ----------
        arr : np.ndarray
            Rows/pairs of the form (colour value, colour label), e.g.
            the values of a DataFrame with these two columns. Every
            other iterable of such pairs works as well.

        Returns
        -------
        List[tuple]
            Pairs (colour value, str(colour label)) without duplicated
            labels. Labels are compared and returned as strings, so
            e.g. 3 and "3" count as the same label.
    """

    seen = set()
    res = []

    for colour_value, colour_label in arr:
        label = str(colour_label)

        if label in seen:  # Only the first pair of each label is kept
            continue

        seen.add(label)
        res.append((colour_value, label))

    return res


def visualize_data_with_pca_or_tsne(path_to_store_tsne_dim_reduced_data: Path, components: int, df_original: pd.DataFrame, use_pca: bool=True,
                                    time_dependence: bool=False, point_size: int=10, title: str="", ignore_columns: List[str]=[]) -> None:
    """
        Reduces the feature columns of the passed DataFrame
        "df_original" to "components" many dimensions and plots the
        result with plotly. The plot is written to "<title>.html" next
        to "path_to_store_tsne_dim_reduced_data" and opened in the
        browser.

        Parameters
        ----------
        path_to_store_tsne_dim_reduced_data : Path
            File used to cache the t-SNE result. An existing file is
            loaded instead of recomputing t-SNE, if its shape matches
            the current data and "components". Not read or written with
            PCA. Its parent directory receives the HTML plot.
        components : int
            Number of dimensions to reduce to, must be 2 (2D) or 3 (3D).
        df_original : pd.DataFrame
            Data to visualize. Must contain the column "colour_labels"
            (legend text of each point). If the column "colour" is
            defined as well, these predefined colour values are used,
            else every distinct label gets its own colour value
            (0, 1, 2, ... in order of first appearance). An optional
            column "movie" is shown when hovering over a point.
        use_pca : bool
            Pass, if PCA should be used to compute the components (True)
            or if t-SNE should be used (False).
        time_dependence : bool
            Only used with 2 components: plot in 3D with the row number
            (starting at 1) as additional x-axis.
        point_size : int
            Size of each plotted data point.
        title : str
            Title of the plot and name of the saved HTML file.
        ignore_columns : List[str]
            Additional columns, which are no features. The colour
            columns are always ignored.

        Raises
        ------
        AssertionError
            If "components" is not 2 or 3.
        KeyError
            If "df_original" has no column "colour_labels".
    """

    assert 2 <= components <= 3

    # Define some variables
    col_size = "size"
    custom_data = ["movie"] if "movie" in df_original.columns else []  # For labeling points with movie names

    # Use all columns as features except the colour columns and the explicitly ignored ones
    excluded_columns = [col_colours, col_colour_labels] + list(ignore_columns)  # Builds a new list, the default argument is never mutated
    tmp_df = df_original[[col for col in df_original.columns if col not in excluded_columns]]

    # Load t-SNE dimension reduced data from file, if a usable cache exists (PCA is always computed)
    res = None

    if not use_pca and os.path.exists(path_to_store_tsne_dim_reduced_data):
        cached_res = np.asarray(load_object_from_file(path_to_store_tsne_dim_reduced_data))
        expected_shape = (tmp_df.shape[0], components)

        if cached_res.shape == expected_shape:
            res = cached_res
        else:  # Cache belongs to other data or another number of components -> recompute and overwrite it
            print(f"Ignore cached t-SNE data of shape {cached_res.shape}, expected shape {expected_shape}")

    # Compute dimension reduced data
    if res is None:
        if use_pca:
            model = PCA(n_components=components, random_state=SEED)
            res = model.fit_transform(tmp_df.values)
            print(f"Variance/Amount of left/remaining information: {model.explained_variance_ratio_}, lost information: {1 - sum(model.explained_variance_ratio_)}")
            print(f"Variance: {model.explained_variance_}")
        else:
            model = TSNE(n_components=components, n_jobs=cpu_kernel_for_tsne, random_state=SEED)  # Initialization "random" only supported
            res = model.fit_transform(tmp_df.values)
            save_object_in_file(path_to_store_tsne_dim_reduced_data, res)  # Save only with t-SNE

    res = np.asarray(res)
    df = pd.DataFrame(data=res, columns=[f"c{i}" for i in range(res.shape[1])])

    # Find colour value and colour label of each point
    point_colour_labels = [str(colour_label) for colour_label in df_original[col_colour_labels]]

    if col_colours in df_original.columns:  # Use colours from original DataFrame
        df[col_colours] = df_original[col_colours].values
        point_colour_values = df_original[col_colours].tolist()
    else:  # No predefined colours: one colour value per distinct label
        map_colour_labels_to_colours = {label: i for i, label in enumerate(dict.fromkeys(point_colour_labels))}
        point_colour_values = [map_colour_labels_to_colours[label] for label in point_colour_labels]
        df[col_colours] = point_colour_values

    # Find all labels for different colours (no duplicates), sorted by colour value
    colour_labels_and_colour_values = remove_duplicates(list(zip(point_colour_values, point_colour_labels)))
    colour_labels_and_colour_values = sorted(colour_labels_and_colour_values, key=lambda x: x[0])
    colour_labels = [label for _, label in colour_labels_and_colour_values]
    colours_ticks = [colour_value for colour_value, _ in colour_labels_and_colour_values]
    print("Colour labels:\n", colour_labels)
    print("Colour ticks:\n", colours_ticks)

    # Set size of each data point
    df[col_size] = point_size

    # Add names of movies to dimension reduced data
    if "movie" in df_original.columns:
        df["movie"] = df_original["movie"].values

    # Plot graph
    if components == 3:
        fig = px.scatter_3d(df, x="c0", y="c1", z="c2", size=col_size, color=col_colours, custom_data=custom_data)
    elif time_dependence:  # Plot 3D with x-axis as time
        df["time"] = np.arange(1, df.shape[0] + 1)
        fig = px.scatter_3d(df, x="time", y="c0", z="c1", size=col_size, color=col_colours, custom_data=custom_data)
    else:
        fig = px.scatter(df, x="c0", y="c1", size=col_size, color=col_colours, custom_data=custom_data)

    # Set hover for points
    if "movie" in df_original.columns:
        fig.update_traces(hovertemplate="%{customdata}")

    # Set colour bar: show the labels instead of the raw colour values
    fig.update_coloraxes(colorbar_tickvals=colours_ticks, colorbar_ticktext=colour_labels)

    # Set title, save plot as HTML file and show it
    fig.update_layout(title_text=title, title_x=0.5)
    file_path_to_save_plot = Path(path_to_store_tsne_dim_reduced_data.parents[0]) / f"{title}.html"
    plotly.offline.plot(fig, filename=str(file_path_to_save_plot))
    fig.show(renderer='browser')


def set_colour(label: str) -> int:
    """
        Maps the (possibly concatenated) label of a data point to the
        colour value used for plotting.

        Parameters
        ----------
        label : str
            Label of a data point. Labels of merged points are
            concatenated, so a label can contain "input" and "target"
            at once.

        Returns
        -------
        int
            50 for labels containing "input" and "target", 100 for
            labels ending with "input", 150 for labels ending with
            "target", 200 for labels ending with "preds" and 250 for
            the label "unused_movies".

        Raises
        ------
        ValueError
            If the label matches none of the cases above.
    """

    if "input" in label and "target" in label:  # Must be checked first, such labels also end with "input"/"target"
        return 50
    if label.endswith("input"):
        return 100
    if label.endswith("target"):
        return 150
    if label.endswith("preds"):
        return 200
    if label == "unused_movies":
        return 250

    raise ValueError(f"Label {label} is unknown!")


def set_colour_label(label: str) -> str:
    """
        Maps the (possibly concatenated) label of a data point to the
        text shown for its colour in the plot.

        Parameters
        ----------
        label : str
            Label of a data point. Labels of merged points are
            concatenated, so a label can contain "input" and "target"
            at once.

        Returns
        -------
        str
            "Movie history + Target movie" for labels containing
            "input" and "target", "Movie history" for labels ending
            with "input", "Target movie" for labels ending with
            "target", "Predicted movie" for labels ending with "preds"
            and "Unused movie" for the label "unused_movies".

        Raises
        ------
        ValueError
            If the label matches none of the cases above.
    """

    if "input" in label and "target" in label:  # Must be checked first, such labels also end with "input"/"target"
        return "Movie history + Target movie"
    if label.endswith("input"):
        return "Movie history"
    if label.endswith("target"):
        return "Target movie"
    if label.endswith("preds"):
        return "Predicted movie"
    if label == "unused_movies":
        return "Unused movie"

    raise ValueError(f"Label {label} is unknown!")

In [44]:
# Find all relevant model execution paths
relevant_parameterizations = []  # Pairs (directory name prefix, history length)
relevant_model_execution_paths = []  # Pairs (path of model execution, history length)

# Find all relevant model parameterizations (all combinations of the defined parameter ranges)
for (max_data, train_data_relationship, history_len, min_history_len,
     number_of_predicted_movies, fill_history_len_with_zero_movies, fine_grained_extracting) in product(
        max_data_range, train_data_relationship_range, history_len_range, min_history_len_range,
        number_of_predicted_movies_range, fill_history_len_with_zero_movies_range, fine_grained_extracting_range):
    save_dir = f"{max_data}_{train_data_relationship}_{history_len}_{min_history_len}_{number_of_predicted_movies}_{fill_history_len_with_zero_movies}_{fine_grained_extracting}"
    relevant_parameterizations.append((save_dir, history_len))

# Search for relevant model execution paths
found_parameterizations = set()  # Parameterizations, for which a model execution was already found

for root, dirs, _ in os.walk(project_dir / "model/results"):  # Iterate over all model execution directories
    for dir_name in dirs:
        for parameterization, history_len in relevant_parameterizations:
            if not dir_name.startswith(parameterization):  # Directory belongs to another parameterization
                continue

            if parameterization not in found_parameterizations:  # Use only the first model execution of each parameterization
                found_parameterizations.add(parameterization)
                relevant_model_execution_paths.append((Path(root) / dir_name, history_len))

            break  # Parameterization of this directory is found, go on with next directory

relevant_model_execution_paths

[(WindowsPath('C:/Users/Padinator/Downloads/Module/Master/Kint/Watch-Tip/model/results/100000_0.85_100_50_10_True_False_50_88901_32'),
  100),
 (WindowsPath('C:/Users/Padinator/Downloads/Module/Master/Kint/Watch-Tip/model/results/100000_0.85_30_20_10_False_False_50_85164_32'),
  30),
 (WindowsPath('C:/Users/Padinator/Downloads/Module/Master/Kint/Watch-Tip/model/results/100000_0.85_30_20_1_False_False_50_85911_32'),
  30),
 (WindowsPath('C:/Users/Padinator/Downloads/Module/Master/Kint/Watch-Tip/model/results/100000_0.85_30_20_2_False_False_50_85439_32'),
  30)]

In [115]:
# Load the pickled predictions of every relevant model execution
# Each entry is a triple (path of model execution, loaded predictions, history length)
predictions_with_test_data = [
    (execution_path, load_object_from_file(execution_path / "predictions.pickle"), execution_history_len)
    for execution_path, execution_history_len in relevant_model_execution_paths
]
print(type(predictions_with_test_data))
print(f"Found {len(predictions_with_test_data)} predictions with test data")

<class 'list'>
Found 1 predictions with test data


In [52]:
def _merge_duplicate_movies(df_data: pd.DataFrame) -> pd.DataFrame:
    """
        Merges all rows of the same movie into the first row of this
        movie by concatenating their labels. Rows with movie -1 are
        never merged, each of them is kept as its own row.

        Parameters
        ----------
        df_data : pd.DataFrame
            Data with at least the columns "movie" and "label".

        Returns
        -------
        pd.DataFrame
            New DataFrame with the same columns and one row per movie
            (plus all rows with movie -1), in order of first appearance.
    """

    merged = {col: [] for col in df_data.columns}
    row_of_movie = {}  # Maps a movie to its row in "merged", replaces the linear search in the list of movies

    for _, row in df_data.iterrows():
        movie = row["movie"]

        if movie != -1 and movie in row_of_movie:  # Movie is already in dict and no prediction
            merged["label"][row_of_movie[movie]] += row["label"]
        else:  # Add new movies and predictions to dict
            if movie != -1:
                row_of_movie[movie] = len(merged["movie"])

            for col in df_data.columns:
                merged[col].append(row[col])

    return pd.DataFrame(merged)


# Visualize all users' movie histories for each parameterization
for path, _ in relevant_model_execution_paths:
    # NOTE: Unpickling can execute arbitrary code, only read files created by this project
    df_train_data = pd.read_pickle(path / "train_data.dataframe")
    df_test_data = pd.read_pickle(path / "test_data.dataframe")
    train_dataframe_path = path / "tsne_reduced_data_train.pickle"
    test_dataframe_path = path / "tsne_reduced_data_test.pickle"
    print(f"Visualize data of {path}")
    # model = tf.keras.models.load_model(path / "lstm.keras")
    # print(model.summary())

    # Remove duplicate points and change labels (each data set uses its own columns)
    df_train_data = _merge_duplicate_movies(df_train_data)
    df_test_data = _merge_duplicate_movies(df_test_data)

    # Define colours for all these data sets
    df_train_data[col_colours] = [set_colour(label) for label in df_train_data["label"]]
    df_test_data[col_colours] = [set_colour(label) for label in df_test_data["label"]]

    # Define colour labels
    df_train_data[col_colour_labels] = [set_colour_label(label) for label in df_train_data["label"]]
    df_test_data[col_colour_labels] = [set_colour_label(label) for label in df_test_data["label"]]

    # Visualize DataFrame
    for dataframe_path, df, label in [(train_dataframe_path, df_train_data, "train"), (test_dataframe_path, df_test_data, "test")]:
        print(f"Size of DataFrame to visualize: {df.shape}")
        print(Counter(df["label"].values))
        print(Counter(df[col_colours].values))
        print(Counter(df[col_colour_labels].values))
        visualize_data_with_pca_or_tsne(path_to_store_tsne_dim_reduced_data=dataframe_path, components=3, df_original=df, use_pca=True, title=f"Predictions for {label}", ignore_columns=["movie", "label"])
        print()

Visualize data of C:\Users\Padinator\Downloads\Module\Master\Kint\Watch-Tip\model\results\100000_0.85_100_50_10_True_False_50_88901_32
Size of DataFrame to visualize: (100734, 23)
Counter({'train_data_preds': 88901, 'train_data_targettrain_data_input': 5632, 'train_data_inputtrain_data_target': 5492, 'train_data_input': 700, 'train_data_target': 7, 'unused_movies': 2})
Counter({200: 88901, 50: 11124, 100: 700, 150: 7, 250: 2})
Counter({'Predicted movie': 88901, 'Movie history + Target movie': 11124, 'Movie history': 700, 'Target movie': 7, 'Unused movie': 2})
Variance/Amount of left/remaining information: [0.23616938 0.17052593 0.14007903], lost information: 0.4532256647091115
Variance: [285.2389975  205.95661349 169.18367339]
Colour labels:
 ['Movie history + Target movie', 'Movie history', 'Target movie', 'Predicted movie', 'Unused movie']
Colour ticks:
 [50, 100, 150, 200, 250]

Size of DataFrame to visualize: (27522, 23)
Counter({'test_data_preds': 15689, 'test_data_inputtest_data_