In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import re
import sys
import tensorflow as tf
import time

from collections import Counter
from itertools import combinations
from MulticoreTSNE import MulticoreTSNE as TSNE
from pathlib import Path
from sklearn.decomposition import PCA
from typing import List

# ---------- Import own python modules ----------
project_dir = Path(os.path.abspath("")).parents[0]
sys.path.append(str(project_dir))

import helper.variables as vars

from database.genre import Genres
from helper.file_system_interaction import load_object_from_file, save_object_in_file

In [9]:
# Reproducibility and runtime settings
SEED = 1234
np.random.seed(SEED)  # Seed numpy's global random number generator
cpu_kernel_for_tsne = 16  # Handed to t-SNE as "n_jobs"

# Names of the colour columns used in the DataFrames
col_colours = "colour"
col_colour_labels = "colour_labels"

# Parameter ranges; every combination names one model parameterization to look for
max_data_range = [100000]
train_data_relationship_range = [0.85]
history_len_range = [30]
min_history_len_range = [20]
fill_history_len_with_zero_movies_range = [False]
fine_grained_extracting_range = [False]

In [3]:
# Fetch all genres via the project's Genres class and keep their names as an array
all_genres = Genres().get_all()
genre_names = np.array([genre_entry["name"] for genre_entry in all_genres.values()])

In [4]:
def remove_duplicates(arr: np.ndarray) -> List[tuple]:
    """
        Drops every (colour value, colour label) pair whose label was
        already seen and keeps the first pair of each label in its
        original order. Labels are compared and returned as strings.

        arr: iterable of (colour value, colour label) pairs, e.g. the
             ".values" of a two-column DataFrame.
        Returns a list of (colour value, label as str) tuples with
        unique labels.
    """

    seen = set()
    res = []

    for colour_value, colour_label in arr:
        label = str(colour_label)  # Labels may be of any type, compare them as strings

        if label in seen:
            continue

        seen.add(label)
        res.append((colour_value, label))

    return res


def visualize_data_with_pca_or_tsne(path_to_store_tsne_dim_reduced_data: Path, components: int, df_original: pd.DataFrame, use_pca: bool=True,
                                    time_dependence=False, point_size: int=10, title: str="", ignore_columns: List[str]=[]) -> np.ndarray:
    """
        Reduces the feature columns of "df_original" to "components"
        many dimensions with PCA or t-SNE and shows the result as a
        plotly scatter plot in the browser.

        path_to_store_tsne_dim_reduced_data: file used to cache the
            t-SNE result. If it exists, the data is loaded from it
            instead of being recomputed, otherwise the computed result
            is saved there. Not used with PCA.
        components: 2 for a 2D plot or 3 for a 3D plot.
        df_original: data to reduce. It must contain the column
            "colour_labels". If it also contains the column "colour",
            these predefined colour values are used, otherwise one
            numeric colour per distinct label is assigned.
        use_pca: True uses PCA, False uses t-SNE.
        time_dependence: only evaluated for 2 components; plots in 3D
            with the row number (starting at 1) as x-axis.
        point_size: size of every plotted point.
        title: title of the figure.
        ignore_columns: additional columns to leave out of the
            reduction (the default list is only read, never modified).

        Returns the dimension reduced data.
    """

    assert 2 <= components <= 3, "components must be 2 (2D) or 3 (3D)"

    # Define some variables
    col_size = "size"

    # Only feature columns are reduced, i.e. neither the colour columns nor the ignored ones
    excluded_columns = {col_colours, col_colour_labels, *ignore_columns}
    feature_columns = [col for col in df_original.columns if col not in excluded_columns]
    features = df_original[feature_columns].values

    if not use_pca and os.path.exists(path_to_store_tsne_dim_reduced_data):  # Load t-SNE dimension reduced data from file
        res = load_object_from_file(path_to_store_tsne_dim_reduced_data)
    elif use_pca:
        model = PCA(n_components=components, random_state=SEED)
        res = model.fit_transform(features)
        print(f"Variance/Amount of left/remaining information: {model.explained_variance_ratio_}, lost information: {1 - sum(model.explained_variance_ratio_)}")
        print(f"Variance: {model.explained_variance_}")
    else:
        model = TSNE(n_components=components, n_jobs=cpu_kernel_for_tsne, random_state=SEED)  # Initialization "random" only supported
        res = model.fit_transform(features)
        save_object_in_file(path_to_store_tsne_dim_reduced_data, res)  # Save only t-SNE results, PCA is not cached

    if len(res) != len(df_original):  # E.g. a cache file that was written for other data
        raise ValueError(f"Dimension reduced data has {len(res)} rows, but the passed DataFrame has {len(df_original)} rows")

    df = pd.DataFrame(data=res, columns=[f"c{i}" for i in range(len(res[0]))])

    # Determine the colour value and colour label of each point
    if col_colours in df_original.columns:  # Use colours from original DataFrame
        colour_frame = df_original[[col_colours, col_colour_labels]]
    else:  # No predefined colours: number the distinct labels in order of first appearance
        label_strings = [str(colour_label) for colour_label in df_original[col_colour_labels]]
        label_to_colour = {label: number for number, label in enumerate(dict.fromkeys(label_strings))}
        colour_frame = pd.DataFrame({
            col_colours: [label_to_colour[label] for label in label_strings],
            col_colour_labels: label_strings,
        })

    df[col_colours] = colour_frame[col_colours].values
    df[col_size] = point_size  # Same size for each data point

    # Find one (colour value, label) pair per label, sorted by colour value, for the colour bar
    colour_pairs = sorted(remove_duplicates(colour_frame.values), key=lambda pair: pair[0])
    colour_labels = [label for _, label in colour_pairs]
    colours_ticks = [colour_value for colour_value, _ in colour_pairs]
    print("Colour labels:\n", colour_labels)
    print("Colour ticks:\n", colours_ticks)

    # Plot graph
    if components == 3:
        fig = px.scatter_3d(df, x="c0", y="c1", z="c2", size=col_size, color=col_colours)
    elif time_dependence:  # Plot 3D with x-axis as time
        df["time"] = list(range(1, df.shape[0] + 1))
        fig = px.scatter_3d(df, x="time", y="c0", z="c1", size=col_size, color=col_colours)
    else:
        fig = px.scatter(df, x="c0", y="c1", size=col_size, color=col_colours)

    fig.update_coloraxes(colorbar_tickvals=colours_ticks, colorbar_ticktext=colour_labels)
    fig.update_layout(title_text=title, title_x=0.5)
    fig.show(renderer='browser')

    return res


def set_colour(label: str) -> int:
    """
        Maps a data label to its numeric colour value:
        50 if the label contains both "input" and "target", 100/150/200
        if it ends with "input"/"target"/"preds" and 250 for the label
        "unused_movies". Any other label raises an Exception.
    """

    if "input" in label and "target" in label:  # Label merged from input and target
        return 50

    for suffix, colour in (("input", 100), ("target", 150), ("preds", 200)):
        if label.endswith(suffix):
            return colour

    if label == "unused_movies":
        return 250

    raise Exception("Label is unknown!")


def set_colour_label(label: str) -> str:
    """
        Maps a data label to the readable name shown in the colour bar:
        "Movie history + Target movie" if the label contains both
        "input" and "target", "Movie history"/"Target movie"/
        "Predicted movie" if it ends with "input"/"target"/"preds" and
        "Unused movie" for the label "unused_movies". Any other label
        raises an Exception.
    """

    if "input" in label and "target" in label:  # Label merged from input and target
        return "Movie history + Target movie"
    elif label.endswith("input"):
        return "Movie history"
    elif label.endswith("target"):
        return "Target movie"
    elif label.endswith("preds"):
        return "Predicted movie"
    elif label == "unused_movies":
        return "Unused movie"
    raise Exception(f"Label {label} is unknown!")

In [14]:
# Find all relevant model execution paths

# Build the directory name prefix of every parameter combination, paired with its history length
relevant_parameterizations = [
    (
        f"{max_data}_{train_data_relationship}_{history_len}_{min_history_len}_{fill_history_len_with_zero_movies}_{fine_grained_extracting}",
        history_len,
    )
    for max_data in max_data_range
    for train_data_relationship in train_data_relationship_range
    for history_len in history_len_range
    for min_history_len in min_history_len_range
    for fill_history_len_with_zero_movies in fill_history_len_with_zero_movies_range
    for fine_grained_extracting in fine_grained_extracting_range
]

# Search for relevant model execution paths: every directory below "model/results"
# whose name starts with one of the prefixes is collected
relevant_model_execution_paths = []

for root, dir_names, _ in os.walk(project_dir / "model/results"):  # Iterate over all model execution directories
    for dir_name in dir_names:  # Not named "dir", which would shadow the builtin
        for parameterization, history_len in relevant_parameterizations:
            if dir_name.startswith(parameterization):  # Directory belongs to a relevant model execution
                relevant_model_execution_paths.append((Path(root) / dir_name, history_len))
                break  # First matching parameterization wins, so a directory is added at most once

relevant_model_execution_paths

[(WindowsPath('C:/Users/Padinator/Downloads/Module/Master/Kint/Watch-Tip/model/results/100000_0.85_30_20_False_False_50_89127_32'),
  30)]

In [115]:
# Load the stored "predictions.pickle" of every relevant model execution
predictions_with_test_data = [
    (execution_path, load_object_from_file(execution_path / "predictions.pickle"), execution_history_len)
    for execution_path, execution_history_len in relevant_model_execution_paths
]
print(type(predictions_with_test_data))
print(f"Found {len(predictions_with_test_data)} predictions with test data")

<class 'list'>
Found 1 predictions with test data


In [97]:
# # Visualize all users' movie histories for each parameterization
# for path, (X_test, y_test, predictions), history_len in predictions_with_test_data:
#     dataframe_path = path / "tsne_reduced_data.pickle"
#     print(f"Visualize data of {path}")

#     # handable_data_len = 30000 // history_len  # Use only handable part of data
#     handable_data_len = len(X_test)
#     X_test = X_test[:handable_data_len]
#     y_test = y_test[:handable_data_len]
#     predictions = predictions[:handable_data_len]
#     print(f"Lengths of data (X_test, y_test, predictions): {len(X_test)}, {len(y_test)}, {len(predictions)}")

#     # Create DataFrames with test data (X and y) and predictions
#     df_histories = pd.DataFrame(dict([(genre_name, [movie[i] for movies in X_test for movie in movies]) for i, genre_name in enumerate(genre_names)]))
#     df_targets = pd.DataFrame(dict([(genre_name, [movie[i] for movie in y_test]) for i, genre_name in enumerate(genre_names)]))
#     df_predictions = pd.DataFrame(dict([(genre_name, [movie[i] for movie in predictions]) for i, genre_name in enumerate(genre_names)]))

#     # Define colours for all these data sets
#     df_histories[col_colours] = [50] * df_histories.shape[0]
#     df_targets[col_colours] = [100] * df_targets.shape[0]
#     df_predictions[col_colours] = [150] * df_predictions.shape[0]

#     # Define colour labels
#     df_histories[col_colour_labels] = ["Movie history"] * df_histories.shape[0]
#     df_targets[col_colour_labels] = ["Target movie"] * df_targets.shape[0]
#     df_predictions[col_colour_labels] = ["Predicted movie"] * df_predictions.shape[0]

#     # Merge DataFrames to one
#     df = pd.concat([df_histories, df_targets, df_predictions])
#     df.shape[0]

#     # Visualize DataFrame
#     print(f"Size of DataFrame to visualize: {df.shape}")
#     visualize_data_with_pca_or_tsne(path_to_store_tsne_dim_reduced_data=dataframe_path, components=3, df_original=df, use_pca=False, title=f"Predictions of {path}")
#     print()

In [16]:
# Visualize all users' movie histories for each parameterization
def _merge_duplicate_movies(df_data: pd.DataFrame) -> pd.DataFrame:
    """
        Returns a new DataFrame in which every movie (column "movie")
        occurs only once. The first row of a movie is kept and the
        "label" of each further row of that movie is appended to the
        label of the kept row. Rows with movie == -1 are never merged
        (presumably these are the predictions - TODO confirm).
    """

    merged = {col: [] for col in df_data.columns}
    position_of_movie = {}  # Movie -> position of its kept row (assumes hashable movie values, e.g. IDs)

    for _, row in df_data.iterrows():
        movie = row["movie"]

        if movie != -1 and movie in position_of_movie:  # Movie is already in dict and no prediction
            merged["label"][position_of_movie[movie]] += row["label"]
            continue

        if movie != -1:
            position_of_movie[movie] = len(merged["movie"])

        for col in df_data.columns:  # Add new movies and predictions to dict
            merged[col].append(row[col])

    return pd.DataFrame(merged)


for path, _ in relevant_model_execution_paths:
    df_train_data = pd.read_pickle(path / "train_data.dataframe")
    df_test_data = pd.read_pickle(path / "test_data.dataframe")
    train_dataframe_path = path / "tsne_reduced_data_train.pickle"
    test_dataframe_path = path / "tsne_reduced_data_test.pickle"
    print(f"Visualize data of {path}")
    model = tf.keras.models.load_model(path / "lstm.keras")
    model.summary()  # Prints the summary itself; wrapping it in print() would add a stray "None"

    # Remove duplicate points and change labels
    df_train_data = _merge_duplicate_movies(df_train_data)
    df_test_data = _merge_duplicate_movies(df_test_data)

    # Define colours and colour labels for all these data sets
    for df_split in (df_train_data, df_test_data):
        df_split[col_colours] = [set_colour(label) for label in df_split["label"]]
        df_split[col_colour_labels] = [set_colour_label(label) for label in df_split["label"]]

    # Visualize DataFrame
    for dataframe_path, df, label in [(train_dataframe_path, df_train_data, "train"), (test_dataframe_path, df_test_data, "test")]:
        print(f"Size of DataFrame to visualize: {df.shape}")
        print(Counter(df["label"].values))
        print(Counter(df[col_colours].values))
        print(Counter(df[col_colour_labels].values))
        visualize_data_with_pca_or_tsne(path_to_store_tsne_dim_reduced_data=dataframe_path, components=3, df_original=df, use_pca=True, title=f"Predictions of {path} for {label}", ignore_columns=["movie", "label"])
        print()

Visualize data of C:\Users\Padinator\Downloads\Module\Master\Kint\Watch-Tip\model\results\100000_0.85_30_20_False_False_50_89127_32
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 30, 32)            1856      
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 19)                1235      
                                                                 
Total params: 27,923
Trainable params: 27,923
Non-trainable params: 0
_________________________________________________________________
None
Size of DataFrame to visualize: (100960, 23)
Counter({'train_data_preds': 89127, 'train_data_targettrain_data_input': 5012, 'train_data_inputtrain_data_target': 4949, 'train_data_

# Results:
- For train and test data: overall no movie is predicted very well and most movies are not predicted well at all
- The two figures each show two different shapes: one formed by the input and output/target data and one formed by the predictions
- It is unclear why TensorFlow reports an accuracy of 45% for the model. The train data shows two different shapes as well ...