In [102]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import re
import sys
import tensorflow as tf
import time

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from typing import Any, Dict, List, Tuple

# ---------- Import own python files ----------
sys.path.append('../')

import helper.variables as vars

from database.movie import Movies
from database.user import Users
from database.genre import Genres
from helper.file_system_interaction import load_object_from_file, save_object_in_file

In [2]:
# Define constants
HISTORY_LEN = 10  # NOTE(review): not referenced in the visible cells; presumably the "movie_history_len" passed to extract_features - confirm
MIN_MOVIE_HISTORY_LEN = 5  # Default "min_movie_history_len" of extract_features: shorter user histories are skipped
DISTANCE_TO_OTHER_MOVIES = 0.1  # NOTE(review): not referenced in the visible cells - confirm usage
TRAIN_DATA_RELATIONSHIP = 0.85  # NOTE(review): presumably the share of training data in a train/test split - confirm
SEED = 1234  # Seed passed to the numpy and TensorFlow random number generators

# Constants for computing the difference between multiple values
EPSILON = 50  # evaluate_model counts a prediction as correct, if its calc_distance value is <= EPSILON
INDEPENDENT_MAX_DIFF_PER_GENRE = 5  # Default "allowed_diff_per_value" of calc_distance
NUMBER_OF_INTERVALS = 5  # Default "number_of_intervals" of calc_distance (used there as divisor of a floor division)
# nan_movies = []

In [3]:
# Set seed of the global numpy and TensorFlow random number generators,
# so that random operations are reproducible between runs
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
def find_real_genres_to_all_user_movies(movies: Dict[int, Dict[str, Any]], users: Dict[int, Dict[str, Any]]) -> Dict[int, List[np.array]]:
    """
        Builds the movie history of every user out of the real genres of
        the reviewed movies.

        movies: maps a movie id to a dict containing the key "real_genres".
        users: maps a user id to a dict of reviews; every review contains
        the key "movie_id". The reviews are processed in ascending order
        of their keys.

        Returns a dict mapping each user id to a list of float64 numpy
        arrays (one array of real genres per reviewed movie).
    """

    histories = {}
    processed_reviews = 0  # Counts reviews over all users, only used for the progress output

    for user_id, reviews in users.items():
        history = []

        for review_key in sorted(reviews):  # Keys are unique, so sorting keys equals sorting items
            if processed_reviews % 1000 == 0:
                print(f"Iteration: {processed_reviews}")

            movie_id = int(reviews[review_key]["movie_id"])
            genre_vector = np.array(movies[movie_id]["real_genres"], dtype=np.float64)
            history.append(genre_vector)
            processed_reviews += 1

        histories[user_id] = history

    return histories


def find_real_genres_to_all_user_movies_for_visualization(movies: Dict[int,
        Dict[str, Any]], users: Dict[int, Dict[str, Any]],
        genres: Dict[int, Dict[str, str]]) -> pd.DataFrame:
    """
        Collects the real genres of all movies reviewed by the users in a
        flat table, which is handy for visualizing and analyzing the data.

        movies: maps a movie id to a dict containing the key "real_genres".
        users: maps a user id to a dict of reviews; every review contains
        the key "movie_id". The reviews are processed in ascending order
        of their keys.
        genres: dict whose values contain the key "name"; the order of the
        values defines the order of the genre columns.

        Returns a pandas DataFrame with one row per review: one column per
        genre name plus the column "username" holding the user id.
    """

    genre_names = [genre["name"] for genre in genres.values()]
    columns = {column_name: [] for column_name in genre_names + ["username"]}
    processed_reviews = 0  # Counts reviews over all users, only used for the progress output

    for user_id, reviews in users.items():
        for review_key in sorted(reviews):  # Keys are unique, so sorting keys equals sorting items
            if processed_reviews % 1000 == 0:
                print(f"Iteration: {processed_reviews}")

            movie_id = int(reviews[review_key]["movie_id"])
            real_movie_genres = movies[movie_id]["real_genres"]

            # The position of a genre name is the index of its value in the real genres
            for position, genre_name in enumerate(genre_names):
                columns[genre_name].append(real_movie_genres[position])

            columns["username"].append(user_id)
            processed_reviews += 1

    return pd.DataFrame(columns)


def extract_features(user_movie_histories: Dict[int, List[np.array]],
        movie_history_len: int, min_movie_history_len: int=MIN_MOVIE_HISTORY_LEN,
        fill_history_len_with_zero_movies=True) -> Tuple[int, List[Tuple[np.array, np.array]]]:
    """
        Extract features: partition user histories into windows of
        "movie_history_len" movies, so that the movie directly following
        a window is the target (= label) to predict.

        user_movie_histories: maps a user id to the list of movies
        (= arrays of real genres) of this user.
        movie_history_len: number of movies per feature window.
        min_movie_history_len: histories with fewer movies are skipped.
        fill_history_len_with_zero_movies: if True, a history with at most
        "movie_history_len" movies is used once: its last movie is the
        target and the previous movies are left-padded with zero movies up
        to "movie_history_len" entries (returned as a list of arrays). If
        False, such histories are skipped.

        Longer histories are cut into non-overlapping windows (returned as
        numpy arrays): window [i, i + movie_history_len) is paired with
        the movie at index i + movie_history_len.

        Returns a tuple consisting of the number of used histories and the
        list of all extracted (window, target) tuples.
    """

    all_extracted_features = []
    skipped_histories = 0

    for users_movie_history in user_movie_histories.values():  # Iterate over all users' histories
        history_len = len(users_movie_history)
        is_short_history = history_len <= movie_history_len

        # User has not watched enough movies (an empty history has no target at all)
        if history_len == 0 or history_len < min_movie_history_len\
                or (is_short_history and not fill_history_len_with_zero_movies):
            skipped_histories += 1
            continue

        if is_short_history:  # User has watched enough movies, but not many
            # Find movies and target/label
            movies = users_movie_history[:-1]
            target_label = users_movie_history[-1]

            # Fill missing movies with zeros
            number_of_missing_movies = movie_history_len - len(movies)
            zero_movie = np.zeros(target_label.shape[0])  # Create movie containing only 0 for all real genres
            zero_movies = list(np.tile(zero_movie, (number_of_missing_movies, 1)))

            # Create one list with zero movies and watched movies of a user
            all_extracted_features.append((zero_movies + movies, target_label))
        else:  # History is long enough to be cut into windows
            # The last valid window starts at "history_len - movie_history_len - 1",
            # because its target at "start + movie_history_len" must still exist
            for start in range(0, history_len - movie_history_len, movie_history_len):
                window = np.copy(users_movie_history[start:start + movie_history_len])
                target_label = users_movie_history[start + movie_history_len]  # Movie following the window
                all_extracted_features.append((window, target_label))

    used_histories = len(user_movie_histories) - skipped_histories
    print(f"Extracted histories of {used_histories} users")
    print(f"Skipped {skipped_histories} histories, because they have less than "\
          + f"{min_movie_history_len} movies in their history of movies")

    return used_histories, all_extracted_features


def calc_distance(ys_true: np.array, ys_pred: np.array, allowed_diff_per_value: float=INDEPENDENT_MAX_DIFF_PER_GENRE,
                  number_of_intervals: float=NUMBER_OF_INTERVALS) -> np.float64:
    """
        Computes the distance between the true and the predicted y values
        of one sample.

        For each pair of true and predicted y values an allowed difference
        is derived from the TRUE value: the higher the true y value, the
        higher the acceptable difference. The tolerance must not depend on
        the prediction, otherwise high predictions would widen their own
        tolerance.

        allowed_diff = (y_true // number_of_intervals + 1) * allowed_diff_per_value

        Example with allowed_diff_per_value = 0.5 and number_of_intervals = 5:
        Intervals of y_true: [0,5), [5,10), [10,15), [15,20), ...
        Allowed differences:  0.5      1      1.5       2     ...

        y_true = 86; y_pred = 80
        -> allowed difference is 9, actual difference is 6
        => y_pred is okay

        y_true = 4; y_pred = 10
        -> allowed difference is 0.5, actual difference is 6
        => y_pred is not okay

        ys_true, ys_pred: iterables of equal length with the true and the
        predicted values (e.g. one value per genre).
        allowed_diff_per_value: growth of the allowed difference per interval.
        number_of_intervals: despite its name this is the WIDTH of one
        interval (divisor of the floor division).

        Returns the sum of all differences being too high.
    """

    overall_diff = 0

    for y_true, y_pred in zip(ys_true, ys_pred):
        diff = abs(y_true - y_pred)
        allowed_diff = (y_true // number_of_intervals + 1) * allowed_diff_per_value

        if allowed_diff < diff:  # Only add differences, which are too high
            overall_diff += diff

    return overall_diff


def evaluate_model(y_test: np.array, predictions: np.array) -> float:
    """
        Evaluates a model by comparing each true test value with its
        corresponding predicted value. The distance of a pair is computed
        with calc_distance; a prediction counts as correct, if its distance
        is at most EPSILON.

        Prints the number of correct/false classifications, the accuracy
        and the mean deviations. The mean deviation of an empty group is
        reported as nan.

        Raises a ValueError, if there are no samples to evaluate.

        Returns the accuracy (share of correct classifications).
    """

    # Compute distances of pairs of predicted and true y values
    # (own distance per genre/value instead of the Euclidean distance between points)
    distances = [calc_distance(y, y_pred) for y, y_pred in zip(y_test, predictions)]

    if not distances:
        raise ValueError("Cannot evaluate a model without test samples")

    correct_classifications_distances = [dist for dist in distances if dist <= EPSILON]
    false_classifications_distances = [dist for dist in distances if EPSILON < dist]

    # One of both groups may be empty (e.g. a perfect model), so guard the divisions
    overall_mean_deviation = sum(distances) / len(distances)
    mean_deviation_from_correct_classifications = (
        sum(correct_classifications_distances) / len(correct_classifications_distances)
        if correct_classifications_distances else float("nan")
    )
    mean_deviation_from_false_classifications = (
        sum(false_classifications_distances) / len(false_classifications_distances)
        if false_classifications_distances else float("nan")
    )
    accuracy = len(correct_classifications_distances) / len(distances)

    # Output some metrics
    print(f"\nCorrect classifications: {len(correct_classifications_distances)}, "\
        + f"false classifications: {len(false_classifications_distances)}, "\
        + f"accuracy: {accuracy}")
    print(f"Correct classifications deviations: {mean_deviation_from_correct_classifications}")
    print(f"False classifications deviations: {mean_deviation_from_false_classifications}")
    print(f"Overall mean deviation: {overall_mean_deviation}")

    return accuracy

In [5]:
# Read data from database
# NOTE(review): the functions of this notebook treat the results as dicts
# (movies: movie id -> dict with key "real_genres", users: user id -> dict of reviews,
# genres: values with key "name") - confirm against the database classes
all_movies = Movies().get_all()
all_users = Users().get_all()
all_genres = Genres().get_all()

In [6]:
# Find the real genres of all movies the users have watched (one list of
# genre arrays per user) and store the result in a file for later use
user_movie_histories = find_real_genres_to_all_user_movies(all_movies, all_users)
save_object_in_file(vars.user_history_file_path_with_real_genres, user_movie_histories)

Iteration: 0
Iteration: 1000
Iteration: 2000
Iteration: 3000
Iteration: 4000
Iteration: 5000
Iteration: 6000
Iteration: 7000
Iteration: 8000
Iteration: 9000
Iteration: 10000
Iteration: 11000
Iteration: 12000
Iteration: 13000
Iteration: 14000
Iteration: 15000
Iteration: 16000
Iteration: 17000
Iteration: 18000
Iteration: 19000
Iteration: 20000
Iteration: 21000
Iteration: 22000
Iteration: 23000
Iteration: 24000
Iteration: 25000
Iteration: 26000
Iteration: 27000
Iteration: 28000
Iteration: 29000
Iteration: 30000
Iteration: 31000


In [7]:
# Read data again and create a pandas DataFrame with the real genres of all
# movies watched by users (one row per review, used for visualizations);
# the DataFrame is stored in a file and loaded again in a later cell
df_user_movie_histories = find_real_genres_to_all_user_movies_for_visualization(all_movies, all_users, all_genres)
save_object_in_file(vars.user_history_file_path_with_real_genres_visualization, df_user_movie_histories)
# print(nan_movies)

Iteration: 0
Iteration: 1000
Iteration: 2000
Iteration: 3000
Iteration: 4000
Iteration: 5000
Iteration: 6000
Iteration: 7000
Iteration: 8000
Iteration: 9000
Iteration: 10000
Iteration: 11000
Iteration: 12000
Iteration: 13000
Iteration: 14000
Iteration: 15000
Iteration: 16000
Iteration: 17000
Iteration: 18000
Iteration: 19000
Iteration: 20000
Iteration: 21000
Iteration: 22000
Iteration: 23000
Iteration: 24000
Iteration: 25000
Iteration: 26000
Iteration: 27000
Iteration: 28000
Iteration: 29000
Iteration: 30000
Iteration: 31000


In [116]:
def remove_duplicates(arr: pd.Series) -> List[str]:
    """
        Converts all entries of "arr" to strings and drops repeated
        strings. The order of the first occurrences is kept.
    """

    # A dict keeps the insertion order of its keys and ignores repeated keys
    return list(dict.fromkeys(str(entry) for entry in arr))


# Analyze principal components with PCA
def plotprincipalcomponents_with_PCA(components: int, df_original: pd.DataFrame, point_size: int=10, title: str="",
                                     genre_names: List[str]=None) -> None:
    """
        Computes "components" many principal components for the passed
        DataFrame "df_original" and plots the results. For this the number
        of components must be 2 for 2D or 3 for 3D.

        components: number of principal components (2 or 3).
        df_original: DataFrame with the numeric feature columns and
        optionally a column "colour_labels", which is only used to label
        the colour bar.
        point_size: size of all plotted points.
        title: title of the figure.
        genre_names: optional names of the feature columns to use. If it is
        None, all columns except "colour_labels" are used.

        The points are coloured by their value on the first principal
        component of a separate one-component PCA, scaled to [0, 255].
        The figure is opened with the plotly renderer "browser".
    """

    assert 2 <= components <= 3

    # Select the feature columns; the labels must not be part of the PCA
    if genre_names is not None:
        tmp_df = df_original.loc[:, list(genre_names)]
    else:
        tmp_df = df_original.loc[:, df_original.columns != "colour_labels"]

    # Compute principal components
    pca = PCA(n_components=components)
    res = pca.fit_transform(tmp_df.values)
    print(f"Variance/Amount of left/remaining information: {pca.explained_variance_ratio_}, lost information: {1 - sum(pca.explained_variance_ratio_)}")
    df = pd.DataFrame(data=res, columns=[f"c{i}" for i in range(res.shape[1])])
    print(f"Variance: {pca.explained_variance_}")

    # Plot results
    df["size"] = point_size
    colour = PCA(n_components=1).fit_transform(tmp_df.values).ravel()  # Set colour based on only one component
    colour_span = colour.max() - colour.min()
    # Min-max scaling to [0, 255]; identical points have no span and all get the colour 0
    df["colour"] = (colour - colour.min()) / colour_span * 255 if colour_span > 0 else 0.0

    has_colour_labels = "colour_labels" in df_original.columns

    if has_colour_labels:
        # Labels are stored as one-element lists, so strip brackets and quotes of their string form
        colour_labels = [re.sub("'", "", str(label)[1:-1]) for label in remove_duplicates(df_original["colour_labels"])]
        # NOTE(review): the ticks are spread evenly over the colour bar; they are not
        # derived from the colour values of the movies carrying the label - confirm intent
        colours = [(i + 1) / len(colour_labels) * 255 for i in range(len(colour_labels))]
        print("Colour labels:\n", colour_labels)

    if components == 3:
        fig = px.scatter_3d(df, x="c0", y="c1", z="c2", size="size", color="colour")
    else:
        fig = px.scatter(df, x="c0", y="c1", size="size", color="colour")

    if has_colour_labels:
        fig.update_coloraxes(colorbar_tickvals=colours, colorbar_ticktext=colour_labels)

    fig.update_layout(title_text=title, title_x=0.5)
    fig.show(renderer='browser')

# Analyze all movies with PCA

In [96]:
def find_two_highest_genres(movie_genres: np.array) -> List[int]:
    """
        Determines the positions of the two highest genre values of the
        passed movie. Equal values are ranked by their position, so the
        later position wins a tie.

        Returns both positions as a list in ascending order.
    """

    # Rank by value first and by position second
    ranked = sorted(enumerate(movie_genres), key=lambda entry: (entry[1], entry[0]))
    runner_up, best = ranked[-2:]  # Raises a ValueError for less than two genres

    return sorted([best[0], runner_up[0]])


# Create DataFrame containing all movies: one column per genre, one row per movie
genre_names = np.array([genre["name"] for genre in all_genres.values()])
df_all_movies_real_genres = pd.DataFrame({
    genre_name: [movie["real_genres"][i] for movie in all_movies.values()]
    for i, genre_name in enumerate(genre_names)
})

# Label each movie with the names of its two highest genres. The rows are read
# from one numpy matrix, because iterrows() is very slow for about one million movies.
genre_matrix = df_all_movies_real_genres.to_numpy()
df_all_movies_real_genres["colour_labels"] = [
    str(genre_names[find_two_highest_genres(movie_genres)]) for movie_genres in genre_matrix
]

# Sort by the string labels, so that movies with the same label are neighbours
df_all_movies_real_genres = df_all_movies_real_genres.sort_values("colour_labels")

# Turn "['Action' 'Adventure']" into the one-element list ["Action Adventure"]
df_all_movies_real_genres["colour_labels"] = [re.sub("'", "", label[1:-1]).split(",") for label in df_all_movies_real_genres["colour_labels"]]
df_all_movies_real_genres

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,colour_labels
945912,100.000000,93.563224,0.24812,4.339570,2.845786,0.425349,3.825607,1.227941,0.372180,0.053169,1.936856,0.873484,0.265843,0.159506,0.513963,1.015267,3.639095,0.053169,0.248120,[Action Adventure]
543237,98.176640,95.422535,0.00000,30.879371,0.000000,0.000000,11.399248,0.000000,0.779741,0.538452,0.693248,0.098506,0.881062,0.711439,0.098506,0.000000,0.177311,0.098506,0.000000,[Action Adventure]
543233,77.500000,77.500000,0.00000,70.833333,0.000000,5.000000,31.666667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[Action Adventure]
469805,83.333333,77.582535,0.00000,39.190628,4.259851,0.000000,30.830671,0.000000,11.608094,0.000000,6.815761,0.000000,2.555911,23.056443,7.241747,0.000000,6.815761,0.000000,0.000000,[Action Adventure]
821254,90.909091,78.625790,0.00000,12.400179,10.059244,0.000000,24.176957,0.000000,16.934516,0.000000,49.135770,0.045396,3.749330,3.483438,74.927617,0.000000,0.853701,0.000000,52.941114,[Action Adventure]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526799,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western]
608376,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western]
923071,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western]
673686,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western]


In [97]:
# Check data for null values: the non-null count of every column
# must be equal to the number of entries
df_all_movies_real_genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 997216 entries, 945912 to 498607
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Action           997216 non-null  float64
 1   Adventure        997216 non-null  float64
 2   Animation        997216 non-null  float64
 3   Comedy           997216 non-null  float64
 4   Crime            997216 non-null  float64
 5   Documentary      997216 non-null  float64
 6   Drama            997216 non-null  float64
 7   Family           997216 non-null  float64
 8   Fantasy          997216 non-null  float64
 9   History          997216 non-null  float64
 10  Horror           997216 non-null  float64
 11  Music            997216 non-null  float64
 12  Mystery          997216 non-null  float64
 13  Romance          997216 non-null  float64
 14  Science Fiction  997216 non-null  float64
 15  TV Movie         997216 non-null  float64
 16  Thriller         997216 non-null  floa

In [98]:
# Output summary statistics (count, mean, std, min, quartiles, max) of all numeric columns
df_all_movies_real_genres.describe()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
count,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0
mean,8.561787,4.556302,7.550262,24.261722,6.722033,23.063166,39.572234,4.717047,4.206436,3.300081,9.082113,7.457313,3.803111,9.459366,3.906891,4.561328,8.933358,2.00999,1.611904
std,19.593822,12.104175,23.348121,31.604919,14.659521,36.283449,38.772365,13.94602,12.268846,9.817007,22.032252,20.509554,11.357039,18.045068,12.594768,13.999744,18.969869,7.520119,8.667786
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.520833,0.347222,0.0,10.305542,1.0,2.380952,28.796658,0.190885,0.48054,0.269542,0.700432,0.481363,0.258,2.095052,0.122791,0.0,1.346822,0.0,0.0
75%,6.400087,3.857935,1.36329,34.468391,6.932611,31.771764,75.0,3.209255,3.182273,2.957602,5.555556,3.671855,3.054238,10.450283,2.284968,2.315705,8.333333,1.388889,0.128208
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [109]:
# Do a 2D plot of all movies (two principal components)
plotprincipalcomponents_with_PCA(2, df_all_movies_real_genres, point_size=30)

Variance/Amount of left/remaining information: [0.28204341 0.13792403]
Variance: [2187.90636077 1069.92347426]
Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Mystery', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Crime', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Horror', 'Adventure Music', 'Adventure Mystery', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Crime', 'Animation Documentary', 'Animation Drama', 'Animation Family', 'Animation Fantasy', 'Animation History', 'Animation Horror', 'Animation Music',

In [118]:
# Do one 3D plot per part of all one million movies (plotly cannot handle all the movies at the same time)
number_of_parts = 5
number_of_rows = df_all_movies_real_genres.shape[0]
part_len = number_of_rows // number_of_parts

for i in range(number_of_parts):
    first_row = i * part_len
    # The last part also takes the rows left over by the integer division
    last_row = number_of_rows if i == number_of_parts - 1 else (i + 1) * part_len
    title = f"Rows of data: {first_row} - {last_row}"
    print(f"\n{title}")
    plotprincipalcomponents_with_PCA(3, df_all_movies_real_genres.iloc[first_row:last_row], point_size=30, title=title)
    time.sleep(10)  # Pause between the plots, each one is opened in the browser
    print()


Data: 0 - 199443
Variance/Amount of left/remaining information: [0.31709074 0.16446088 0.12964409], lost information: 0.38880428806452927
Variance: [2760.1490767  1431.56673209 1128.50039631]
Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Mystery', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Crime', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Horror', 'Adventure Music', 'Adventure Mystery', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Crime', 'Animation Documentary', 'Animation Drama', 'Animation Family'

# Results:
You can see blue points in the 2D and 3D geometric figures. The white colour originates from the outlines of the points; many overlapping points result in a large white region. The plots show that all movies form one huge "group". The reason for this is that a movie with the genres "Action" and "Science Fiction" lies near a movie with the genres "Action" and "Comedy", which in turn lies near a movie with the genres "Comedy" and "Horror". This means that you can get from one movie to the next, as if they were links of a chain.
Both movie plots look very similar to a cube. This is probably an effect of the "linear dimensionality reduction" performed by PCA.

# Analyze all movies, users have watched, with PCA

In [37]:
# Read the movies, users have watched, from the file written in an earlier cell
df_user_movie_histories = load_object_from_file(vars.user_history_file_path_with_real_genres_visualization)
df_user_movie_histories = df_user_movie_histories.loc[:, df_user_movie_histories.columns != "username"]  # Ignore column with usernames

In [49]:
# Check data for null values: the non-null count of every column
# must be equal to the number of entries
df_user_movie_histories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31297 entries, 0 to 31296
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Action           31297 non-null  float64
 1   Adventure        31297 non-null  float64
 2   Animation        31297 non-null  float64
 3   Comedy           31297 non-null  float64
 4   Crime            31297 non-null  float64
 5   Documentary      31297 non-null  float64
 6   Drama            31297 non-null  float64
 7   Family           31297 non-null  float64
 8   Fantasy          31297 non-null  float64
 9   History          31297 non-null  float64
 10  Horror           31297 non-null  float64
 11  Music            31297 non-null  float64
 12  Mystery          31297 non-null  float64
 13  Romance          31297 non-null  float64
 14  Science Fiction  31297 non-null  float64
 15  TV Movie         31297 non-null  float64
 16  Thriller         31297 non-null  float64
 17  War         

In [50]:
# Output summary statistics (count, mean, std, min, quartiles, max) of all numeric columns
df_user_movie_histories.describe()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
count,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0,31297.0
mean,32.646159,22.482875,8.008624,44.852499,20.649398,9.166345,64.163479,10.745926,12.087524,6.502594,21.954064,6.526495,12.500575,20.631506,17.828251,7.53534,34.311569,5.354625,6.150732
std,33.55578,26.664071,20.936755,29.514556,21.385531,18.189337,31.311869,19.144341,18.31299,11.662312,29.52964,11.906326,17.387738,20.366151,24.91223,12.829373,30.407717,10.681122,15.658816
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.163763,5.355105,0.716066,23.601658,7.464606,2.5517,39.703894,2.642358,3.108037,1.970279,4.982686,1.995497,4.180713,8.519193,3.537708,1.958449,12.311912,1.149812,0.373387
50%,17.720703,12.135269,2.063377,35.718801,13.945126,4.403303,63.348782,5.186839,5.891448,3.797496,9.152846,3.646601,7.379273,14.060104,7.941448,4.088314,22.473031,2.540468,1.275279
75%,49.330547,22.701339,4.51032,62.151469,22.066248,7.644124,100.0,8.432858,10.993816,6.167552,17.571093,6.237184,11.175749,22.061036,15.869207,8.139896,50.558817,5.128104,3.331964
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [73]:
# Do a 2D plot of the movies, users have watched (two principal components)
# NOTE(review): this call passes the keyword "genre_names" and a DataFrame without the
# column "colour_labels" - verify that plotprincipalcomponents_with_PCA supports both
plotprincipalcomponents_with_PCA(2, df_user_movie_histories, genre_names=genre_names, point_size=30)

Variance/Amount of left/remaining information: [0.20759312 0.15768275]
Variance: [1983.12180073 1506.33176241]


In [42]:
# Do a 3D plot of the movies, users have watched (three principal components)
# NOTE(review): this call passes the keyword "genre_names" and a DataFrame without the
# column "colour_labels" - verify that plotprincipalcomponents_with_PCA supports both
plotprincipalcomponents_with_PCA(3, df_user_movie_histories, genre_names=genre_names, point_size=30)

Variance/Amount of left/remaining information: [0.20759312 0.15768275 0.12378476]
Variance: [1983.12180073 1506.33176241 1182.50669601]


<Figure size 1200x1200 with 0 Axes>

# Results:
This is basically the same result as in the plot of all movies. The only difference is the number of points/movies, which leads to a different figure in the 3D plot: it looks similar to half a cube.

In [None]:
# Visualize data
# TODO: Look for mean genres
# TODO: Compare genres
# TODO: Cluster genres
# TODO: View the history of movies, the users have watched => change of genres over time
# TODO: Define a custom loss function