In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import sys
import tensorflow as tf

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from typing import Any, Dict, List, Tuple

# ---------- Import own python files ----------
sys.path.append('../')

import helper.variables as vars

from database.movie import Movies
from database.user import Users
from database.genre import Genres
from helper.file_system_interaction import load_object_from_file, save_object_in_file

In [2]:
# Define constants
HISTORY_LEN = 10  # Not referenced in the cells shown here; presumably the window length passed to extract_features -- TODO confirm
MIN_MOVIE_HISTORY_LEN = 5  # Default of extract_features(min_movie_history_len): shorter user histories are skipped
DISTANCE_TO_OTHER_MOVIES = 0.1  # Not referenced in the cells shown here -- TODO confirm purpose
TRAIN_DATA_RELATIONSHIP = 0.85  # Not referenced in the cells shown here; presumably the train share of a train/test split -- TODO confirm
SEED = 1234  # Seed for the NumPy and TensorFlow random number generators

# Constants for computing the difference between true and predicted values
EPSILON = 50  # evaluate_model() counts a prediction as correct if its distance is <= EPSILON
INDEPENDENT_MAX_DIFF_PER_GENRE = 5  # Default of calc_distance(allowed_diff_per_value)
NUMBER_OF_INTERVALS = 5  # Default of calc_distance(number_of_intervals); used there as the divisor of a floor division
# nan_movies = []  # Disabled debug list for ids of movies whose real genres contain NaN

In [3]:
# Set seed of the global NumPy and TensorFlow random number generators
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
def find_real_genres_to_all_user_movies(movies: Dict[int, Dict[str, Any]], users: Dict[int, Dict[str, Any]]) -> Dict[int, List[np.array]]:
    """
        Collect, for every user, the real genres of all movies the user
        has reviewed.

        "movies" maps a movie id to a movie dict with a "real_genres"
        entry. "users" maps a user id to the reviews of this user; every
        review needs a "movie_id" entry. The reviews of a user are
        processed in ascending order of their keys (presumably the
        creation dates -- verify against the database layer).

        Prints a progress line for every 1000th processed review.
        Returns a dict mapping each user id to a list with one float64
        numpy array of real genres per reviewed movie.
    """

    histories = {}
    processed_reviews = 0  # Counts reviews over all users, only used for the progress output

    for user_id, reviews in users.items():
        genre_vectors = []
        histories[user_id] = genre_vectors

        # Sorting the (key, review) pairs orders the reviews of this user by their keys
        for _, review in sorted(reviews.items()):
            if processed_reviews % 1000 == 0:
                print(f"Iteration: {processed_reviews}")
            processed_reviews += 1

            movie = movies[int(review["movie_id"])]
            genre_vectors.append(np.array(movie["real_genres"], dtype=np.float64))

    return histories


def find_real_genres_to_all_user_movies_for_visualization(movies: Dict[int,
        Dict[str, Any]], users: Dict[int, Dict[str, Any]],
        genres: Dict[int, Dict[str, str]]) -> pd.DataFrame:
    """
        Collect the real genres of all movies reviewed by users in a flat
        table, which is handy for visualizing and analyzing the read data.

        The resulting pandas DataFrame has one row per review, one column
        per genre name (in the order of "genres") and a column "username"
        holding the id of the reviewing user. The reviews of a user are
        processed in ascending order of their keys.

        Prints a progress line for every 1000th processed review.
    """

    genre_names = [genre["name"] for genre in genres.values()]
    columns = {column_name: [] for column_name in genre_names + ["username"]}
    processed_reviews = 0  # Counts reviews over all users, only used for the progress output

    for user_id, reviews in users.items():
        # Sorting the (key, review) pairs orders the reviews of this user by their keys
        for _, review in sorted(reviews.items()):
            if processed_reviews % 1000 == 0:
                print(f"Iteration: {processed_reviews}")
            processed_reviews += 1

            real_movie_genres = movies[int(review["movie_id"])]["real_genres"]

            # The j-th real genre value belongs to the j-th genre name
            for position in range(len(genre_names)):
                columns[genre_names[position]].append(real_movie_genres[position])
            columns["username"].append(user_id)

            # Disabled debug hook: remember movies whose real genres contain NaN
            # if np.isnan(np.min(real_movie_genres)):
            #     nan_movies.append(movie_id)

    return pd.DataFrame(columns)


def extract_features(user_movie_histories: Dict[int, List[np.array]],
        movie_history_len: int, min_movie_history_len: int=MIN_MOVIE_HISTORY_LEN,
        fill_history_len_with_zero_movies=True) -> Tuple[int, List[Tuple[np.ndarray, np.ndarray]]]:
    """
        Extract features: partition user histories into windows of
        "movie_history_len" consecutive movies, so that the movie directly
        following a window is the target (= label) to predict from it.

        Handling of a single user history:
        - Fewer than "min_movie_history_len" movies (or no movie at all):
          the history is skipped.
        - At most "movie_history_len" movies: the last movie is the target
          and all previous movies form one window, which is padded at the
          front with all-zero movies up to "movie_history_len" entries.
          If "fill_history_len_with_zero_movies" is False, such a history
          is skipped instead.
        - More movies: the history is cut into non-overlapping windows
          (stride "movie_history_len"); each window is paired with the
          movie following it.

        Every movie is expected to be a 1-D numpy array with one value per
        genre. Every window is returned as one numpy array with a row per
        movie.

        Returns a tuple (number of used histories, list of
        (window, target) samples).
    """

    all_extracted_features = []
    skipped_histories = 0

    for users_movie_history in user_movie_histories.values():  # Iterate over all users' histories
        history_len = len(users_movie_history)
        # An empty history has no target, so it can never produce a sample
        too_short = history_len == 0 or history_len < min_movie_history_len
        needs_padding = history_len <= movie_history_len

        if too_short or (needs_padding and not fill_history_len_with_zero_movies):
            skipped_histories += 1  # User has not watched enough movies
            continue

        if needs_padding:  # User has watched enough movies, but fewer than a full window plus target
            movies = users_movie_history[:-1]
            target_label = users_movie_history[-1]

            # Pad the front of the window with movies containing only 0 for all real genres
            number_of_missing_movies = movie_history_len - len(movies)
            zero_movies = np.zeros((number_of_missing_movies, target_label.shape[0]))
            window = np.vstack([zero_movies, *movies])
            all_extracted_features.append((window, target_label))
        else:  # History is long enough for at least one full window plus target
            # The last valid window start is history_len - movie_history_len - 1,
            # because the target at "start + movie_history_len" must still exist
            for start in range(0, history_len - movie_history_len, movie_history_len):
                end = start + movie_history_len
                window = np.array(users_movie_history[start:end])
                all_extracted_features.append((window, users_movie_history[end]))

    used_histories = len(user_movie_histories) - skipped_histories
    print(f"Extracted histories of {used_histories} users")
    print(f"Skipped {skipped_histories} histories, because they have less than "\
          + f"{min_movie_history_len} movies in their history of movies")

    return used_histories, all_extracted_features


def calc_distance(ys_true: np.float64, ys_pred: np.float64, allowed_diff_per_value: float=INDEPENDENT_MAX_DIFF_PER_GENRE,
                  number_of_intervals: float=NUMBER_OF_INTERVALS) -> np.float64:
    """
        Computes a distance between true and predicted y values, which
        only counts deviations exceeding a value dependent tolerance.

        "ys_true" and "ys_pred" are iterated pairwise. For each pair the
        absolute difference is compared with the tolerance
        (y_pred // number_of_intervals + 1) * allowed_diff_per_value,
        i.e. "number_of_intervals" acts as the width of an interval and
        the tolerance grows by "allowed_diff_per_value" per interval.

        Example with allowed_diff_per_value=0.5, number_of_intervals=5:
        y_true = 86; y_pred = 80
        -> tolerance is 8.5, difference is 6
        => not counted

        y_true = 4; y_pred = 10
        -> tolerance is 1.5, difference is 6
        => counted

        NOTE(review): the tolerance is derived from the PREDICTED value.
        Earlier documentation of this function suggested that it depends
        on the true value -- confirm which one is intended.

        Returns the sum of all differences exceeding their tolerance
        (0 if there is none).
    """

    # Lazily pair every absolute difference with its tolerance
    diffs_with_tolerance = (
        (abs(y_true - y_pred), (y_pred // number_of_intervals + 1) * allowed_diff_per_value)
        for y_true, y_pred in zip(ys_true, ys_pred)
    )

    # Only add differences, which are too high
    return sum(diff for diff, allowed_diff in diffs_with_tolerance if allowed_diff < diff)


def evaluate_model(y_test: np.array, predictions: np.array) -> float:
    """
        Evaluates a model by comparing each true test value with its
        corresponding predicted value.

        The distance of every (true, predicted) pair is computed with
        calc_distance. A prediction counts as correct if its distance is
        at most EPSILON, otherwise as false. The counts, the accuracy and
        the mean distances (overall, of correct and of false
        classifications) are printed; the mean of an empty group is
        reported as nan.

        Raises a ValueError, if there are no samples to evaluate.
        Returns the accuracy (share of correct classifications).
    """

    def _mean_or_nan(values: List[float]) -> float:
        # A group may be empty (e.g. no false classifications at all)
        return sum(values) / len(values) if values else float("nan")

    # Compute distances of pairs of true and predicted y values
    # (own distance per genre/value instead of e.g. np.linalg.norm(y - y_pred))
    distances = [calc_distance(y, y_pred) for y, y_pred in zip(y_test, predictions)]

    if not distances:
        raise ValueError("Cannot evaluate a model without any pair of true and predicted values")

    # Output some metrics
    correct_classifications_distances = [dist for dist in distances if dist <= EPSILON]
    false_classifications_distances = [dist for dist in distances if EPSILON < dist]
    accuracy = len(correct_classifications_distances) / len(distances)

    print(f"\nCorrect classifications: {len(correct_classifications_distances)},"\
        + f"false classifications: {len(false_classifications_distances)}, "\
        + f"accuracy: {accuracy}")
    print(f"Correct classifications deviations: {_mean_or_nan(correct_classifications_distances)}")
    print(f"False classifications deviations: {_mean_or_nan(false_classifications_distances)}")
    print(f"Overall mean deviation: {_mean_or_nan(distances)}")

    return accuracy

In [5]:
# Read data from database
all_movies = Movies().get_all()  # Used below as dict: movie id -> movie dict with entry "real_genres"
all_users = Users().get_all()  # Used below as dict: user id -> dict of reviews, each with entry "movie_id"
all_genres = Genres().get_all()  # Used below as dict whose values are genre dicts with entry "name"

In [6]:
# Find the real genres of all movies the users have watched (one list of genre arrays per user)
# and store the result in a file, so that it can be reused without recomputing it
user_movie_histories = find_real_genres_to_all_user_movies(all_movies, all_users)
save_object_in_file(vars.user_history_file_path_with_real_genres, user_movie_histories)

Iteration: 0
Iteration: 1000
Iteration: 2000
Iteration: 3000
Iteration: 4000
Iteration: 5000
Iteration: 6000
Iteration: 7000
Iteration: 8000
Iteration: 9000
Iteration: 10000
Iteration: 11000
Iteration: 12000
Iteration: 13000
Iteration: 14000
Iteration: 15000
Iteration: 16000
Iteration: 17000
Iteration: 18000
Iteration: 19000
Iteration: 20000
Iteration: 21000
Iteration: 22000
Iteration: 23000
Iteration: 24000
Iteration: 25000
Iteration: 26000
Iteration: 27000
Iteration: 28000
Iteration: 29000
Iteration: 30000
Iteration: 31000


In [7]:
# Read data again and create a pandas DataFrame with the real genres of all movies
# watched by users (one row per review) and store it in a file for the visualizations
df_user_movie_histories = find_real_genres_to_all_user_movies_for_visualization(all_movies, all_users, all_genres)
save_object_in_file(vars.user_history_file_path_with_real_genres_visualization, df_user_movie_histories)
# print(nan_movies)  # Disabled debug output of movies whose real genres contain NaN

Iteration: 0
Iteration: 1000
Iteration: 2000
Iteration: 3000
Iteration: 4000
Iteration: 5000
Iteration: 6000
Iteration: 7000
Iteration: 8000
Iteration: 9000
Iteration: 10000
Iteration: 11000
Iteration: 12000
Iteration: 13000
Iteration: 14000
Iteration: 15000
Iteration: 16000
Iteration: 17000
Iteration: 18000
Iteration: 19000
Iteration: 20000
Iteration: 21000
Iteration: 22000
Iteration: 23000
Iteration: 24000
Iteration: 25000
Iteration: 26000
Iteration: 27000
Iteration: 28000
Iteration: 29000
Iteration: 30000
Iteration: 31000


In [32]:
# Analyze principal components with PCA
def plotprincipalcomponents_with_PCA(components: int, df_original: pd.DataFrame, point_size: int=10, colour_time_dependence=False) -> None:
    """
        Computes "components" many principal components for the passed
        DataFrame "df_original" and shows them as a plotly scatter plot
        in the browser. For this the number of components must be 2 for
        a 2D plot or 3 for a 3D plot.

        The explained variance ratio and the explained variance of the
        computed components are printed. All points are drawn with the
        same size "point_size".

        "colour_time_dependence" is accepted for compatibility, but it is
        currently not evaluated: the points are always drawn in the
        default colour of plotly.
    """

    assert components in (2, 3), "components must be 2 (2D plot) or 3 (3D plot)"

    # Compute principal components
    pca = PCA(n_components=components)
    res = pca.fit_transform(df_original.values)
    print(f"Variance/Amount of left/remaining information: {pca.explained_variance_ratio_}")
    df = pd.DataFrame(data=res, columns=[f"c{i}" for i in range(res.shape[1])])
    print(f"Variance: {pca.explained_variance_}")

    # Plot results (no matplotlib figure is created: plotly renders the plot on its own)
    df["size"] = point_size  # Same marker size for all points

    if components == 3:
        fig = px.scatter_3d(df, x="c0", y="c1", z="c2", size="size")
    else:
        fig = px.scatter(df, x="c0", y="c1", size="size")
    fig.show(renderer='browser')

# Analyze all movies with PCA

In [16]:
# Create DataFrame containing all movies: one row per movie, one column per genre
genre_names = [genre["name"] for genre in all_genres.values()]
df_all_movies_real_genres = pd.DataFrame({
    # The value at position genre_idx of "real_genres" belongs to the genre_idx-th genre name
    genre_name: [movie["real_genres"][genre_idx] for movie in all_movies.values()]
    for genre_idx, genre_name in enumerate(genre_names)
})
df_all_movies_real_genres

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,11.471323,10.945891,3.624746,100.000000,18.891632,1.595545,43.974557,30.192369,2.574224,1.958034,3.694293,8.422951,9.908445,23.921803,2.832122,0.402453,8.083231,4.509272,14.059479
1,96.003986,97.446822,11.902378,44.490820,15.805669,19.743201,58.530361,19.019827,14.233797,6.748253,13.105939,7.779188,8.744115,16.950337,84.109544,18.608901,21.259794,10.667174,4.645339
2,12.575415,8.295626,0.000000,0.000000,0.000000,66.666667,2.356712,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,100.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,10.467305,3.389747,3.838966,37.535104,11.052938,4.119213,100.000000,1.877240,12.409426,2.268761,6.123895,5.732374,4.246655,7.600629,3.474234,9.454980,14.336959,2.941353,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997211,2.259036,0.421687,0.210843,100.000000,5.030120,3.232932,15.732932,0.632530,2.319277,0.000000,3.885542,0.210843,1.897590,13.155957,2.259036,11.248327,6.837349,0.000000,0.000000
997212,4.496671,5.939423,0.042379,52.438144,9.880729,54.719781,75.000000,0.702240,1.542384,7.820729,8.982686,7.625362,4.773859,18.894223,2.097616,0.847095,15.704231,1.299999,3.494319
997213,0.000000,0.000000,100.000000,44.284759,0.000000,25.980392,10.773173,0.000000,1.949643,4.289216,1.626560,5.403298,0.000000,6.361408,5.670677,0.000000,0.000000,0.122549,1.381462
997214,0.000000,0.000000,0.000000,0.000000,0.000000,7.017544,66.666667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.263158,0.000000,0.000000


In [19]:
# Do a 2D plot of the first two principal components of all movies
plotprincipalcomponents_with_PCA(2, df_all_movies_real_genres, point_size=30)

Variance/Amount of left/remaining information: [0.28204341 0.13792403]
Variance: [2187.90636077 1069.92347426]


<Figure size 1200x1200 with 0 Axes>

In [35]:
# Do a 3D plot of the first 500,000 movies (the plot cannot handle all the movies at the same time)
plotprincipalcomponents_with_PCA(3, df_all_movies_real_genres.iloc[:500000], point_size=30)

Variance/Amount of left/remaining information: [0.28411353 0.13440818 0.10442088]
Variance: [2241.53422043 1060.42302885  823.83610756]


<Figure size 1200x1200 with 0 Axes>

In [36]:
# Do a 3D plot of all remaining movies from row 500,000 on (the plot cannot handle all the movies at the same time)
plotprincipalcomponents_with_PCA(3, df_all_movies_real_genres.iloc[500000:], point_size=30)

Variance/Amount of left/remaining information: [0.27486164 0.14962803 0.10653186]
Variance: [2051.9983316  1117.05827137  795.32084733]


<Figure size 1200x1200 with 0 Axes>

# Results:
You can see blue points in 2D and 3D geometric figures. The white colour originates from the outlines of the points: many overlapping points result in a huge white region. This shows that all movies form one huge "group". The reason for this is that a movie with the genres "Action" and "Science Fiction" lies near a movie with the genres "Action" and "Comedy", which in turn lies near a movie with the genres "Comedy" and "Horror". This means that you can get from one movie to the next, as in a chain.
Both movie plots look very similar to a cube. This is probably an effect of the "linear dimensionality reduction" performed by PCA.

# Analyze all movies, users have watched, with PCA

In [37]:
# Read the movies the users have watched from file
df_user_movie_histories = load_object_from_file(vars.user_history_file_path_with_real_genres_visualization)
# Keep only the genre columns: the column "username" holds user ids and must not be passed to the PCA
df_user_movie_histories = df_user_movie_histories.loc[:, df_user_movie_histories.columns != "username"]  # Ignore column with usernames

In [41]:
# Do a 2D plot of the first two principal components of all movies watched by users
plotprincipalcomponents_with_PCA(2, df_user_movie_histories, point_size=30)

Variance/Amount of left/remaining information: [0.20759312 0.15768275]
Variance: [1983.12180073 1506.33176241]


<Figure size 1200x1200 with 0 Axes>

In [42]:
# Do a 3D plot of the first three principal components of all movies watched by users
plotprincipalcomponents_with_PCA(3, df_user_movie_histories, point_size=30)

Variance/Amount of left/remaining information: [0.20759312 0.15768275 0.12378476]
Variance: [1983.12180073 1506.33176241 1182.50669601]


<Figure size 1200x1200 with 0 Axes>

# Results:
This is basically the same result as in the plot of all movies. The only difference is the number of points/movies, which leads to a different figure in the 3D plot: it looks similar to half a cube.

In [None]:
# Visualize data
# TODO: Look for mean genres
# TODO: Compare genres
# TODO: Cluster genres
# TODO: View the history of movies a user has watched => change of genres over time
# TODO: Define a custom loss function