In [74]:
import numpy as np
import pandas as pd
import os
import plotly
import plotly.express as px
import re
import sys
import time

from MulticoreTSNE import MulticoreTSNE as TSNE
from pathlib import Path
from sklearn.decomposition import PCA
from typing import List

# ---------- Import own python modules ----------
project_dir = Path(os.path.abspath("")).parents[0]
sys.path.append(str(project_dir))

import helper.variables as vars

from database.movie import Movies
from database.user import Users
from database.genre import Genres
from helper.file_system_interaction import load_object_from_file

In [2]:
# Define constants like seed etc.
SEED = 1234
cpu_kernel_for_tsne = 16
# nan_movies = []

# Define column names for DataFrames
col_colours = "colour"
col_colour_labels = "colour_labels"

# Set seed
np.random.seed(SEED)

In [3]:
# Read data from database
all_movies = Movies().get_all()
all_users = Users().get_all()
all_genres = Genres().get_all()
genre_names = np.array([genre["name"] for genre in all_genres.values()])

In [1]:
def remove_duplicates(arr: pd.Series) -> List[str]:
    seen = set()
    seen_add = seen.add  # Faster than calling seen.add itself
    return [str(x) for x in arr if not (str(x) in seen or seen_add(str(x)))]


def visualize_data_with_pca_or_tsne(components: int, df_original: pd.DataFrame, use_pca: bool=True,
                                    time_dependence=False, point_size: int=10, title: str="", ignore_columns: List[str]=[]) -> None:
    """
        Computes "components" many principal components for the passed
        DataFrame "df" and plots the results. For this the number of
        components must be 2 for 2D or 3 for 3D.
        If column "colour" and "colour_labels" are already defined,
        then the predfined ones will be used.
        Pass is PCA should be used to compute principal components (True)
        or if t-SNE should be used (False).
    """

    global col_colours, col_colour_labels, cpu_kernel_for_tsne

    assert 2 <= components <= 3

    # Define some variables
    col_size = "size"
    custom_data = ["movie"] if "movie" in df_original.columns else []  # For labeling points with movie names

    # Compute principal components
    tmp_df = df_original.loc[:, df_original.columns != col_colour_labels]  # Ignore column colour for computing the principal components
    tmp_df = df_original[[col for col in df_original.columns if col not in [col_colours, col_colour_labels] + ignore_columns]]

    if use_pca:
        model = PCA(n_components=components, random_state=SEED)
    else:
        model = TSNE(n_components=components, n_jobs=cpu_kernel_for_tsne, random_state=SEED)  # Initialization "random" only supported

    res = model.fit_transform(tmp_df.values)
    df = pd.DataFrame(data=res, columns=[f"c{i}" for i in range(len(res[0]))])

    if col_colours in df_original.columns:  # Use colours from original DataFrame
        df[col_colours] = df_original[col_colours].values

    if use_pca:
        print(f"Variance/Amount of left/remaining information: {model.explained_variance_ratio_}, lost information: {1 - sum(model.explained_variance_ratio_)}")
        print(f"Variance: {model.explained_variance_}")

    # Find all labels (= genres) for different colours (no duplicates)
    colour_labels = [re.sub("'", "", str(label)[1:-1]) for label in remove_duplicates(df_original[col_colour_labels])]  # Remove duplicates
    colour_labels_str = [str(colour_label) for colour_label in colour_labels]
    colours = [(i + 1) / len(colour_labels) * 255 for i in range(len(colour_labels))]
    map_colour_labels_to_colours = dict(zip(colour_labels_str, colours))
    print("Colour labels:\n", colour_labels)

    # Set size and colour for each points
    df[col_size] = [point_size for _ in range(df.shape[0])]  # Add size of each data point

    if col_colours not in df.columns:  # Only add colours if it is not predfined
        df[col_colours] = [map_colour_labels_to_colours[str(colour_label)[2:-2]] for colour_label in df_original[col_colour_labels]]

    # Add names of movies to dimension reduced data
    if "movie" in df_original.columns:
        df["movie"] = df_original["movie"].values

    # Plot graph
    if components == 3:
        fig = px.scatter_3d(df, x="c0", y="c1", z="c2", size=col_size, color=col_colours, custom_data=custom_data)
    else:
        if time_dependence:  # Plot 3D with x-axis as time
            df["time"] = [i for i in range(1, df.shape[0] + 1)]
            fig = px.scatter_3d(df, x="time", y="c0", z="c1", size=col_size, color=col_colours, custom_data=custom_data)
        else:
            fig = px.scatter(df, x="c0", y="c1", size=col_size, color=col_colours, custom_data=custom_data)

    # Set hover for points
    if "movie" in df_original.columns:
        fig.update_traces(
            hovertemplate="<br>".join([
                "%{customdata}",
            ])
        )

    # Set colour bar
    fig.update_coloraxes(colorbar_tickvals=colours, colorbar_ticktext=colour_labels)

    # Set text size and plot plot
    fig.update_layout(title_text=title, title_x=0.5)
    print(f"data_visualization_sources/{title}.html")
    plotly.offline.plot(fig, filename=f"data_visualization_sources/{title}.html")
    fig.show(renderer='browser')


def find_two_highest_genres(movie_genres: np.array, max_genres_per_colour_label: int=2) -> List[int]:
    """
        Finds to passed movie the indices of the two highest genres
        and returns them in a sorted list.
    """

    movie_genres_with_indices = [(genre, i) for i, genre in enumerate(movie_genres)]
    max_val, second_max_val = sorted(movie_genres_with_indices)[-max_genres_per_colour_label:]
    return sorted([max_val[1], second_max_val[1]])


def add_colour_labels_to_df_and_sort_by_colour_labels(df_original: pd.DataFrame, genre_names: List[str],
        sort_by_colour: bool=True, time_dependence: bool=False, max_genres_per_colour_label: int=2) -> pd.DataFrame:
    """
        Adds column "colour_labels" to a copy of passed DataFrame and sorts by
        this column the DataFrame. The sorting will be done alphabetically.
        The oclumn "colour_labels" contains only two genres with the highest
        probability/values of real genres of a movie.
    """

    global col_colour_labels

    df = df_original.copy()

    if time_dependence:  # Sort after time dependence = like occurence of rows in DataFrame = no sorting
        df[col_colour_labels] = list(range(0, df.shape[0]))
    else:  # Add colour labels based on "max_genres_per_colour_label" highest/likeliest genre names
        df[col_colour_labels] = [str(genre_names[find_two_highest_genres(movie,
            max_genres_per_colour_label=max_genres_per_colour_label)]) for _, movie in df.iterrows()]  # Find two highest/likeliest genres
        
        if sort_by_colour:
            df = df.sort_values(col_colour_labels)  # Sort with str by alphabet

        df[col_colour_labels] = [re.sub("'", "", label[1:-1]).split(",") for label in df[col_colour_labels]]  # Use again colour labels as list

    return df

NameError: name 'pd' is not defined

# Analyze all movies with PCA

In [7]:
# Create DataFrame containing all movies and add colour labels
df_all_movies_real_genres = pd.DataFrame(dict([(genre_name, [movie["real_genres"][i] for movie in all_movies.values()]) for i, genre_name in enumerate(genre_names)]))
df_all_movies_real_genres = add_colour_labels_to_df_and_sort_by_colour_labels(df_all_movies_real_genres, genre_names)  # Add colour labels for visualizations
df_all_movies_real_genres["movie"] = [movie["title"] for movie in all_movies.values()]
df_all_movies_real_genres

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,...,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,colour_labels,movie
945912,100.000000,93.563224,0.24812,4.339570,2.845786,0.425349,3.825607,1.227941,0.372180,0.053169,...,0.873484,0.265843,0.159506,0.513963,1.015267,3.639095,0.053169,0.248120,[Action Adventure],Blondie
543237,98.176640,95.422535,0.00000,30.879371,0.000000,0.000000,11.399248,0.000000,0.779741,0.538452,...,0.098506,0.881062,0.711439,0.098506,0.000000,0.177311,0.098506,0.000000,[Action Adventure],Star Wars
543233,77.500000,77.500000,0.00000,70.833333,0.000000,5.000000,31.666667,0.000000,0.000000,0.000000,...,0.000000,0.000000,5.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[Action Adventure],New World Disorder 9: Never Enough
469805,83.333333,77.582535,0.00000,39.190628,4.259851,0.000000,30.830671,0.000000,11.608094,0.000000,...,0.000000,2.555911,23.056443,7.241747,0.000000,6.815761,0.000000,0.000000,[Action Adventure],Sesame Street: Elmo Loves You!
821254,90.909091,78.625790,0.00000,12.400179,10.059244,0.000000,24.176957,0.000000,16.934516,0.000000,...,0.045396,3.749330,3.483438,74.927617,0.000000,0.853701,0.000000,52.941114,[Action Adventure],Sunday in August
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
526799,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western],Wave Babes
608376,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western],Barefoot in the Kitchen
923071,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western],"Lucas, the Ear of Corn"
673686,0.000000,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,[War Western],Avant Garde


In [11]:
# Check data against null values
df_all_movies_real_genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 997216 entries, 945912 to 498607
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Action           997216 non-null  float64
 1   Adventure        997216 non-null  float64
 2   Animation        997216 non-null  float64
 3   Comedy           997216 non-null  float64
 4   Crime            997216 non-null  float64
 5   Documentary      997216 non-null  float64
 6   Drama            997216 non-null  float64
 7   Family           997216 non-null  float64
 8   Fantasy          997216 non-null  float64
 9   History          997216 non-null  float64
 10  Horror           997216 non-null  float64
 11  Music            997216 non-null  float64
 12  Mystery          997216 non-null  float64
 13  Romance          997216 non-null  float64
 14  Science Fiction  997216 non-null  float64
 15  TV Movie         997216 non-null  float64
 16  Thriller         997216 non-null  floa

In [12]:
# Output a summary of all data
df_all_movies_real_genres.describe()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
count,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0,997216.0
mean,8.561787,4.556302,7.550262,24.261722,6.722033,23.063166,39.572234,4.717047,4.206436,3.300081,9.082113,7.457313,3.803111,9.459366,3.906891,4.561328,8.933358,2.00999,1.611904
std,19.593822,12.104175,23.348121,31.604919,14.659521,36.283449,38.772365,13.94602,12.268846,9.817007,22.032252,20.509554,11.357039,18.045068,12.594768,13.999744,18.969869,7.520119,8.667786
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.520833,0.347222,0.0,10.305542,1.0,2.380952,28.796658,0.190885,0.48054,0.269542,0.700432,0.481363,0.258,2.095052,0.122791,0.0,1.346822,0.0,0.0
75%,6.400087,3.857935,1.36329,34.468391,6.932611,31.771764,75.0,3.209255,3.182273,2.957602,5.555556,3.671855,3.054238,10.450283,2.284968,2.315705,8.333333,1.388889,0.128208
max,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [81]:
# Do a 2D plot
visualize_data_with_pca_or_tsne(2, df_all_movies_real_genres, use_pca=True, point_size=30, title="PCA (2D) with all movies", ignore_columns=["movie"])

Variance/Amount of left/remaining information: [0.28204341 0.13792403], lost information: 0.5800325636677031
Variance: [2187.90636077 1069.92347426]
Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Mystery', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Crime', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Horror', 'Adventure Music', 'Adventure Mystery', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Crime', 'Animation Documentary', 'Animation Drama', 'Animation Family', 'Animation Fantasy', 'Animation History', 

In [86]:
# Do a 3D plot of a part of all one million movies (plotpy cannot handle all the movies at the same time)
number_of_parts = 5
part_len = df_all_movies_real_genres.shape[0] // number_of_parts

# Plot data in parts
for i in range(number_of_parts):
    title = f"PCA (3D) with rows of data {i * part_len} - {(i + 1) * part_len}"
    print(f"\n{title}")
    visualize_data_with_pca_or_tsne(3, df_all_movies_real_genres.iloc[i * part_len:(i + 1) * part_len], use_pca=True, point_size=30, title=title, ignore_columns=["movie"])
    print()
    time.sleep(10)

# Plot from each part a little bit
title = f"From each part a little bit"
print(f"\n{title}")
visualize_data_with_pca_or_tsne(3, df_all_movies_real_genres.iloc[::number_of_parts], use_pca=True, point_size=30, title=title, ignore_columns=["movie"])
print()


PCA (3D) with rows of data 0 - 199443
Variance/Amount of left/remaining information: [0.31709074 0.16446088 0.12964409], lost information: 0.38880428806452927
Variance: [2760.1490767  1431.56673209 1128.50039631]
Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Mystery', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Crime', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Horror', 'Adventure Music', 'Adventure Mystery', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Crime', 'Animation Documentary', 'Animation Drama

# Results:
- You can see blue points in 2D and 3D geometric figures. The white colour origins because of the edge of the points. Many points result in a huge white region. So with this you can see that all movies are in one huge "group". The rease for this is that a movie with genres "Action" and "Sciene Fiction" lays near to a movie with genres "Action" and "Comedy", which lays near to a movie with "Comedy" and "Horror". This means that you can go from one movie to the next, like it's a chain.
- The first movie plot (2D) is very similiar to a rhombus, but it cannot display all the data very good, so some points are laying on top of others.
- The 3D plots have all some squared areas and some of them are ordered so taht it's similiar to a cube.
- We can also see (in the outputs of the boards) thaht during "linear dimensionality reduction" of PCA information left/gets lost. In 2D its round about 58 %, in the 3D plots its between 27 % and 47 %. The information losses are less than the ones of the 2D plot, because of remaining another dimension.

# Analyze all movies, users have watched, with PCA

In [92]:
# Read movies, user have watched from file and add colour labels
# df_user_movies = load_object_from_file(vars.user_history_file_path_with_real_genres_visualization)  # TMDB
# df_user_movies = load_object_from_file(vars.user_watchings_file_path_with_real_genres_visualization)  # Netflix prize data (full, too big to load)
df_user_movies = load_object_from_file(vars.user_watchings_file_path_with_real_genres_visualization_small)  # Netflix prize data (small)
df_user_movies = df_user_movies.loc[:, df_user_movies.columns != "username"]  # Ignore column with usernames
df_user_movies = df_user_movies.loc[:, df_user_movies.columns != "movie"]  # Ignore column with movie ID
df_user_movies = add_colour_labels_to_df_and_sort_by_colour_labels(df_user_movies, genre_names)  # Add colour labels for visualizations
df_user_movies

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,colour_labels
24166,88.412859,72.520457,2.442380,58.186347,24.545112,4.058699,65.541663,11.069975,9.758126,4.531348,20.717487,6.463598,12.999240,27.090508,14.331489,14.923432,36.694993,4.769316,64.886372,[Action Adventure]
3414,98.874832,93.083561,16.073200,40.608200,13.052851,4.370726,37.397510,28.950497,74.382950,7.557749,10.371629,3.957519,8.224235,11.818679,17.320446,3.467356,29.672616,5.332105,2.401380,[Action Adventure]
28177,97.596154,98.523831,16.791115,22.422648,5.598409,20.745390,36.219231,21.843914,12.247613,2.929119,5.665737,4.319722,4.066552,6.699539,89.264013,19.292146,14.401689,5.502297,1.062640,[Action Adventure]
4851,97.596154,98.523831,16.791115,22.422648,5.598409,20.745390,36.219231,21.843914,12.247613,2.929119,5.665737,4.319722,4.066552,6.699539,89.264013,19.292146,14.401689,5.502297,1.062640,[Action Adventure]
4849,95.688381,82.173198,3.984529,48.356159,21.338018,6.179243,58.734109,6.945279,70.286888,7.040323,14.776278,6.356652,10.355820,23.733555,12.080304,6.280200,32.553723,5.516916,6.543107,[Action Adventure]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23318,66.192932,13.009391,2.040721,28.675340,22.728118,5.554239,66.732940,10.126622,6.536333,3.614497,16.091804,4.412566,21.511331,24.418962,16.016172,99.459035,79.039600,1.404359,1.226785,[TV Movie Thriller]
25580,18.954419,10.500593,6.547468,21.736262,19.781761,3.277831,59.334658,8.950809,6.458297,4.010793,14.974671,1.996348,56.093551,55.781803,6.465052,89.942575,79.431276,0.884153,0.716388,[TV Movie Thriller]
26168,63.014894,11.587945,2.743515,24.501114,18.799460,4.097277,64.771410,16.597573,12.098330,7.518788,14.772820,4.785059,10.636574,16.450514,48.395000,81.287150,72.232485,2.850619,6.945352,[TV Movie Thriller]
26137,50.132375,8.037957,0.245586,23.615922,17.638267,2.642931,56.060998,6.898118,4.352125,1.917981,38.247858,3.272582,44.195486,20.072922,39.865225,99.610779,58.633760,1.766156,1.723706,[TV Movie Thriller]


In [93]:
# Check data against null values
df_user_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30000 entries, 24166 to 16605
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Action           30000 non-null  float64
 1   Adventure        30000 non-null  float64
 2   Animation        30000 non-null  float64
 3   Comedy           30000 non-null  float64
 4   Crime            30000 non-null  float64
 5   Documentary      30000 non-null  float64
 6   Drama            30000 non-null  float64
 7   Family           30000 non-null  float64
 8   Fantasy          30000 non-null  float64
 9   History          30000 non-null  float64
 10  Horror           30000 non-null  float64
 11  Music            30000 non-null  float64
 12  Mystery          30000 non-null  float64
 13  Romance          30000 non-null  float64
 14  Science Fiction  30000 non-null  float64
 15  TV Movie         30000 non-null  float64
 16  Thriller         30000 non-null  float64
 17  War          

In [94]:
# Output a summary of all data
df_user_movies.describe()

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,28.620482,19.421938,5.51687,52.700754,24.109134,8.902629,78.240866,9.968514,10.564521,7.971093,11.987336,7.17762,11.976622,28.521105,13.321526,8.979962,33.024064,6.43137,4.327995
std,28.472372,21.834131,14.816,29.819715,21.816067,16.13118,26.73544,14.857977,14.722149,13.210133,15.955308,12.067133,15.215159,22.285045,18.113753,8.99871,27.144867,12.244515,9.394751
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.944903,7.154696,1.322042,29.555952,10.908053,3.436635,56.159303,3.9574,4.073888,2.726257,5.166183,2.822819,5.227564,14.130649,4.331505,4.414716,14.699659,1.811677,0.831865
50%,18.097222,12.066722,2.422386,42.551514,16.609752,4.978981,96.595877,6.29524,6.450419,4.203611,7.909453,4.133648,7.78138,19.135758,7.868509,7.186019,22.523816,2.959781,1.964569
75%,30.816242,18.508686,4.125858,85.729909,23.83821,7.67321,100.0,9.467002,9.799476,6.512116,11.699991,6.454484,10.60718,32.37928,12.746033,10.998059,40.77999,5.000138,3.944812
max,100.0,100.0,100.0,100.0,98.363424,100.0,100.0,100.0,97.113277,86.060606,100.0,100.0,97.355449,100.0,100.0,100.0,100.0,89.40242,100.0


In [90]:
# Do a 2D plot
visualize_data_with_pca_or_tsne(2, df_user_movies, use_pca=True, point_size=30, title="PCA (2D) of movies, users have watched")

Variance/Amount of left/remaining information: [0.20832717 0.15741902], lost information: 0.6342538104972241
Variance: [1987.46791132 1501.79769893]
Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Crime', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Horror', 'Adventure Mystery', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Crime', 'Animation Documentary', 'Animation Drama', 'Animation Family', 'Animation Fantasy', 'Animation History', 'Animation Horror', 'Animation Music'

In [91]:
# Do a 3D plot
visualize_data_with_pca_or_tsne(3, df_user_movies, use_pca=True, point_size=30, title="PCA (3D) of movies, users have watched")

Variance/Amount of left/remaining information: [0.20832717 0.15741902 0.12388654], lost information: 0.5103672750330814
Variance: [1987.46791132 1501.79769893 1181.89345556]
Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Crime', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Horror', 'Adventure Mystery', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Crime', 'Animation Documentary', 'Animation Drama', 'Animation Family', 'Animation Fantasy', 'Animation History', 'Animation H

# Results:
- This is basically the same result as in the plot of all movies. The only difference is the amount of points/movies, which leads to a different figure in the 3D plot. The figures consists of some square ares, which cuts nad overlapps each other.
- It's again visible that the most movies belong to the genres "Action", "Adventure" or "Comedy".
The information loss is for the 2D plot 63 % and for the 3D plot 51 %.

# Analyze movie histories of each user with PCA

In [105]:
# Read movies, user have watched from file and add colour labels
# df_user_movie_histories = load_object_from_file(vars.user_history_file_path_with_real_genres)  # TMDB
df_user_movie_histories = load_object_from_file(vars.user_watchings_file_path_with_real_genres_small)  # Netflix prize data (small)
df_user_movie_histories = dict([(user_id, [real_genres for movie_id, real_genres in movies]) for user_id, movies in df_user_movie_histories.items()])
# df_user_movie_histories

In [109]:
# Find users with longest movie histories
max_users_to_show = 10
users_with_longest_movies_histories = dict(sorted(df_user_movie_histories.items(),
                                                  key=lambda item: len(item[1]))[-max_users_to_show:])  # Sort by length of movie history
users_with_longest_movies_histories

{931626: [array([ 20.08616467,  15.91780166,   2.49332836,  37.80365904,
          14.84021188,   3.58699618, 100.        ,   7.03584355,
          11.68149325,   6.33422585,  11.35353367,   3.61798753,
          11.13756206,  17.07396205,  13.76531429,   4.54443017,
          26.29574856,   2.30231183,   1.79602273]),
  array([100.        ,  89.06009764,   1.82349771,  43.28661734,
          19.64976933,   3.38082683,  53.33853256,   5.99929319,
          76.07525529,   5.20119251,   9.80984737,   2.36588968,
           7.81765965,  19.66098409,  11.99759682,   8.39473727,
          30.74020841,   4.28467283,   3.24377773]),
  array([ 20.85766398,  13.12193056,   2.32551292,  94.13461566,
          19.08434005,   4.01130208, 100.        ,  10.34510434,
           6.80088885,   4.72114127,   9.83336503,   4.61541964,
          10.30165596,  66.74480966,   9.5404483 ,   4.86106822,
          23.90101514,   3.4846088 ,   1.6920283 ]),
  array([ 21.62823025,  14.47029255,   1.5681667 ,  2

In [100]:
# View for users with many movies in their history the change/movies over time
for username, watched_movies in users_with_longest_movies_histories.items():
    df_user_history = pd.DataFrame(dict([(genre_name, [movie[i] for movie in watched_movies]) for i, genre_name in enumerate(genre_names)]))
    df_user_history = add_colour_labels_to_df_and_sort_by_colour_labels(df_user_history, genre_names, sort_by_colour=False)  # Add colour labels for visualizations

    # Plot movies in 2D
    title = f"PCA (2D) of {len(watched_movies)} movies, {username} has watched"
    print(title)
    visualize_data_with_pca_or_tsne(2, df_user_history, use_pca=True, time_dependence=True, point_size=30, title=title)
    print()
    # time.sleep(10)  # Wait for showing next user

PCA (2D) of 877 movies, 931626 has watched
Variance/Amount of left/remaining information: [0.2449204  0.16120739], lost information: 0.593872210675294
Variance: [1783.22856877 1173.72668711]
Colour labels:
 ['Comedy Drama', 'Action Adventure', 'Drama Thriller', 'Action Thriller', 'Drama Romance', 'Drama History', 'Action Comedy', 'Action Drama', 'Action Science Fiction', 'Crime Drama', 'Drama Mystery', 'Action Crime', 'Comedy Romance', 'Comedy Science Fiction', 'Drama Horror', 'Adventure Science Fiction', 'Adventure Drama', 'Adventure Fantasy', 'Adventure War', 'Animation Family', 'Adventure Comedy', 'Comedy Fantasy', 'Animation Drama', 'Drama Family', 'Comedy Horror', 'Documentary Drama', 'Documentary History', 'Comedy Crime', 'Drama TV Movie', 'Drama War', 'Comedy Western', 'Action History', 'Crime Thriller', 'Drama Music', 'Science Fiction Thriller', 'Adventure Thriller', 'Action Fantasy', 'Documentary Music', 'Drama Fantasy', 'Horror Thriller', 'Drama Western', 'Comedy Thriller', '

# Results:
Here you can see the users specific histories, what they have watched over time. This means, we can see the change of a user's interests. Some users have a change and the most of the users not. The ones without a change only probed some other genres but all in all they stayed at there genres.\
Again there is an information loss, here between 50 % and 65 %, because of 2D PCA computing and adding third dimension (time).\
If the Netflix prize data will be used, then there is still the same result (like the one with TMDB) visible.

# Analyze all movies with t-SNE

In [None]:
# Analyze with t-SNE (2D) all movies
# visualize_data_with_pca_or_tsne(2, df_all_movies_real_genres, use_pca=False, point_size=30, title="t-SNE (2D) of movies, users have watched")

In [None]:
# Analyze with t-SNE (3D) all movies
# visualize_data_with_pca_or_tsne(3, df_all_movies_real_genres, use_pca=False, point_size=30, title="t-SNE (2D) of movies, users have watched")

# Results:
If your computer has enough power, you can try this. but I haven't.

# Analyze all movies, users have watched, with t-SNE

In [106]:
# Analyze with t-SNE (2D) all movies, user have watched
visualize_data_with_pca_or_tsne(2, df_user_movies, use_pca=False, point_size=30, title="t-SNE (2D) of movies, users have watched")

Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Documentary', 'Animation Drama', 'Animation Family', 'Animation Fantasy', 'Animation Horror', 'Animation Music', 'Animation Mystery', 'Animation Romance', 'Animation Science Fiction', 'Animation TV Movie', 'Animation Thriller', 'Animation Western', 'Comedy Crime', 'Comedy Documentary', 'Comedy Drama', 'Comedy Family', 'Comedy Fantasy', 'Comedy Horror', 'Come

In [107]:
# Analyze with t-SNE (3D) all movies, user have watched
visualize_data_with_pca_or_tsne(3, df_user_movies, use_pca=False, point_size=30, title="t-SNE (3D) of movies, users have watched")

Colour labels:
 ['Action Adventure', 'Action Animation', 'Action Comedy', 'Action Crime', 'Action Documentary', 'Action Drama', 'Action Family', 'Action Fantasy', 'Action History', 'Action Horror', 'Action Music', 'Action Romance', 'Action Science Fiction', 'Action TV Movie', 'Action Thriller', 'Action War', 'Action Western', 'Adventure Animation', 'Adventure Comedy', 'Adventure Documentary', 'Adventure Drama', 'Adventure Family', 'Adventure Fantasy', 'Adventure History', 'Adventure Romance', 'Adventure Science Fiction', 'Adventure TV Movie', 'Adventure Thriller', 'Adventure War', 'Adventure Western', 'Animation Comedy', 'Animation Documentary', 'Animation Drama', 'Animation Family', 'Animation Fantasy', 'Animation Horror', 'Animation Music', 'Animation Mystery', 'Animation Romance', 'Animation Science Fiction', 'Animation TV Movie', 'Animation Thriller', 'Animation Western', 'Comedy Crime', 'Comedy Documentary', 'Comedy Drama', 'Comedy Family', 'Comedy Fantasy', 'Comedy Horror', 'Come

# Results:
t-SNE leads to better results than PCA:
- 2D-plot: A very mixed picture with many regions, but there are different clusters with different genres. t-SNE show different movie genres better than PCA.
- 3D-plot: There is a ball/solid sphere with many regions/clusters.

All in all t-SNE creates better plots, because of no information loss, but takes more time.

# Analyze movie histories of each user with t-SNE

In [110]:
# View for users with many movies in their history the change/movies over time
for username, watched_movies in users_with_longest_movies_histories.items():
    df_user_history = pd.DataFrame(dict([(genre_name, [movie[i] for movie in watched_movies]) for i, genre_name in enumerate(genre_names)]))
    df_user_history = add_colour_labels_to_df_and_sort_by_colour_labels(df_user_history, genre_names, sort_by_colour=False)  # Add colour labels for visualizations

    # Plot movies in 2D
    title = f"t-SNE (2D) of {len(watched_movies)} movies, {username} has watched"
    print(title)
    visualize_data_with_pca_or_tsne(2, df_user_history, use_pca=False, time_dependence=True, point_size=30, title=title)
    print()
    # time.sleep(10)  # Wait for showing next user

t-SNE (2D) of 877 movies, 931626 has watched
Colour labels:
 ['Comedy Drama', 'Action Adventure', 'Drama Thriller', 'Action Thriller', 'Drama Romance', 'Drama History', 'Action Comedy', 'Action Drama', 'Action Science Fiction', 'Crime Drama', 'Drama Mystery', 'Action Crime', 'Comedy Romance', 'Comedy Science Fiction', 'Drama Horror', 'Adventure Science Fiction', 'Adventure Drama', 'Adventure Fantasy', 'Adventure War', 'Animation Family', 'Adventure Comedy', 'Comedy Fantasy', 'Animation Drama', 'Drama Family', 'Comedy Horror', 'Documentary Drama', 'Documentary History', 'Comedy Crime', 'Drama TV Movie', 'Drama War', 'Comedy Western', 'Action History', 'Crime Thriller', 'Drama Music', 'Science Fiction Thriller', 'Adventure Thriller', 'Action Fantasy', 'Documentary Music', 'Drama Fantasy', 'Horror Thriller', 'Drama Western', 'Comedy Thriller', 'Animation Comedy', 'Comedy Documentary', 'Animation Fantasy', 'Crime Documentary', 'Crime Science Fiction', 'Music Romance', 'Comedy Family', 'Com

# Results:
The development/change of users interests is basically the same as visualized with PCA, but there is a difference using t-SNE:
- The points generated by t-SNE are more clustered/near to each other, so that clusters can be seen

If the Netflix prize data will be used, then there is still the same result (like the one with TMDB) visible.