# Analytics Module
The Analytics module provides descriptive statistics on content data, evidence data, and model evaluations.

In [1]:
# reloads modules automatically before entering the execution of code
%load_ext autoreload
%autoreload 2

# third parties imports
import numpy as np 
import pandas as pd
# -- add new imports here --

# local imports
from constants import Constant as C
from loaders import load_ratings
from loaders import load_items

ModuleNotFoundError: No module named 'constants'

# 1 - Content analytics
Explore and perform descriptive statistics on content data

In [None]:
# -- load the items and display the Dataframe
# load_items() is imported from the project's `loaders` module in the first cell.
df_items = load_items()
# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump that print() produces.
df_items.head()

In [None]:
# -- display relevant information that can be extracted from the dataset
# Films are counted as the distinct index labels of the items frame.
n_films = df_items.index.nunique()
# NOTE(review): this assumes the items frame exposes a 'userId' column --
# confirm against load_items(). The evidence-analytics cell further down
# reassigns n_users from df_ratings.
n_users = df_items['userId'].nunique()

# One print call, one summary line per statistic.
print(f'Number of films: {n_films}', f'Number of users: {n_users}', sep='\n')

# 2 - Evidence analytics
Explore and perform descriptive statistics on evidence data

In [None]:
# -- load the ratings and display the Dataframe
# load_ratings() is imported from the project's `loaders` module in the first cell.
df_ratings = load_ratings()
# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump that print() produces.
df_ratings.head()

In [None]:
# -- display relevant information that can be extracted from the dataset
# Count the ratings per movie once and reuse the result for both extremes.
ratings_per_movie = df_ratings["movieId"].value_counts()

n_ratings = df_ratings["rating"].count()  # count() skips missing ratings
n_users = df_ratings["userId"].nunique()
n_ratings_films_max = ratings_per_movie.max()
n_ratings_films_min = ratings_per_movie.min()
# Despite the n_ prefix, this is the sorted list of distinct rating values.
n_ratings_films_possible = sorted(df_ratings["rating"].unique())
# Distinct items-index labels minus distinct movieIds present in the ratings.
n_films_not_rated = df_items.index.nunique() - df_ratings["movieId"].nunique()

# n_films is defined in the content-analytics cell (distinct items-index
# labels), so that cell must have been run before this one.
print(f"(a) Total number of ratings : {n_ratings}")
print(f"(b) Total number of unique users : {n_users}")
print(f"(c) Total number of unique movies : {n_films}")
print(f"(d) Number of ratings for the most rated movie : {n_ratings_films_max}")
print(f"(e) Number of ratings for the less rated movie : {n_ratings_films_min}")
print(f"(f) All possible rating values : {n_ratings_films_possible}")
print(f"(g) Number of movies that were not rated at all : {n_films_not_rated}")

# 3 - Plot and sparsity matrix 

In [None]:
#6. Long-tail property
import matplotlib.pyplot as plt

# Number of ratings received by each movie, ordered from most to least rated.
rating_counts = df_ratings["movieId"].value_counts()
rating_counts_sorted = rating_counts.sort_values(ascending=False)

# Rank 1 is the most rated movie; the curve's long flat tail is the point
# of the figure.
movie_ranks = range(1, len(rating_counts_sorted) + 1)

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(movie_ranks, rating_counts_sorted.values)
ax.set_xlabel('Rank of movies (from most rated to least rated)')
ax.set_ylabel('Number of ratings')
ax.set_title('Distribution of ratings per movie (Long-tail property)')
ax.grid(True)
plt.show()

In [None]:
#7. Ratings matrix sparsity
# Sparsity = 1 - (number of ratings) / (number of users x number of films),
# i.e. the share of (user, film) cells that hold no rating.
# NOTE(review): n_films is set in the content-analytics cell from the items
# index, whereas the matrix X built below only has one column per movieId
# present in df_ratings -- confirm which denominator is intended.
sparsity = 1.0 - (n_ratings / (n_users * n_films))
# Source: https://www.jillcates.com/pydata-workshop/html/tutorial.html
def create_X(df):
    """
    Generates a sparse user x movie ratings matrix from a ratings dataframe.

    Row i of the matrix corresponds to the i-th smallest user id and column j
    to the j-th smallest movie id found in ``df``.

    Args:
        df: pandas dataframe containing 3 columns (userId, movieId, rating)

    Returns:
        A 5-tuple, in this order:
        X: scipy CSR sparse matrix of shape (n_unique_users, n_unique_movies)
        user_mapper: dict that maps user id's to user indices
        movie_mapper: dict that maps movie id's to movie indices
        user_inv_mapper: dict that maps user indices to user id's
        movie_inv_mapper: dict that maps movie indices to movie id's
    """
    # Imported locally so the function is self-contained: no visible import
    # cell of the notebook brings csr_matrix into scope.
    from scipy.sparse import csr_matrix

    # Sorted distinct ids, computed once and reused for every lookup table.
    user_ids = np.unique(df["userId"])
    movie_ids = np.unique(df["movieId"])

    M = df['userId'].nunique()
    N = df['movieId'].nunique()

    user_mapper = dict(zip(user_ids, range(M)))
    movie_mapper = dict(zip(movie_ids, range(N)))

    user_inv_mapper = dict(zip(range(M), user_ids))
    movie_inv_mapper = dict(zip(range(N), movie_ids))

    # Translate every rating row into its (row, column) position in X.
    user_index = [user_mapper[i] for i in df['userId']]
    item_index = [movie_mapper[i] for i in df['movieId']]

    X = csr_matrix((df["rating"].to_numpy(), (user_index, item_index)), shape=(M, N))

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the sparse ratings matrix and its id <-> index lookup tables.
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_X(df_ratings)

# Only the top-left 100 x 100 corner of X is drawn (first 100 user rows,
# first 100 movie columns); each marker is a non-zero entry.
fig, ax = plt.subplots(figsize=(8, 8))
ax.spy(X[:100, :100], markersize=1)
ax.set_title("Sparse Matrix (100 users x 100 movies)")
ax.set_xlabel("Movies")
ax.set_ylabel("Users")
plt.show()

# `sparsity` is computed at the top of this cell from the dataset-wide counts.
print(f"Sparsity of the ratings matrix: {sparsity:.2%}")