In [None]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
import itertools
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
%matplotlib inline
# Load the movie metadata; later cells read its movieId, title and genres columns
movies = pd.read_csv('data/movie.csv')
movies.head()

In [None]:
# Load the user ratings; later cells read its userId, movieId and rating columns
ratings = pd.read_csv('data/rating.csv')
ratings.head()

In [None]:
# Function to get the genre ratings
def get_genre_ratings(ratings, movies, genres, column_names):
    """Compute every user's mean rating for each of the given genres.

    Parameters
    ----------
    ratings : DataFrame with 'userId', 'movieId' and 'rating' columns.
    movies : DataFrame with 'movieId' and 'genres' columns.
    genres : iterable of genre names, matched as literal substrings of 'genres'.
    column_names : list of output column labels, one per genre, in the same order.

    Returns
    -------
    DataFrame with one row per user and one column per genre holding the user's
    mean rating (rounded to 2 decimals) or NaN if the user rated no movie of
    that genre.
    """
    per_genre_means = []
    for genre in genres:
        # regex=False: genre names are plain text, not patterns;
        # na=False: movies without genre information never match.
        is_genre = movies['genres'].str.contains(genre, regex=False, na=False)
        genre_movie_ids = movies.loc[is_genre, 'movieId']
        genre_votes = ratings.loc[ratings['movieId'].isin(genre_movie_ids), ['userId', 'rating']]
        per_genre_means.append(genre_votes.groupby('userId')['rating'].mean().round(2))

    if not per_genre_means:
        # No genres requested: return an empty frame instead of failing in concat
        return pd.DataFrame(columns=column_names)

    # Concatenate once instead of growing the frame inside the loop
    genre_ratings = pd.concat(per_genre_means, axis=1)
    genre_ratings.columns = column_names
    return genre_ratings
# Calculate each user's average rating of Romance and of Sci-Fi movies
genre_ratings = get_genre_ratings(ratings, movies, ['Romance', 'Sci-Fi'], ['avg_romance_rating', 'avg_scifi_rating'])
genre_ratings.head()

In [None]:
# Function to get the biased dataset
def bias_genre_rating_dataset(genre_ratings, score_limit_1, score_limit_2):
    """Select users who clearly prefer one of the two genres over the other.

    A user is kept when their romance average is below ``score_limit_1 - 0.2``
    while their sci-fi average exceeds ``score_limit_2``, or when their sci-fi
    average is below ``score_limit_1`` while their romance average exceeds
    ``score_limit_2``. At most the first 300 such users are used, the first two
    rows of ``genre_ratings`` are appended, and the index is turned into a
    regular column of the returned frame.
    """
    romance = genre_ratings['avg_romance_rating']
    scifi = genre_ratings['avg_scifi_rating']

    prefers_scifi = (romance < score_limit_1 - 0.2) & (scifi > score_limit_2)
    prefers_romance = (scifi < score_limit_1) & (romance > score_limit_2)

    polarised = genre_ratings[prefers_scifi | prefers_romance]
    combined = pd.concat([polarised[:300], genre_ratings[:2]])
    # Round-trip through records so the index becomes an ordinary column
    return pd.DataFrame(combined.to_records())
# Bias the dataset using score limits 3.2 and 2.5
biased_dataset = bias_genre_rating_dataset(genre_ratings, 3.2, 2.5)
# Print the resulting number of records & the head of the dataset
print( "Number of records: ", len(biased_dataset))
biased_dataset.head()

In [None]:
# Defining the scatterplot drawing function
def draw_scatterplot(x_data, x_label, y_data, y_label):
    """Scatter ``y_data`` against ``x_data`` on a new 8x8 figure.

    Both axes are fixed to the 0-5 range and labelled with ``x_label`` and
    ``y_label``. Nothing is returned; the figure is left as the current figure.
    """
    figure, axes = plt.subplots(figsize=(8, 8))
    axes.set_xlim(0, 5)
    axes.set_ylim(0, 5)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    axes.scatter(x_data, y_data, s=30)
# Plot the per-user averages: sci-fi on the x axis, romance on the y axis
draw_scatterplot(biased_dataset['avg_scifi_rating'],'Avg scifi rating', biased_dataset['avg_romance_rating'], 'Avg romance rating')

In [None]:
# Turn the two rating columns into an array of samples
X = biased_dataset[['avg_scifi_rating','avg_romance_rating']].values
# Create an instance of KMeans to find two clusters (KMeans is already imported
# in the first cell). random_state is pinned so the assignment is the same on
# every run; n_init is set explicitly so the number of restarts does not depend
# on the installed scikit-learn's default.
kmeans_1 = KMeans(n_clusters=2, n_init=10, random_state=42)
# Use fit_predict to cluster the dataset
predictions = kmeans_1.fit_predict(X)

# Defining the cluster plotting function
def draw_clusters(biased_dataset, predictions, cmap='viridis'):
    """Scatter the users' sci-fi vs. romance averages, coloured by cluster.

    ``predictions`` supplies one cluster label per row of ``biased_dataset``
    (matched by position) and ``cmap`` is the matplotlib colormap used for the
    labels. Draws on a new 8x8 figure with both axes fixed to 0-5.
    """
    figure, axes = plt.subplots(figsize=(8, 8))
    axes.set_xlim(0, 5)
    axes.set_ylim(0, 5)
    axes.set_xlabel('Avg scifi rating')
    axes.set_ylabel('Avg romance rating')

    # Attach the labels as a 'group' column; both frames carry a fresh 0..n-1 index
    labels = pd.DataFrame({'group': predictions})
    clustered = pd.concat([biased_dataset.reset_index(), labels], axis=1)
    plt.scatter(
        clustered['avg_scifi_rating'],
        clustered['avg_romance_rating'],
        c=clustered['group'],
        s=20,
        cmap=cmap,
    )

# Plot the two-cluster assignment
draw_clusters(biased_dataset, predictions)

In [None]:
# Create an instance of KMeans to find three clusters; random_state and n_init
# are pinned so the cluster assignment is the same on every run
kmeans_2 = KMeans(n_clusters=3, n_init=10, random_state=42)
# Use fit_predict to cluster the dataset
predictions_2 = kmeans_2.fit_predict(X)
# Plot
draw_clusters(biased_dataset, predictions_2)

In [None]:
# Prune the ratings table to the most popular movies
# Count the number of ratings each movie received
movie_counts = ratings['movieId'].value_counts()

# IDs of the 5000 most-rated movies. value_counts() keeps the movie IDs in the
# index and the counts in the values, so the IDs must be read from the index;
# passing the Series itself to isin() would compare movie IDs against counts.
top_movies = movie_counts.head(5000).index

# Keep only the ratings of the most popular movies
ratings_pruned = ratings[ratings['movieId'].isin(top_movies)]


In [None]:
# Merge the two tables then pivot so we have a Users x Movies dataframe
# (rows: userId, columns: movie title, values: rating)
ratings_title = pd.merge(ratings_pruned, movies[['movieId', 'title']], on='movieId' )
user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')
# Print the number of dimensions and a subset of the dataset
print('dataset dimensions: ', user_movie_ratings.shape, '\n\nSubset example:')
user_movie_ratings.iloc[:6, :10]

In [None]:
# Define Function to get the most rated movies
def get_most_rated_movies(user_movie_ratings, max_number_of_movies):
    """Return the columns (movies) with the most non-null ratings.

    Parameters
    ----------
    user_movie_ratings : DataFrame of ratings, one row per user, one column per movie.
    max_number_of_movies : maximum number of columns to keep.

    Returns
    -------
    A new DataFrame holding the ``max_number_of_movies`` most-rated columns,
    ordered from most to least rated (ties keep their original column order).
    The rows keep the labels of the input, so users stay identifiable. The index
    name is cleared so that ``reset_index()`` on the result produces a column
    called 'index', which other cells in this notebook drop by that name.
    The input frame is not modified.
    """
    # 1- Count ratings per movie, keyed by column position so duplicate labels are safe
    ratings_per_movie = user_movie_ratings.count().reset_index(drop=True)

    # 2- Rank column positions by count, descending; mergesort keeps ties stable
    ranked_positions = ratings_per_movie.sort_values(ascending=False, kind='mergesort').index

    # 3- Slice
    top_positions = ranked_positions[:max_number_of_movies].tolist()
    most_rated_movies = user_movie_ratings.iloc[:, top_positions]

    # rename_axis builds a new index object, so the caller's index name is untouched
    return most_rated_movies.rename_axis(index=None)

def get_users_who_rate_the_most(user_movie_ratings, max_number_of_users):
    """Return the rows (users) with the most non-null ratings.

    Parameters
    ----------
    user_movie_ratings : DataFrame of ratings, one row per user, one column per movie.
    max_number_of_users : maximum number of rows to keep.

    Returns
    -------
    A new DataFrame with the ``max_number_of_users`` rows that have the most
    ratings, ordered from most to least active (ties keep their original
    order). The columns are those of the input. The input frame is not
    modified: no helper column is added to it.
    """
    # 1- Count ratings per user, keyed by row position so duplicate labels are safe
    ratings_per_user = user_movie_ratings.count(axis=1).reset_index(drop=True)

    # 2- Rank row positions by count, descending; mergesort keeps ties stable
    ranked_positions = ratings_per_user.sort_values(ascending=False, kind='mergesort').index

    # 3- Slice
    top_positions = ranked_positions[:max_number_of_users].tolist()
    return user_movie_ratings.iloc[top_positions]

# Define the sorting by rating function
def sort_by_rating_density(user_movie_ratings, n_movies, n_users):
    """Return the densest corner of the ratings table.

    First keeps the ``n_movies`` most-rated movies, then, within those, the
    ``n_users`` users with the most ratings.
    """
    densest_columns = get_most_rated_movies(user_movie_ratings, n_movies)
    return get_users_who_rate_the_most(densest_columns, n_users)

In [None]:
# sort_by_rating_density is defined in the previous cell; it is only called here
# Choose the number of movies and users and sort
n_movies = 30
n_users = 18
most_rated_movies_users_selection = sort_by_rating_density(user_movie_ratings, n_movies, n_users)
# Print the result
print('dataset dimensions: ', most_rated_movies_users_selection.shape)
most_rated_movies_users_selection.head()

In [None]:
# Define the plotting heatmap function
def draw_movies_heatmap(most_rated_movies_users_selection, axis_labels=True, ax=None):
    """Draw a users x movies rating table as a heatmap with a 0-5 star colour bar.

    Parameters
    ----------
    most_rated_movies_users_selection : DataFrame of ratings (rows are users,
        columns are movies). Column labels are used as x tick labels.
    axis_labels : when True, label the x axis with the column names (truncated
        to 40 characters, rotated 90 degrees, shown on top); when False, hide
        the x axis. The y axis is hidden in both cases.
    ax : optional matplotlib Axes to draw on. When omitted, a new 15x4 figure
        is created, which is the original behaviour.

    The figure is not shown here, so callers can still add a title before
    calling ``plt.show()``.
    """
    if ax is None:
        fig = plt.figure(figsize=(15,4))
        ax = plt.gca()
    else:
        fig = ax.figure

    # Draw heatmap; colours are pinned to the 0-5 rating scale
    heatmap = ax.imshow(most_rated_movies_users_selection,  interpolation='nearest', vmin=0, vmax=5, aspect='auto')
    if axis_labels:
        ax.set_yticks(np.arange(most_rated_movies_users_selection.shape[0]) , minor=False)
        ax.set_xticks(np.arange(most_rated_movies_users_selection.shape[1]) , minor=False)
        ax.invert_yaxis()
        ax.xaxis.tick_top()
        # str() first, so non-string column labels do not break the truncation
        labels = [str(column)[:40] for column in most_rated_movies_users_selection.columns]
        ax.set_xticklabels(labels, minor=False)
        plt.setp(ax.get_xticklabels(), rotation=90)
    else:
        ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    ax.grid(False)
    ax.set_ylabel('User id')

    # Separate heatmap from color bar
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="5%", pad=0.05)

    # Color bar
    cbar = fig.colorbar(heatmap, ticks=[5, 4, 3, 2, 1, 0], cax=cax)
    cbar.ax.set_yticklabels(['5 stars', '4 stars','3 stars','2 stars','1 stars','0 stars'])
# Draw the heatmap, then show it (the figure must exist before plt.show())
draw_movies_heatmap(most_rated_movies_users_selection)
plt.show()

In [None]:
# Rebuild the users x movies pivot and keep the 1000 most-rated movies
user_movie_ratings =  pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')
print(user_movie_ratings.shape)
most_rated_movies_1k = get_most_rated_movies(user_movie_ratings, 1000)

In [None]:
# Check what kind of object get_most_rated_movies returned
type(most_rated_movies_1k)

In [None]:
# Display the table restricted to the 1000 most-rated movies
most_rated_movies_1k

In [None]:
# Conversion to sparse csr matrix
# (an earlier version went through pd.SparseDataFrame(...).to_coo(); the frame
# is now passed to csr_matrix directly)
sparse_ratings = csr_matrix(most_rated_movies_1k)

In [None]:
# Show the summary of the sparse matrix
sparse_ratings

In [None]:
# Impute NaN values with the mean of their column (one column per movie)
imputer = SimpleImputer(strategy='mean')
sparse_ratings_imputed = imputer.fit_transform(sparse_ratings)

In [None]:
print(sparse_ratings_imputed) # NaN was replaced with the mean rating of the given movie

In [None]:
class KMeansClustering:
    """Minimal NumPy implementation of k-means clustering.

    Parameters
    ----------
    X : 2-D array (samples x features); only its shape is read here.
    num_clusters : number of clusters K.
    max_iterations : upper bound on centroid updates (default 100).
    random_state : optional seed for the centroid initialisation. When None,
        NumPy's global random state is used, so ``np.random.seed`` still applies.
    """

    def __init__(self, X, num_clusters, max_iterations=100, random_state=None):
        self.K = num_clusters
        self.max_iterations = max_iterations
        self.num_examples = X.shape[0]
        self.num_features = X.shape[1]
        # np.random (module) and RandomState both expose .choice()
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)

    def initialize_random_centroids(self, X):
        """Pick K rows of X as starting centroids; returns a (K, num_features) float array."""
        # Sample without replacement whenever possible so that two centroids
        # cannot start on the same row (which would leave one cluster empty).
        with_replacement = self.K > self.num_examples
        chosen_rows = self._rng.choice(self.num_examples, size=self.K, replace=with_replacement)
        return np.asarray(X)[chosen_rows].astype(float)

    def create_clusters(self, X, centroids):
        """Assign every row of X to its nearest centroid.

        Returns a list of K lists; list k holds the row indices assigned to
        centroid k. Ties go to the lowest centroid index.
        """
        X = np.asarray(X)
        # Squared Euclidean distances, one centroid at a time so the temporary
        # stays (n_samples, n_features) instead of (n_samples, K, n_features).
        # The square root is skipped because it does not change the argmin.
        squared_distances = np.stack(
            [((X - centroid) ** 2).sum(axis=1) for centroid in centroids], axis=1
        )
        nearest = squared_distances.argmin(axis=1)
        return [np.flatnonzero(nearest == k).tolist() for k in range(self.K)]

    def calculate_new_centroids(self, clusters, X, previous_centroids=None):
        """Return the mean of each cluster's rows as a (K, num_features) array.

        An empty cluster has no mean. It keeps its entry from
        ``previous_centroids`` when that is given, and is otherwise re-seeded
        with a random row of X, so the result never contains NaN because of an
        empty cluster.
        """
        X = np.asarray(X)
        centroids = np.zeros((self.K, self.num_features))
        for idx, cluster in enumerate(clusters):
            if len(cluster) > 0:
                centroids[idx] = X[cluster].mean(axis=0)
            elif previous_centroids is not None:
                centroids[idx] = previous_centroids[idx]
            else:
                centroids[idx] = X[self._rng.choice(X.shape[0])]

        return centroids

    def predict_cluster(self, clusters, X):
        """Turn the per-cluster index lists into one label per sample (float array)."""
        y_pred = np.zeros(self.num_examples)

        for cluster_idx, cluster in enumerate(clusters):
            y_pred[np.asarray(cluster, dtype=int)] = cluster_idx

        return y_pred

    def plot_fig(self, X, y):
        """Scatter the first two features of X, coloured by label y."""
        plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.Spectral)
        plt.show()

    def fit(self, X):
        """Run k-means on X and return the cluster label of every row.

        Iterates until the centroids stop changing or ``max_iterations``
        updates have been made.
        """
        X = np.asarray(X)
        centroids = self.initialize_random_centroids(X)
        clusters = self.create_clusters(X, centroids)

        for _ in range(self.max_iterations):
            previous_centroids = centroids
            centroids = self.calculate_new_centroids(clusters, X, previous_centroids)

            if np.array_equal(centroids, previous_centroids):
                print("Termination criterion satisfied")
                break

            clusters = self.create_clusters(X, centroids)

        # Get label predictions
        return self.predict_cluster(clusters, X)

In [None]:
# Keep the 10,000 users with the most ratings among the 1,000 most-rated movies
most_rated_movies_1k_small = get_users_who_rate_the_most(most_rated_movies_1k, 10000)
# Replace every missing rating with the mean rating of its column (movie)
imputer = SimpleImputer(strategy='mean')
small_imputed = imputer.fit_transform(most_rated_movies_1k_small)
# Wrap the imputed values in a frame again, restoring the movie titles as column labels
small = pd.DataFrame(small_imputed, columns=most_rated_movies_1k_small.columns)
selected_rows = small.index[:10000]
selected_columns = small.columns[:1000]

small = small.loc[selected_rows, selected_columns]

small


In [None]:
# Cluster the imputed ratings into 7 groups with the custom KMeansClustering class
num_clusters = 7
X = small.values
Kmeans = KMeansClustering(X, num_clusters)
y_pred = Kmeans.fit(X)
y_pred

In [None]:
# Add a new column to small holding the assigned groups
small['group'] = y_pred
small

In [None]:
# Function that draws a heatmap for every group
def draw_heatmaps_for_groups(clustered, n_movies, n_users):
    """Draw one heatmap per distinct value of ``clustered['group']``.

    For every group, the 'group' column is dropped and the first ``n_users``
    rows and first ``n_movies`` columns are drawn with ``draw_movies_heatmap``.
    """
    for group in clustered['group'].unique():
        # Rows of this group, without the label column
        members = clustered[clustered['group'] == group].drop(columns=['group'])

        # Top-left window of the table: first n_users rows, first n_movies columns
        window = members.head(n_users).iloc[:, :n_movies]

        # Draw the heatmap
        draw_movies_heatmap(window)
        plt.title(f'Heatmap dla grupy {group}')
        plt.show()

# Use the function: up to 40 movies and 30 users per group
draw_heatmaps_for_groups(small, 40, 30)


In [None]:
# 12 clusters (labels 0-11). random_state and n_init are pinned so that the group
# numbers referenced in later cells stay the same from run to run.
predictions = KMeans(n_clusters=12, n_init=10, random_state=42).fit_predict(sparse_ratings_imputed)

In [None]:
def draw_movie_clusters(clustered, max_users, max_movies):
    """Draw one heatmap per cluster found in ``clustered['group']``.

    Parameters
    ----------
    clustered : DataFrame of ratings with a 'group' column of cluster labels
        (an 'index' column left over from ``reset_index()`` is ignored if present).
    max_users : maximum number of rows (users) drawn per cluster.
    max_movies : maximum number of columns (movies) drawn per cluster.

    Each cluster gets its own figure, because ``draw_movies_heatmap`` creates
    the figure it draws on.
    """
    for group_idx, group in clustered.groupby('group'):
        selection = (
            group.drop(columns=['group', 'index'], errors='ignore')
            .reset_index(drop=True)
            .iloc[:max_users, :max_movies]
        )
        draw_movies_heatmap(selection)
        # axes[0] is the heatmap itself; the colour bar axes are added after it
        plt.gcf().axes[0].set_title(f"Cluster {group_idx}")
        plt.show()

In [None]:
# Select the max number of users and movies shown per cluster heatmap
max_users = 70
max_movies = 50
# Attach the cluster label of every user as a 'group' column
# (reset_index() also adds the previous index as a column)
clustered = pd.concat([most_rated_movies_1k.reset_index(), pd.DataFrame({'group':predictions})], axis=1)

In [None]:
# Display the ratings table with its 'group' column
clustered

In [None]:
# Group 8 contains 309 people (a count from the author's run)
group8 = clustered[clustered["group"] == 8]

In [None]:
# Displaying everything at once is not particularly readable.
draw_movies_heatmap(group8)

In [None]:
# Restrict to the users who gave the most ratings and the movies that were rated most often in this group
wybrane = group8.drop(columns = ['group', 'index'])
wybrane = sort_by_rating_density(wybrane, 20, 50)

In [None]:
# Heatmap of the dense subset of group 8 selected above
draw_movies_heatmap(wybrane)

## Wnioski z otrzymanej heatmapy
Podobieństwo kolorów w kolumnach wskazuje na podobieństwo gustów użytkowników. Widać wyraźnie, że użytkownikom z grupy 8 nie podobał się film "Independence Day", za to wszystkim wyraźnie podobał się film "Pulp Fiction".
Porównajmy to jeszcze z inną grupą.

In [None]:
# Same dense-subset heatmap for group 12.
# NOTE(review): the clustering cell above uses n_clusters=12, so the labels
# presumably run 0-11 and this selection would be empty — confirm the intended
# group number.
group12 = clustered[clustered["group"] == 12]
wybrane12 = group12.drop(columns = ['group', 'index'])
wybrane12 = sort_by_rating_density(wybrane12, 20, 50)
draw_movies_heatmap(wybrane12)

In [None]:
# Same dense-subset heatmap for group 14.
# NOTE(review): the clustering cell above uses n_clusters=12, so the labels
# presumably run 0-11 and this selection would be empty — confirm the intended
# group number.
group14 = clustered[clustered["group"] == 14]
wybrane14 = group14.drop(columns = ['group', 'index'])
wybrane14 = sort_by_rating_density(wybrane14, 20, 50)
draw_movies_heatmap(wybrane14)

## Wnioski z trzech heatmap
Grupa 14 zdaje się być grupą użytkowników, którzy generalnie lepiej oceniają obejrzane filmy. "Independence Day" to słaby film.

In [None]:
# Variant of sort_by_rating_density in which every heatmap gets its columns in the
# same order, which makes differences between groups easier to spot

def sort_by_rating_density_changed(user_movie_ratings, n_movies, n_users, popularne_filmy):
    """Keep the columns of ``popularne_filmy`` (in its order) and the ``n_users`` most active users.

    ``n_movies`` is accepted for signature compatibility but not used: the
    column selection comes entirely from ``popularne_filmy.columns``.
    """
    shared_columns = popularne_filmy.columns
    # Work on a copy of the column subset
    popular_only = user_movie_ratings[shared_columns].copy()
    return get_users_who_rate_the_most(popular_only, n_users)

In [None]:
# Function that draws a heatmap for every group
def draw_heatmaps_for_groups(clustered, n_movies, n_users, most_rated_movies_1k):
    """Draw one heatmap per group, all sharing the same movie columns.

    Parameters
    ----------
    clustered : DataFrame of ratings with 'group' and 'index' columns.
    n_movies : number of most-rated movies (taken from ``most_rated_movies_1k``)
        used as the common columns of every heatmap.
    n_users : maximum number of users drawn per group.
    most_rated_movies_1k : ratings table from which the common movie columns
        are chosen.
    """
    unique_groups = clustered['group'].unique()

    # The same movies, in the same order, for every group
    popularne_filmy = get_most_rated_movies(most_rated_movies_1k, n_movies)

    for group in unique_groups:
        # Select the group
        group_data = clustered[clustered['group'] == group].copy()

        # Drop the helper columns and keep the densest rows
        selected_data = group_data.drop(columns=['group', 'index'])
        selected_data = sort_by_rating_density_changed(selected_data, n_movies, n_users, popularne_filmy)

        # Draw the heatmap
        draw_movies_heatmap(selected_data)
        plt.title(f'Heatmap dla grupy {group}')
        plt.show()


# Use the function: 40 shared movies, up to 70 users per group
draw_heatmaps_for_groups(clustered, 40, 70, most_rated_movies_1k)


In [None]:
def get_mean_ratings_for_groups(clustered, n_movies, n_users, most_rated_movies_1k):
    """Summarise every group by its mean rating of the most-rated movies.

    Returns a DataFrame with one row per distinct value of
    ``clustered['group']`` (in order of first appearance), one column per movie
    among the ``n_movies`` most-rated movies of ``most_rated_movies_1k``, and a
    final 'user_count' column with the number of rows in the group.
    ``n_users`` is accepted but not used.
    """
    group_labels = clustered['group'].unique()
    popular_titles = get_most_rated_movies(most_rated_movies_1k, n_movies).columns

    per_group_means = []
    per_group_sizes = []
    for label in group_labels:
        # Rows of this group without the helper columns
        members = clustered[clustered['group'] == label].drop(columns=['group', 'index'])

        # Mean rating of each selected movie within the group
        per_group_means.append(members[popular_titles].mean())

        # One row per user, so the row count is the group size
        per_group_sizes.append(members.shape[0])

    # One row per group, then the group sizes as an extra column
    summary = pd.DataFrame(per_group_means, index=group_labels)
    summary['user_count'] = per_group_sizes

    return summary

# Use the function
mean_ratings_df = get_mean_ratings_for_groups(clustered, 40, 70, most_rated_movies_1k)
# Keep only the groups with more than 20 users
mean_ratings_df = mean_ratings_df[mean_ratings_df['user_count'] > 20]
mean_ratings_df


In [None]:
# Show the per-group means as a heatmap
# NOTE(review): mean_ratings_df still contains the 'user_count' column, which is
# drawn as if it were a movie — confirm whether it should be dropped first.
draw_movies_heatmap(mean_ratings_df)

In [None]:
# Prune the ratings table to the most popular movies
# Count the number of ratings each movie received
movie_counts = ratings['movieId'].value_counts()

# IDs of the 5000 most-rated movies. value_counts() keeps the movie IDs in the
# index and the counts in the values, so the IDs must be read from the index;
# passing the Series itself to isin() would compare movie IDs against counts.
top_movies = movie_counts.head(5000).index

# Keep only the ratings of the most popular movies
ratings_pruned = ratings[ratings['movieId'].isin(top_movies)]

# Merge the two tables then pivot so we have Users X Movies dataframe
ratings_title = pd.merge(ratings_pruned, movies[['movieId', 'title']], on='movieId' )
user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')
most_rated_movies_1k = get_most_rated_movies(user_movie_ratings, 1000)

# Keep the 10,000 most active users and fill missing ratings with the movie's mean
most_rated_movies_1k_small = get_users_who_rate_the_most(most_rated_movies_1k, 10000)
imputer = SimpleImputer(strategy='mean')
small_imputed = imputer.fit_transform(most_rated_movies_1k_small)

small = pd.DataFrame(small_imputed, columns=most_rated_movies_1k_small.columns)
selected_rows = small.index[:10000]
selected_columns = small.columns[:1000]

small = small.loc[selected_rows, selected_columns]

num_clusters = 7
X = small.values

# This is where the clustering itself happens
Kmeans = KMeansClustering(X, num_clusters)
y_pred = Kmeans.fit(X)

# Add a new column to small holding the assigned groups
small['group'] = y_pred

small
# Next, the group of a chosen user has to be determined

In [None]:
# Take the user with id = 0.
# `small` has no user-id column at this point (its first column is a movie), but
# its rows are in the same order as most_rated_movies_1k_small, so the user is
# located through that frame's index labels.
user_row = small[most_rated_movies_1k_small.index == 0]
grupa_zerowego =  user_row['group'].iloc[0]
grupa_zerowego

In [None]:
user_id = 0
# `small` has no 'title' column: its columns are the movie titles plus 'group'.
# Its rows line up with most_rated_movies_1k_small, whose index identifies the users.
group = small.loc[most_rated_movies_1k_small.index == user_id, 'group'].iloc[0]
group

In [None]:
# Inspect the (imputed) ratings column of a single movie
small['Jurassic Park (1993)']

In [None]:
# Prune the ratings table to the most popular movies
# Count the number of ratings each movie received
movie_counts = ratings['movieId'].value_counts()

# IDs of the 5000 most-rated movies. value_counts() keeps the movie IDs in the
# index and the counts in the values, so the IDs must be read from the index;
# passing the Series itself to isin() would compare movie IDs against counts.
top_movies = movie_counts.head(5000).index

# Keep only the ratings of the most popular movies
ratings_pruned = ratings[ratings['movieId'].isin(top_movies)]

# Merge the two tables then pivot so we have Users X Movies dataframe
ratings_title = pd.merge(ratings_pruned, movies[['movieId', 'title']], on='movieId' )
user_movie_ratings = pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')
most_rated_movies_1k = get_most_rated_movies(user_movie_ratings, 1000)

# Keep the 10,000 most active users and fill missing ratings with the movie's mean
most_rated_movies_1k_small = get_users_who_rate_the_most(most_rated_movies_1k, 10000)

imputer = SimpleImputer(strategy='mean')
small_imputed = imputer.fit_transform(most_rated_movies_1k_small)

small = pd.DataFrame(small_imputed, columns=most_rated_movies_1k_small.columns)
selected_rows = small.index[:10000]
selected_columns = small.columns[:1000]

small = small.loc[selected_rows, selected_columns]

num_clusters = 7
X = small.values

# This is where the clustering itself happens
Kmeans = KMeansClustering(X, num_clusters)
y_pred = Kmeans.fit(X)

# Put the user identifier and the assigned group in front of the rating columns
small.insert(0, 'userId', most_rated_movies_1k_small.index)
small.insert(1, 'group', y_pred)

small

In [None]:
# Now the group of the chosen user has to be determined
user_id = 902
user_row = small.loc[small['userId'] == user_id]

# Read the value of the 'group' column for the chosen user
# (raises IndexError if no row has this userId)
user_group = user_row['group'].values[0]
user_group

In [None]:
# All rows of one cluster, without the 'group' column (the 'userId' column is kept)
# NOTE(review): cluster_number is hard-coded rather than taken from user_group — confirm
cluster_number = 1
cluster = small[small.group == cluster_number].drop(['group'], axis=1)
cluster

In [None]:
# Get all this user's ratings
# NOTE(review): .loc looks user_id up among the index labels of
# most_rated_movies_1k_small — confirm those labels are real user ids.
# NOTE(review): `cluster` was built for cluster_number = 1 in the previous cell,
# not for this user's own group — confirm.
user_id = 23
user_2_ratings  = most_rated_movies_1k_small.loc[user_id, :]

# Which movies did they not rate? 
user_2_unrated_movies =  user_2_ratings[user_2_ratings.isnull()]
# What are the ratings of these movies the user did not rate?
# The inner join keeps only the labels present in both series; column 0 is
# presumably the unnamed cluster.mean() series — verify.
avg_ratings = pd.concat([user_2_unrated_movies, cluster.mean()], axis=1, join='inner').loc[:,0]
# Let's sort by rating so the highest rated movies are presented first
avg_ratings.sort_values(ascending=False)[:20]