In [1]:
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
pd.set_option("display.precision", 1)

from IPython.display import display
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [2]:
def onehot_encode_movies(df_movie):
    """One-hot encode the pipe-separated ``genres`` column of a movie table.

    Parameters
    ----------
    df_movie : pandas.DataFrame
        Must contain a ``genres`` column (genre names separated by ``|``) and
        a ``title`` column. The frame passed in is not modified.

    Returns
    -------
    df_movie_final : pandas.DataFrame
        The input without ``genres`` and ``title``, plus a ``year`` column
        (the four digits inside the trailing parentheses of the title, kept
        as text; NaN when the title does not end with ``(YYYY)``), followed
        by one indicator column per genre.
    df_movie_encoded : pandas.DataFrame
        Only the genre indicator columns, without ``(no genres listed)``.
    """
    # Work on a copy: the title/year assignment below used to overwrite the
    # caller's frame, so calling the function a second time on the same input
    # found no "(YYYY)" left in the titles and produced NaN years.
    df_movie = df_movie.copy()

    # One indicator column per distinct genre name.
    df_movie_encoded = df_movie['genres'].str.get_dummies('|')

    # Split "Some Title (1995)" into the bare title and the year.
    df_movie[['title', 'year']] = df_movie['title'].str.extract(r'^(.*?)\s\((\d{4})\)$')

    # Only the year is kept; the raw genres string and the title are dropped.
    df_movie = df_movie.drop(columns=["genres", "title"])

    # "(no genres listed)" is a placeholder, not a genre.
    if "(no genres listed)" in df_movie_encoded:
        df_movie_encoded = df_movie_encoded.drop("(no genres listed)", axis=1)

    # Join the indicators back onto the remaining movie columns (by index).
    df_movie_final = df_movie.join(df_movie_encoded)

    return df_movie_final, df_movie_encoded


In [3]:
def aggregate_average_user_ratings(df_user, df_movie_final, df_movie_encoded):
    """Summarise every user's ratings overall and per genre.

    The work is done with a merge and two group-bys, so there is no per-row
    loop (and therefore no progress bar). NumPy's global error settings are
    left untouched.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Ratings table; the ``userId``, ``movieId`` and ``rating`` columns are
        read. ``userId`` must be numeric. The frame is not modified.
    df_movie_final : pandas.DataFrame
        Movie table with a ``movieId`` column and one indicator column for
        each column name in ``df_movie_encoded``.
    df_movie_encoded : pandas.DataFrame
        One-hot genre frame; only its column names are used.

    Returns
    -------
    df_aggregate_rating : pandas.DataFrame
        One row per user (ascending ``userId``): ``userId`` followed by the
        sum of that user's ratings over the movies of each genre.
    df_genre_rating_count : pandas.DataFrame
        Same row order, integer-labelled columns in genre order: how many of
        the user's rated movies belong to each genre.
    user_ratings : pandas.DataFrame
        ``userId``, ``rating count`` and ``rating avg`` per user.
    df_user_final : pandas.DataFrame
        Same layout as ``df_aggregate_rating`` but holding the per-genre
        average rating (sum / count, 0 where the user rated no movie of that
        genre).

    Notes
    -----
    ``userId`` in ``df_aggregate_rating`` and ``df_user_final`` is float64,
    which is the dtype these frames have always carried.
    A rating whose ``movieId`` is absent from ``df_movie_final`` contributes
    to ``user_ratings`` but to no genre.
    """
    genre_cols = df_movie_encoded.columns.tolist()

    # Overall statistics: number of ratings and mean rating per user.
    user_ratings = df_user.groupby('userId').agg({'rating': ['count', 'mean']})
    user_ratings.columns = ['rating count', 'rating avg']
    # Turn the userId index back into an ordinary column.
    user_ratings = user_ratings.reset_index()

    # Attach the genre indicators of the rated movie to every rating.
    movie_genres = df_movie_final[['movieId'] + genre_cols]
    rated = df_user[['userId', 'movieId', 'rating']].merge(
        movie_genres, on='movieId', how='left'
    )
    # Unknown movies come back as NaN indicators; treat them as "no genre".
    flags = rated[genre_cols].fillna(0)
    # rating * (0 or 1) for each genre of each rated movie.
    weighted = flags.mul(rated['rating'], axis=0)

    # Every rating is counted, including the first one of each user; skipping
    # it makes the count too small and the per-genre average too large.
    per_user_sum = weighted.groupby(rated['userId']).sum()
    per_user_count = flags.groupby(rated['userId']).sum()

    sums = per_user_sum.to_numpy(dtype=float)
    genre_rating_count = per_user_count.to_numpy().astype(int)

    # Divide only where a count exists; the pre-filled zeros cover the rest,
    # so no divide-by-zero warning is raised and nothing has to be silenced.
    averages = np.divide(
        sums,
        genre_rating_count,
        out=np.zeros_like(sums),
        where=genre_rating_count != 0,
    )

    user_ids = per_user_sum.index.to_numpy().astype(float)

    df_aggregate_rating = pd.DataFrame(sums, columns=genre_cols)
    df_aggregate_rating.insert(0, 'userId', user_ids)

    df_user_final = pd.DataFrame(averages, columns=genre_cols)
    df_user_final.insert(0, 'userId', user_ids)

    df_genre_rating_count = pd.DataFrame(genre_rating_count)

    return df_aggregate_rating, df_genre_rating_count, user_ratings, df_user_final

In [4]:
def aggregate_average_user_ratings2(df_user, df_movie_final, df_movie_encoded, show_steps=False):
    """Row-by-row, display-heavy variant of ``aggregate_average_user_ratings``.

    Walks through the ratings one at a time with a progress bar and displays
    the intermediate frames, which makes it slow but easy to follow.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Ratings table; the ``userId``, ``movieId`` and ``rating`` columns are
        read. The frame is not modified.
    df_movie_final : pandas.DataFrame
        Movie table with a ``movieId`` column; every column from the third one
        onwards is treated as a genre indicator. The number of those columns
        is assumed to equal the number of columns of ``df_movie_encoded``.
    df_movie_encoded : pandas.DataFrame
        One-hot genre frame; only its column names are used.
    show_steps : bool, default False
        When True, additionally display the per-rating intermediate values
        inside the loop (very verbose).

    Returns
    -------
    df_aggregate_rating : pandas.DataFrame
        One row per user, in the row order of ``user_ratings``: ``userId``
        followed by the sum of the user's ratings over the movies of each
        genre.
    df_genre_rating_count : pandas.DataFrame
        Same row order, integer-labelled columns: how many of the user's rated
        movies belong to each genre.
    user_ratings : pandas.DataFrame
        ``userId``, ``rating count`` and ``rating avg`` per user.
    df_user_final : pandas.DataFrame
        Same layout as ``df_aggregate_rating`` but holding the per-genre
        average rating (0 where the user rated no movie of that genre).
    """
    df_user_copy = df_user.copy()

    display(df_user_copy)

    # Overall statistics per user, displayed after each reshaping step.
    user_ratings = df_user_copy.groupby('userId').agg({'rating': ['count', 'mean']})
    display(user_ratings)
    user_ratings.columns = ['rating count', 'rating avg']
    display(user_ratings)

    # Turn the userId index back into an ordinary column.
    user_ratings = user_ratings.reset_index()
    display(user_ratings)

    # One row per rating carrying that user's overall count and average.
    # A lookup by userId replaces the per-user .loc loop, so user ids need not
    # be the contiguous range 1..N.
    stats_by_user = user_ratings.set_index('userId')
    df_user_copy = df_user_copy.drop(columns=["movieId", "rating", "timestamp"], errors="ignore")
    df_user_copy['rating count'] = df_user_copy['userId'].map(stats_by_user['rating count']).astype(float)
    df_user_copy['rating avg'] = df_user_copy['userId'].map(stats_by_user['rating avg'])
    display(df_user_copy)

    # Preview of the target layout: the user columns plus one empty column per
    # genre. This frame is only displayed; the returned frames are built below.
    genre_cols = df_movie_encoded.columns.tolist()
    df_user_preview = df_user_copy.copy()
    for column in genre_cols:
        df_user_preview[column] = None
    display(df_user_preview)

    # Row position of every user in user_ratings; counts and sums are both
    # laid out in this order so they can be divided element-wise at the end.
    row_of_user = {user_id: pos for pos, user_id in enumerate(user_ratings['userId'].tolist())}
    genre_rating_count = np.zeros((len(row_of_user), len(genre_cols)), dtype=int)

    # Running per-genre rating sums, one single-row DataFrame per user.
    user_genre_ratings = {}

    progress_bar = tqdm(total=len(df_user), position=0, leave=True)
    try:
        # Iterate through each rating.
        for _, row in df_user.iterrows():
            userId = row['userId']
            movieId = row['movieId']
            rating = row['rating']

            # Genre indicators of the rated movie (everything after the
            # first two columns of df_movie_final).
            movie_info = df_movie_final[df_movie_final['movieId'] == movieId]
            genre_flags = movie_info.iloc[:, 2:]
            # Multiply the rating by 1 or 0 for each genre.
            genre_ratings = rating * genre_flags

            if show_steps:
                display(movie_info)
                display(userId)
                display(genre_ratings)

            # Count this rating for each of its genres. Every rating counts,
            # including the first one seen for a user; leaving it out makes
            # the per-genre averages too large.
            user_row = row_of_user[userId]
            genre_rating_count[user_row] += genre_flags.to_numpy().sum(axis=0).astype(int)

            if show_steps:
                display(genre_rating_count[user_row])

            # Add this rating to the user's running per-genre sums and
            # collapse the result back to a single row.
            if userId in user_genre_ratings:
                running = user_genre_ratings[userId].add(genre_ratings, fill_value=0)
            else:
                running = genre_ratings
            if show_steps:
                display(running)
            user_genre_ratings[userId] = running.sum().to_frame().T

            if show_steps:
                display(user_genre_ratings[userId])
                display(user_genre_ratings)

            progress_bar.update(1)
    finally:
        # Close the progress bar even if a row fails.
        progress_bar.close()

    # One frame per user, ordered like user_ratings / genre_rating_count.
    ordered = sorted(user_genre_ratings.items(), key=lambda item: row_of_user[item[0]])
    dfs = [df.assign(userId=user_id) for user_id, df in ordered]
    # Concatenate the list of DataFrames into a single DataFrame.
    df_aggregate_rating = pd.concat(dfs, ignore_index=True)
    # Reorder columns to move userId to the first position.
    df_aggregate_rating = df_aggregate_rating[['userId'] + [col for col in df_aggregate_rating.columns if col != 'userId']]
    # Fill NaN values with 0.
    df_aggregate_rating = df_aggregate_rating.fillna(0)

    # Average = sum / count; cells without any rating stay at 0. Dividing only
    # where the count is non-zero avoids warnings without changing NumPy's
    # global error settings.
    values = df_aggregate_rating.to_numpy(dtype=float, copy=True)
    sums = values[:, 1:]
    values[:, 1:] = np.divide(
        sums,
        genre_rating_count,
        out=np.zeros_like(sums),
        where=genre_rating_count != 0,
    )
    df_user_final = pd.DataFrame(values, columns=df_aggregate_rating.columns)

    df_genre_rating_count = pd.DataFrame(genre_rating_count)

    return df_aggregate_rating, df_genre_rating_count, user_ratings, df_user_final