In [219]:
import pandas as pd
import numpy as np
import math

In [221]:
def read_train_user_data(file_path):
    """
    Reads user-movie rating data from a file and creates a matrix of ratings.

    Parameters:
        file_path (str): The path to the data file containing user, movie, and ratings.

    Returns:
        tuple: A tuple containing the user-movie ratings matrix and the maximum user and movie IDs.
    """
    max_user_id = 0
    max_movie_id = 0

    # Open the file and determine the maximum user and movie IDs
    with open(file_path, 'r') as file:
        for line in file:
            user_id, movie_id, rating = map(int, line.split())
            if user_id > max_user_id:
                max_user_id = user_id
            if movie_id > max_movie_id:
                max_movie_id = movie_id

    # Create a matrix with dimensions based on the highest user and movie IDs
    user_movie_matrix = np.zeros((max_user_id, max_movie_id))

    # Fill the matrix with ratings from the file
    with open(file_path, 'r') as file:
        for line in file:
            user_id, movie_id, rating = map(int, line.split())
            user_movie_matrix[user_id - 1, movie_id - 1] = rating

    return user_movie_matrix, max_user_id, max_movie_id

train_file_path = "train.txt"
user_movie_matrix, max_user_id, max_movie_id = read_train_user_data(train_file_path)

# Convert the numpy matrix to a DataFrame for easier manipulation
train_ratings_dataframe = pd.DataFrame(user_movie_matrix, columns=[str(i) for i in range(user_movie_matrix.shape[1])])


In [223]:
train_ratings_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,5.0,3.0,0.0,3.0,3.0,5.0,0.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
197,4.0,0.0,0.0,3.0,0.0,2.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
198,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [225]:
# Convert the DataFrame 'df' to a NumPy array for faster operations and easier handling in computations.
train_arr = train_ratings_dataframe.to_numpy()

# Print the converted array to verify its content and structure.
print(train_arr)

# Print the shape of the array to understand its dimensions, which indicates the number of users (rows) and movies (columns).
print(train_arr.shape)


[[5. 3. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [4. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [5. 4. 0. ... 0. 0. 0.]]
(200, 1000)


In [227]:
# Read the test data from text files using pandas
# The data is assumed to be space-separated without a header row
test5 = pd.read_csv("test5.txt", sep=" ", header=None)  # Load data for test scenario with 5 ratings per user
test10 = pd.read_csv("test10.txt", sep=" ", header=None)  # Load data for test scenario with 10 ratings per user
test20 = pd.read_csv("test20.txt", sep=" ", header=None)  # Load data for test scenario with 20 ratings per user


In [229]:
# Display the first few rows of the test5 dataset to inspect its structure and data
print(test5.head())
print('\n')

# Display the first few rows of the test10 dataset to inspect its structure and data
print(test10.head())
print('\n')

# Display the first few rows of the test20 dataset to inspect its structure and data
print(test20.head())


     0    1  2
0  201  237  4
1  201  268  5
2  201  306  5
3  201  331  5
4  201  934  5


     0    1  2
0  301    1  3
1  301  172  3
2  301  173  2
3  301  191  2
4  301  242  3


     0    1  2
0  401   77  2
1  401   89  3
2  401  124  3
3  401  137  4
4  401  185  5


In [231]:
def build_user_rating_dictionary(exist_ratings, test_file):
    """
    Creates a user-rating matrix for new users with given ratings, filling unspecified ratings with zeros.

    Parameters:
        exist_ratings (int): The expected number of ratings each user has, used to validate data completeness.
        test_file (DataFrame): The DataFrame containing user ratings data, expected to have columns for user ID, movie ID, and rating.

    Returns:
        dict: A dictionary where each key is a user ID and the value is a list of 1000 ratings (with unrated items as zeros).
    """
    test_users = {}

    # Iterate through each row in the DataFrame
    for index, row in test_file.iterrows():
        # Skip processing if the rating is zero
        if row[2] == 0:
            continue;

        user = row[0]
        movie_index = row[1] - 1 # Convert movie_id to zero-based index for list positioning
        rating = row[2]

        # If the user hasn't been added to the dictionary, add them with a default list of 1000 zeros
        if user not in test_users:
            test_users[user] = [0] * 1000

        # Update the rating for the specific movie index
        test_users[user][movie_index] = rating

    return test_users       


In [374]:
# Create matrices for different sets of test users by processing their ratings files.
# This function is called for each test dataset with a specified number of ratings to process.

# Process test dataset 'test5', which includes users who have rated exactly 5 movies.
test_5_user_dict = build_user_rating_dictionary(5, test5) 

# Process test dataset 'test10', which includes users who have rated exactly 10 movies.
test_10_user_dict = build_user_rating_dictionary(10, test10)

# Process test dataset 'test20', which includes users who have rated exactly 20 movies.
test_20_user_dict = build_user_rating_dictionary(20, test20)

In [376]:
print(test_5_user_dict)

{201: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 

In [610]:
def cos_sim(v1, v2):
    """
    Calculate the cosine similarity between two vectors, filtering out zero entries.

    Parameters:
        v1 (array-like): First vector of ratings.
        v2 (array-like): Second vector of ratings.

    Returns:
        float: The cosine similarity between the two vectors, ignoring entries where either is zero.
    """
    """
    # Convert input to numpy arrays of type float if they aren't already
    v1 = np.array(v1, dtype=float) if not isinstance(v1, np.ndarray) else v1
    v2 = np.array(v2, dtype=float) if not isinstance(v2, np.ndarray) else v2
    
    # Create a mask that identifies positions where both vectors have non-zero entries
    mask = (v1 != 0) & (v2 != 0)
    v1_filtered = v1[mask]
    v2_filtered = v2[mask]

    # Return zero similarity if no corresponding non-zero entries exist
    if len(v1_filtered) == 0:
        return 0

    # Calculate the dot product of the filtered vectors
    numerator = np.dot(v1_filtered, v2_filtered)
    # Calculate the norm (magnitude) of the filtered vectors
    denominator = np.linalg.norm(v1_filtered) * np.linalg.norm(v2_filtered)

    # Prevent division by zero if norms are zero
    if denominator == 0:
        return 0

    # Compute cosine similarity
    sim = numerator / denominator
    return sim
    """
    v1_new = []
    v2_new = []
    for index in range(len(v1)):
        if v1[index] != 0 and v2[index] != 0:
            v1_new.append(v1[index])
            v2_new.append(v2[index])
    #No similar dimenions between users, set sim==0
    if(len(v1_new) == 0 and len(v2_new) == 0):
        sim = 0
        return sim
    """
    #Only 1 dimension that is similar between the two users
    #set sim==inverse euclidean distance
    if(len(v1_new) == 1 and len(v2_new) == 1):
        euclidean_distance = np.linalg.norm(v1_new[0]-v2_new[0])
        sim = 1/(1+euclidean_distance)
        return sim

    """

    
    numerator = np.dot(v1_new,v2_new)
    v1_norm = np.sqrt(np.sum(np.square(v1_new)))
    v2_norm = np.sqrt(np.sum(np.square(v2_new)))
    #calculate the cosine similarity 
    sim = numerator/(v1_norm * v2_norm)
    
    return sim


In [757]:
def create_cos_sim_matrix(users_dict, train_arr):
    """
    Constructs a matrix of cosine similarity scores between a set of active users and all users in the training set.
    
    Parameters:
        users_arr (dict): Dictionary of active users where keys are user IDs and values are rating vectors.
        train_arr (list): List of rating vectors for all users in the training dataset.
    
    Returns:
        list: A 2D list (matrix) where each row represents the cosine similarity scores of an active user with all training users.
    """
    matrix = []  # Initialize an empty list to store the cosine similarity rows
    vals = []
    count = 0

    # Loop through each user in the active user array
    """
    for key in users_dict:
        # Compute cosine similarity for the current user against all users in the training dataset
        # Use list comprehension for efficiency and clarity
        vals = [cos_sim(users_dict[key], row) for row in train_arr]
        # Append the list of cosine similarities for the current user to the matrix
        matrix.append(vals)
    """
    for key in users_dict:
        for row in train_arr:
            cos_sim_value = cos_sim(users_dict[key],row)
            vals.append(cos_sim_value)
            count += 1
            if count == 200:
                matrix.append(vals)
                vals = []
                count = 0

    return matrix  # Return the complete matrix of cosine similarities


In [382]:
def print_matrix_dimensions(matrix):
    print("column: " + str(len(matrix)))
    print("row: " + str(len(matrix[0])) + "\n")
    

In [384]:
cos_sim_matrix_5 = create_cos_sim_matrix(test_5_user_dict, train_arr)
cos_sim_matrix_10 = create_cos_sim_matrix(test_10_user_dict, train_arr)
cos_sim_matrix_20 = create_cos_sim_matrix(test_20_user_dict, train_arr)


In [385]:
print(cos_sim_matrix_5, "\n")
print(cos_sim_matrix_10, '\n')
print(cos_sim_matrix_20, '\n')

[[0.9570244044334736, 0.9938837346736188, 0.9899494936611665, 0, 0, 1.0, 0.9374252720097652, 0, 0, 0, 1.0, 0, 0.9553254290337613, 0, 0.9826204412918687, 1.0, 1.0, 1.0, 1.0, 1.0, 0, 0, 0, 1.0, 0, 1.0, 0, 0, 1.0, 0, 1.0, 1.0, 0, 0, 0, 1.0, 0, 0, 1.0, 1.0, 0, 0.9938837346736188, 1.0, 1.0, 0.9079593845004517, 0, 0.9999999999999998, 1.0, 1.0, 1.0, 0, 1.0, 0, 1.0, 0, 1.0, 1.0, 1.0, 1.0, 0, 1.0, 1.0, 0.994936676326182, 1.0, 0, 1.0, 0, 1.0, 0.9909924304103233, 0, 0, 0, 1.0, 1.0, 1.0, 0, 1.0, 1.0, 0.9999999999999999, 1.0, 1.0, 0, 0, 1.0, 0.9995120760870788, 0, 0, 0, 1.0, 0.9938837346736188, 1.0, 0.9437014308415509, 0, 1.0, 1.0, 0, 0, 0, 0.9385638197228472, 1.0, 1.0, 0, 0, 0.994936676326182, 1.0, 0, 1.0, 1.0, 1.0, 0, 0, 0.9938837346736188, 0.9995120760870788, 0, 1.0, 0.9684959969581862, 1.0, 0, 0.9938837346736188, 1.0, 1.0, 0, 0, 0, 0, 0, 1.0, 0.9682773237093576, 0.9486832980505138, 0.9677620764234547, 0, 0, 0, 0, 0, 1.0, 1.0, 0, 1.0, 1.0, 1.0, 0, 1.0, 1.0, 0.832250378576479, 1.0, 0, 0, 1.0, 1.0

In [386]:
import heapq

def top_k_sim(matrix, k=200):
    """
    This function retrieves the indices of the top k highest cosine similarities 
    for each user's list of similarities in the given matrix.
    
    Parameters:
        matrix (list of lists): A matrix where each sublist represents cosine 
                                similarity scores between one user and all other users.
        k (int): The number of top elements to retrieve for each user's similarity list.
    
    Returns:
        list of lists: Each sublist contains indices of the top k highest values 
                       from each user's similarity list.
    """
    max_index = []  # List to hold the top k indices for each user

    # Iterate through each user's similarity list in the matrix
    for users in matrix:
        # Use heapq.nlargest to find the k largest values' indices in descending order
        top_k_indices = heapq.nlargest(k, range(len(users)), key=users.__getitem__)
        
        max_index.append(top_k_indices)  # Append the top k indices to the main list

    return max_index  # Return the list of lists containing top k indices

# Implementation
top_k_5 = top_k_sim(cos_sim_matrix_5)
top_k_10 = top_k_sim(cos_sim_matrix_10)
top_k_20 = top_k_sim(cos_sim_matrix_20)


In [387]:
print(top_k_5[0], "\n")
print(top_k_10[0], "\n")
print(top_k_20[0], "\n")

[5, 10, 15, 16, 17, 18, 19, 23, 25, 28, 30, 31, 35, 38, 39, 42, 43, 47, 48, 49, 51, 53, 55, 56, 57, 58, 60, 61, 63, 65, 67, 72, 73, 74, 76, 77, 79, 80, 83, 88, 90, 93, 94, 99, 100, 104, 106, 107, 108, 114, 116, 119, 120, 126, 135, 136, 138, 139, 140, 142, 143, 145, 148, 149, 151, 154, 158, 159, 161, 166, 168, 173, 176, 177, 178, 181, 182, 183, 184, 187, 190, 193, 195, 197, 198, 78, 46, 84, 112, 62, 103, 172, 1, 41, 89, 111, 118, 68, 175, 2, 170, 185, 14, 163, 115, 127, 192, 129, 0, 12, 128, 188, 91, 98, 6, 156, 44, 180, 144, 3, 4, 7, 8, 9, 11, 13, 20, 21, 22, 24, 26, 27, 29, 32, 33, 34, 36, 37, 40, 45, 50, 52, 54, 59, 64, 66, 69, 70, 71, 75, 81, 82, 85, 86, 87, 92, 95, 96, 97, 101, 102, 105, 109, 110, 113, 117, 121, 122, 123, 124, 125, 130, 131, 132, 133, 134, 137, 141, 146, 147, 150, 152, 153, 155, 157, 160, 162, 164, 165, 167, 169, 171, 174, 179, 186, 189, 191, 194, 196, 199] 

[19, 65, 123, 2, 3, 18, 20, 26, 28, 30, 32, 36, 37, 38, 44, 46, 49, 51, 52, 60, 66, 73, 74, 80, 86, 87, 88,

In [388]:
def weighted_rating_cos_sim(test_file, shift_index, top_k, cos_sim_matrix, ratings_matrix, default_k, sim_threshold=0.6):
    """
    Predict ratings for movies with a current rating of zero by using weighted cosine similarity.
    
    Parameters:
        test_file (DataFrame): Contains data on user, movie, and existing ratings.
        shift_index (int): Adjusts user ID to match the index in the cosine similarity matrix.
        top_k (list of lists): Each list contains the indices of the top similar users for each user.
        cos_sim_matrix (numpy array): Contains cosine similarity scores between users.
        ratings_matrix (numpy array): Contains ratings given by users to movies.
        default_k (int): Specifies the number of similar users to consider for making predictions.
        sim_threshold (float): Cosine similarity threshold above which users are considered for predictions.

    Returns:
        list: Each item is a list [user, movie, predicted_rating], where predictions are made for zero ratings.
    """
    output = []

    # Iterate over each row in the test dataset to process movies with zero ratings
    for index, row in test_file.iterrows():
        # Check if the current rating is zero
        if row[2] == 0:
            numerator = 0
            denominator = 0
            temp_counter = 0
            user_id = row[0] - shift_index  # Adjust user index
            movie_id = row[1] - 1  # Convert movie ID to zero-based index

            # Process each similar user found in the top_k list for the current user
            for ind in top_k[user_id]:
                cos_sim = cos_sim_matrix[user_id][ind]
                rating = ratings_matrix[ind][movie_id]

                # Only consider non-zero ratings and cosine similarities above the threshold
                if rating != 0 and cos_sim > sim_threshold:
                    temp_counter += 1
                    numerator += (cos_sim * rating)  # Weighted sum of ratings
                    denominator += cos_sim  # Sum of weights

                    # Limit the number of similar users considered to default_k
                    if temp_counter == default_k:
                        break

            # Calculate the final predicted rating
            if numerator != 0 and denominator != 0:
                final_rating = numerator / denominator
                final_rating = int(round(final_rating, 0))
                output.append([row[0], row[1], final_rating])
            else:
                output.append([row[0], row[1], 3])  # Default rating if insufficient data

            # Debugging output to monitor the number of similar users considered
            if temp_counter >= 95:
                print(f"User {row[0]} has {temp_counter} similar users considered.")

    return output


In [394]:
output_5 = weighted_rating_cos_sim(test5, 201, top_k_5, cos_sim_matrix_5, train_arr, 200)

In [395]:
output_10 = weighted_rating_cos_sim(test10, 301, top_k_10, cos_sim_matrix_10, train_arr, 100)

User 320 has 97 similar users considered.
User 345 has 95 similar users considered.


In [396]:
output_20 = weighted_rating_cos_sim(test20, 401, top_k_20, cos_sim_matrix_20, train_arr, 100)

User 407 has 96 similar users considered.
User 407 has 98 similar users considered.
User 411 has 97 similar users considered.
User 414 has 95 similar users considered.
User 423 has 95 similar users considered.
User 424 has 98 similar users considered.
User 430 has 97 similar users considered.
User 439 has 98 similar users considered.
User 440 has 97 similar users considered.
User 444 has 97 similar users considered.
User 447 has 95 similar users considered.
User 456 has 100 similar users considered.
User 465 has 95 similar users considered.
User 466 has 96 similar users considered.
User 469 has 96 similar users considered.
User 473 has 95 similar users considered.
User 498 has 95 similar users considered.


In [398]:
def convert_list_to_str(line):
    """
    Converts a list of elements into a space-separated string.

    Parameters:
        line (list): The list to be converted to a string.

    Returns:
        string (str): A string representation of the list, with elements separated by spaces and without brackets or commas.
    """
    # Convert the list to a string and remove the leading and trailing brackets
    string = str(line)[1:-1]
    # Replace commas with nothing to leave only spaces between elements
    string = string.replace(',', '')
    return string


In [400]:
def write_to_file(output, output_file):
    """
    Writes the formatted output to a specified file.
    
    Parameters:
        output (list): A list of lists, where each sublist is a line to be written to the file.
        output_file (str): The path to the file where the output will be written.
        
    Description:
        This function takes each sublist in the output, converts it to a string using the
        convert_list_to_str function, and writes each string as a new line in the specified file.
    """
    with open(output_file, 'w') as f:  # Open file in write mode
        for line in output:  # Iterate over each line in the output list
            f.write(convert_list_to_str(line))  # Convert the list to a string and write to file
            f.write('\n')  # Add a newline character after each line for proper formatting


In [401]:
write_to_file(output_5, 'cos_sim_result_5_output.txt')
write_to_file(output_10, 'cos_sim_result_10_output.txt')
write_to_file(output_20, 'cos_sim_result_20_output.txt')


In [402]:
"""
Pearson Correlation Coefficient Calculation

Description:
    This section of the code is responsible for computing the Pearson Correlation Coefficient 
    between different sets of data. Pearson Correlation measures the linear relationship between 
    two variables or datasets, providing insights into the strength and direction of their association.

Functions:
    - calculate_pearson_coefficient(data1, data2): Calculates and returns the Pearson Correlation 
      coefficient between two input datasets.
    - apply_pearson_formula(x, y): Applies the Pearson formula to compute the correlation using 
      standard deviation and mean of the inputs.

Usage:
    These functions are typically used in statistical analysis to evaluate correlation which helps 
    in feature selection, data analysis, and predictive modeling in machine learning and data science.

Example:
    Given two user rating vectors for the same movies, the Pearson function can determine how similarly 
    these two users rate movies, which is crucial in recommendation systems.
"""


'\nPearson Correlation Coefficient Calculation\n\nDescription:\n    This section of the code is responsible for computing the Pearson Correlation Coefficient \n    between different sets of data. Pearson Correlation measures the linear relationship between \n    two variables or datasets, providing insights into the strength and direction of their association.\n\nFunctions:\n    - calculate_pearson_coefficient(data1, data2): Calculates and returns the Pearson Correlation \n      coefficient between two input datasets.\n    - apply_pearson_formula(x, y): Applies the Pearson formula to compute the correlation using \n      standard deviation and mean of the inputs.\n\nUsage:\n    These functions are typically used in statistical analysis to evaluate correlation which helps \n    in feature selection, data analysis, and predictive modeling in machine learning and data science.\n\nExample:\n    Given two user rating vectors for the same movies, the Pearson function can determine how simila

In [408]:
def calculate_pearson_adjusted_ratings_for_train(user_ratings_matrix):
    """
    Adjusts user ratings by subtracting the mean rating of each user from their ratings.

    Parameters:
        user_ratings_matrix (list of lists): Matrix where each list contains ratings given by a user.

    Returns:
        tuple: A tuple containing the adjusted ratings matrix and a list of mean ratings for each user.
    """
    adjusted_ratings_matrix = []  # Stores the adjusted ratings matrix after mean normalization
    train_user_means = []  # Stores the mean rating for each user

    # Iterate over each user's ratings in the matrix
    for user_ratings in user_ratings_matrix:
        # Extract non-zero ratings to calculate the mean
        non_zero_ratings = [rating for rating in user_ratings if rating != 0]
        if non_zero_ratings:
            mean_rating = sum(non_zero_ratings) / len(non_zero_ratings)
        else:
            mean_rating = 0  # Default mean rating is 0 if there are no non-zero ratings
        
        # Append the mean rating for the current user to the list
        train_user_means.append(mean_rating)
        
        # Adjust ratings by subtracting the mean rating from each non-zero rating
        adjusted_ratings = [(rating - mean_rating) if rating != 0 else 0 for rating in user_ratings]
        adjusted_ratings_matrix.append(adjusted_ratings)

    return adjusted_ratings_matrix, train_user_means


In [410]:
pearson_train_arr, pearson_train_means = calculate_pearson_adjusted_ratings_for_train(train_arr)

print("Row of pearson_train_arr: " + str(len(pearson_train_arr)))
print("Column of pearson_train_arr: " + str(len(pearson_train_arr[0])))
print("Length of train_pearson_means: " + str(len(pearson_train_means)))

Row of pearson_train_arr: 200
Column of pearson_train_arr: 1000
Length of train_pearson_means: 200


In [751]:
def calculate_pearson_adjusted_ratings_for_test(test_user_ratings_dict):
    """
    adjusted_test_ratings_matrix = []
    test_mean_ratings = []

    for user_id, user_ratings in test_user_ratings_dict.items():
        user_ratings_np = np.array(user_ratings)
        non_zero_mask = user_ratings_np != 0
        non_zero_ratings = user_ratings_np[non_zero_mask]

        if non_zero_ratings.size > 0:
            mean_rating = np.mean(non_zero_ratings)
        else:
            mean_rating = 0

        test_mean_ratings.append(mean_rating)

        adjusted_ratings = np.where(non_zero_mask, user_ratings_np - mean_rating, 0)
        adjusted_test_ratings_matrix.append(adjusted_ratings)

        # Update test user ratings dict
        test_user_ratings_dict[user_id] = adjusted_ratings


    return adjusted_test_ratings_matrix, test_mean_ratings
    """
    adjusted_test_ratings_matrix = []
    test_user_means = []

    for user_id, user_ratings in test_user_ratings_dict.items():
        non_zero_ratings = [rating for rating in user_ratings if rating != 0]
        if non_zero_ratings:
            mean_rating = sum(non_zero_ratings) / len(non_zero_ratings)
        else:
            mean_rating = 0
        
        test_user_means.append(mean_rating)
        
        adjusted_ratings = [(rating - mean_rating) if rating != 0 else 0 for rating in user_ratings]
        adjusted_test_ratings_matrix.append(adjusted_ratings)
        
        # Update test user ratings dict
        test_user_ratings_dict[user_id] = adjusted_ratings

    return adjusted_test_ratings_matrix, test_user_means
    

In [753]:
pearson_test_5 = test_5_user_dict.copy()
pearson_test_10 = test_10_user_dict.copy()
pearson_test_20 = test_20_user_dict.copy()

pearson_test_5_arr, pearson_test_5_means = calculate_pearson_adjusted_ratings_for_test(pearson_test_5)
pearson_test_10_arr, pearson_test_10_means = calculate_pearson_adjusted_ratings_for_test(pearson_test_10)
pearson_test_20_arr, pearson_test_20_means = calculate_pearson_adjusted_ratings_for_test(pearson_test_20)

In [759]:
pearson_matrix_test_5 = create_cos_sim_matrix(pearson_test_5, pearson_train_arr)
pearson_matrix_test_10 = create_cos_sim_matrix(pearson_test_10, pearson_train_arr)
pearson_matrix_test_20 = create_cos_sim_matrix(pearson_test_20, pearson_train_arr)

In [761]:
print(pearson_matrix_test_5)

[[0.8832792199852482, -0.5144957554275263, 0.8371662000158873, 0, 0, -1.0, -0.8852231639223185, 0, 0, 0, -0.09307041363593903, 0, -0.7561645013152865, 0, 0.27784410485078115, -1.0, 1.0, 1.0, -1.0, 1.0, 0, 0, 0, 1.0, 0, -1.0, 0, 0, 1.0, 0, -0.9999999999999999, 1.0, 0, 0, 0, -1.0, 0, 0, -1.0, 1.0, 0, -0.5144957554275263, -1.0, 1.0, -0.39872611141445014, 0, 0.9999999999999999, 1.0, 1.0, 1.0, 0, 1.0, 0, 0.08450232778878462, 0, -1.0, -1.0, 0.19458851485113338, 1.0, 0, -1.0, 1.0, 0.27216552697590846, -1.0, 0, -1.0, 0, -1.0, 0.7516256890704235, 0, 0, 0, -1.0, -1.0, 1.0, 0, 1.0, -1.0, 0.9999999999999999, -1.0, -1.0, 0, 0, -1.0, 0.924406524103778, 0, 0, 0, 1.0, 0.5144957554275262, 1.0, -0.4820657112104792, 0, 1.0, 1.0, 0, 0, 0, -0.9929546911750412, -1.0, -1.0, 0, 0, -0.27216552697590835, 1.0, 0, 1.0, 1.0, -1.0, 0, 0, 0.875387052861874, 0.9582506206705671, 0, 1.0, 0.5170926890922268, 0.2718091136800729, 0, -0.5144957554275263, 1.0, -1.0, 0, 0, 0, 0, 0, -1.0, -0.6300408568675976, -0.9157882087939

In [763]:
pearson_top_k_test_5 = top_k_sim(pearson_matrix_test_5)
pearson_top_k_test_10 = top_k_sim(pearson_matrix_test_10)
pearson_top_k_test_20 = top_k_sim(pearson_matrix_test_20)

In [765]:
print_matrix_dimensions(pearson_top_k_test_5)
print_matrix_dimensions(pearson_top_k_test_10)
print_matrix_dimensions(pearson_top_k_test_20)

column: 100
row: 200

column: 100
row: 200

column: 100
row: 200



In [767]:
print(pearson_top_k_test_5)

[[16, 17, 19, 23, 28, 31, 39, 43, 47, 48, 49, 51, 58, 61, 74, 76, 88, 90, 93, 94, 104, 106, 107, 114, 119, 135, 136, 138, 139, 142, 145, 148, 149, 154, 158, 159, 168, 181, 184, 187, 190, 193, 195, 197, 198, 46, 78, 163, 112, 84, 0, 111, 185, 2, 68, 175, 115, 89, 14, 62, 116, 57, 172, 53, 170, 3, 4, 7, 8, 9, 11, 13, 20, 21, 22, 24, 26, 27, 29, 32, 33, 34, 36, 37, 40, 45, 50, 52, 54, 59, 64, 66, 69, 70, 71, 75, 81, 82, 85, 86, 87, 92, 95, 96, 97, 101, 102, 105, 109, 110, 113, 117, 121, 122, 123, 124, 125, 130, 131, 132, 133, 134, 137, 141, 146, 147, 150, 152, 153, 155, 157, 160, 162, 164, 165, 167, 169, 171, 174, 179, 186, 189, 191, 194, 196, 199, 173, 10, 156, 180, 103, 44, 91, 1, 41, 118, 127, 188, 144, 12, 129, 6, 128, 98, 192, 30, 5, 15, 18, 25, 35, 38, 42, 55, 56, 60, 63, 65, 67, 72, 73, 77, 79, 80, 83, 99, 100, 108, 120, 126, 140, 143, 151, 161, 166, 176, 177, 178, 182, 183], [1, 14, 23, 25, 28, 33, 39, 46, 48, 65, 75, 98, 116, 129, 141, 146, 154, 161, 169, 170, 172, 185, 187, 194,

In [769]:
def weighted_rating_pearson_correlation(test_data, shift_index, pearson_top_k, cos_sim_matrix, test_user_means, rating_matrix, train_user_means, k):
    """
    Calculates weighted ratings using Pearson correlation, considering only the top k similar users.

    Parameters:
        test_data (DataFrame): Test dataset with user IDs, movie IDs, and ratings (some are zero).
        shift_index (int): Offset to align user IDs between test and training datasets.
        pearson_top_k (list): List of indices of the top k similar users for each test user.
        cos_sim_matrix (list): Matrix of cosine similarities between users.
        test_user_means (list): Mean ratings for each test user.
        train_user_means (list): Mean ratings for each training user.
        ratings_matrix (np.ndarray): Matrix containing the ratings from training dataset.
        k (int): Number of similar users to consider for the prediction.

    Returns:
        list: Predicted ratings for movies that were unrated in the test dataset.
    """

    output = []

    # Iterate through each row in the test dataset to predict ratings
    for index, row in test_data.iterrows():
        if row[2] == 0: # Only predict ratings for movies that are currently unrated
            active_user_id = row[0] - shift_index
            movie_id = row[1] - 1
            numerator = 0
            denominator = 0
            counter = 0

            # print(f"Active user ID: {active_user_id}, test_user_means keys: {test_user_means.keys()}")

            test_mean_val = test_user_means[active_user_id]

            # Iterate through the most similar users based on top_k indices
            for ind in pearson_top_k[active_user_id]:
                
                rating = rating_matrix[ind][movie_id]
                mean_val_train_user = train_user_means[ind]
                
                cos_sim = cos_sim_matrix[active_user_id][ind]

                if rating != 0 and cos_sim > 0: # Consider only non-zero ratings and positive similarities.
                    numerator += cos_sim * (rating - mean_val_train_user)
                    denominator += abs(cos_sim)
                    counter += 1

                    if counter == k: # Stop if the number of top similar users considered reaches k
                        break

            # Calculate the predicted rating using weighted average, adjusted by the user's mean.
            if denominator != 0:
                final_rate = test_mean_val + (numerator/ denominator)
                final_rate = round(final_rate)
                final_rate = max(min(final_rate, 5), 1) # Ensure the final rating is within 1 to 5.
                output.append([row[0], row[1], final_rate])
            else:
                output.append([row[0], row[1], 3]) # Default rating if insufficient data.

    return output

In [771]:
pearson_5 = weighted_rating_pearson_correlation(test5, 201, pearson_top_k_test_5, pearson_matrix_test_5, pearson_test_5_means, train_arr, pearson_train_means,100)
pearson_10 = weighted_rating_pearson_correlation(test10, 301, pearson_top_k_test_10, pearson_matrix_test_10, pearson_test_10_means, train_arr, pearson_train_means, 100)
pearson_20 = weighted_rating_pearson_correlation(test20, 401, pearson_top_k_test_20, pearson_matrix_test_20, pearson_test_20_means, train_arr, pearson_train_means, 100)

In [772]:
write_to_file(pearson_5,'pearson_reasult_5_output.txt')
write_to_file(pearson_10,'pearson_reasult_10_output.txt')
write_to_file(pearson_20,'pearson_reasult_20_output.txt')

In [773]:
"""
Pearson with IUF

"""

'\nPearson with IUF\n\n'

In [774]:
def cal_count_train_arr_non_zero(train_arr):
    """
    Calculates the number of non-zero ratings for each movie across all users.
    
    Parameters:
        ratings_matrix (list of lists): 2D list where each sublist represents user ratings for movies.
    
    Returns:
        list: A list where each element is the count of non-zero ratings for corresponding movie.
    """
    movie_ratings_count = [0] * len(train_arr[0])# Initialize count array based on number of movies
    # Iterate over each user's ratings
    for user_rating in train_arr:
        # Iterate over each movie rating
        for movie, rating in enumerate(user_rating):
            if rating != 0:
                movie_ratings_count[movie] += 1

    print('Length of non zero ratings movie count: ' + str(len(movie_ratings_count)))
    
    return movie_ratings_count


In [775]:
def cal_iuf_train_arr(movie_non_zero_ratings_count):
    """
    Calculate the Inverse User Frequency (IUF) for each movie.

    Parameters:
        movie_non_zero_ratings_count (list): A list where each element is the count of non-zero ratings for a movie.

    Returns:
        list: A list of IUF values for each movie.
    """
    iuf_for_train_arr = []
    for i in range(len(movie_non_zero_ratings_count)):
        if movie_non_zero_ratings_count[i] != 0:
            iuf_rating = math.log10(200/ movie_non_zero_ratings_count[i]) # Calculate IUF using log base 10
        else:
            iuf_rating = 1 # Assign a default IUF of 1 when no ratings exist to avoid division by zero
        iuf_for_train_arr.append(iuf_rating)

    print('Length of iuf_training_array: ' + str(len(iuf_for_train_arr)))
    return iuf_for_train_arr
    

In [776]:
def applied_iuf_train_matrix(train_arr, iuf_train_vals):
    """
    Applies Inverse User Frequency (IUF) adjustments to the training data matrix.

    Parameters:
        train_arr (list of lists): 2D list where each sublist represents user ratings for movies.
        iuf_training_vals (list): List of IUF values for each movie based on non-zero ratings counts.

    Returns:
        list of lists: Adjusted training data matrix with ratings scaled by corresponding IUF values.
    """
    iuf_adjusted_train_arr = train_arr.copy() # Copy the original data to avoid modifying it directly
    # Iterate through each user's ratings
    for user_index, user_ratings in enumerate(iuf_adjusted_train_arr):
        # Iterate through each movie rating
        for movie, rating in enumerate(user_ratings):
            if rating != 0: # Only adjust non-zero ratings
                # Multiply the rating by the IUF value for the movie
                iuf_adjusted_train_data[user_index][movie] *= iuf_train_vals[movie]
    
    return iuf_adjusted_train_arr


In [783]:
def final_iuf_train_arr():
    """
    Creates and returns the final training matrix adjusted with Inverse User Frequency (IUF) values.

    Returns:
        list of lists: The adjusted training matrix where each user's ratings are scaled by IUF.
    """
    movie_non_zero_ratings_count = cal_count_train_arr_non_zero(train_arr)
    iuf_for_train_arr = cal_iuf_train_arr(movie_non_zero_ratings_count)
    train_data = train_arr.copy()
    # train_data = train_data.astype(float)

    iuf_adjusted_train_arr = applied_iuf_train_matrix(train_data, iuf_for_train_arr)
    print('Rows of final_iuf_train_data: ' + str(len(iuf_adjusted_train_arr)))
    print('Cols of final_iuf_train_data: ' + str(len(iuf_adjusted_train_arr[0])))

    print(iuf_for_train_arr)
    print(iuf_adjusted_train_arr)
    
    return iuf_adjusted_train_arr  # Return the adjusted matrix instead of an undefined variable

iuf_adjusted_train_arr = final_iuf_train_arr()  # Calling the function to get the IUF adjusted matrix

Length of non zero ratings movie count: 1000
Length of iuf_training_array: 1000
Rows of final_iuf_train_data: 200
Cols of final_iuf_train_data: 1000
[0.4034029043735398, 1.0705810742857074, 1.1249387366083, 0.8696662315049939, 1.2218487496163564, 1.3467874862246563, 0.4034029043735398, 0.7569619513137056, 0.585026652029182, 1.1870866433571445, 0.6777807052660807, 0.6478174818886375, 0.6575773191777938, 0.7958800173440752, 0.5528419686577808, 1.6020599913279623, 1.1870866433571445, 1.6020599913279623, 1.2596373105057561, 1.1870866433571445, 1.0222763947111522, 0.6478174818886375, 0.8386319977650251, 0.7958800173440752, 0.4749551929631548, 1.3010299956639813, 1.3467874862246563, 0.6575773191777938, 1.0705810742857074, 1.5228787452803376, 0.9030899869919435, 1.0, 1.0, 1.8239087409443189, 2.0, 2.0, 2.0, 0.958607314841775, 1.1870866433571445, 1.3010299956639813, 1.4559319556497243, 0.958607314841775, 1.5228787452803376, 1.1870866433571445, 1.1549019599857433, 1.6989700043360187, 0.978810700

In [785]:
iuf_pearson_adjusted_train_arr, iuf_pearson_adjusted_train_means = calculate_pearson_adjusted_ratings_for_train(iuf_adjusted_train_arr)

print("The rows of iuf_train_pearson_arr: " + str(len(iuf_pearson_adjusted_train_arr)))
print("The cols of iuf_train_pearson_arr: " + str(len(iuf_pearson_adjusted_train_arr[0])))
print("The length of iuf_train_pearson_means: " + str(len(iuf_pearson_adjusted_train_means)))

print(iuf_adjusted_train_arr)
print(iuf_pearson_adjusted_train_arr)

The rows of iuf_train_pearson_arr: 200
The cols of iuf_train_pearson_arr: 1000
The length of iuf_train_pearson_means: 200
[[5. 3. 0. ... 0. 0. 0.]
 [4. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [4. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [5. 4. 0. ... 0. 0. 0.]]
[[1.421276595744681, -0.5787234042553191, 0, -0.5787234042553191, -0.5787234042553191, 1.421276595744681, 0, -2.578723404255319, 1.421276595744681, -0.5787234042553191, -1.578723404255319, 1.421276595744681, 0, 1.421276595744681, 0, 1.421276595744681, -0.5787234042553191, 0, 0, 0.4212765957446809, -2.578723404255319, 0.4212765957446809, 0.4212765957446809, -0.5787234042553191, 0.4212765957446809, -0.5787234042553191, -1.578723404255319, 0, 0, -0.5787234042553191, -0.5787234042553191, 1.421276595744681, 0.4212765957446809, -1.578723404255319, -2.578723404255319, -1.578723404255319, -1.578723404255319, -0.5787234042553191, 0.4212765957446809, -0.5787234042553191, -1.578723404255319, 0, 0, 1.421276595744681, 1.42127

In [787]:
def cal_count_test_non_zero(test_user_arr):
    """
    Calculate the number of non-zero ratings for each movie across all test users.

    Parameters:
        test_user_arr (dict): A dictionary where each key is a user ID and the value is a list of ratings.
    
    Returns:
        list: A list where each element is the count of non-zero ratings for corresponding movie.
    """
    # Initialize a list of zeros for counting ratings per movie, assuming 1000 movies
    test_movie_ratings_count = [0] * 1000
    # Iterate over each user's ratings
    for user in test_user_arr:
        # Check each rating for each movie
        for movie, rating in enumerate(test_user_arr[user]):
            if rating != 0:
                test_movie_ratings_count[movie] += 1
    
    return test_movie_ratings_count


In [789]:
# test_5_movie_ratings_count = cal_count_test_non_zero(iuf_pearson_test_5_arr)
# test_10_movie_ratings_count = cal_count_test_non_zero(iuf_pearson_test_10_arr)
# 20_movie_ratings_count = cal_count_test_non_zero(iuf_pearson_test_20_arr)test_

In [791]:
# print('Length of test_5_movie_ratings_count: ' + str(len(test_5_movie_ratings_count)))
# print('Length of test_10_movie_ratings_count: ' + str(len(test_10_movie_ratings_count)))
# print('Length of test_20_movie_ratings_count: ' + str(len(test_20_movie_ratings_count)))
      

In [793]:
def cal_iuf_test_arr(test_movie_ratings_count):
    """
    Calculates the Inverse User Frequency (IUF) for each movie based on test data.

    Parameters:
        test_movie_ratings_count (list): A list where each element represents the count of non-zero ratings for a movie.
    
    Returns:
        list: A list of IUF values for each movie.
    """
    iuf_test_arr = []
    # Iterate over the count of non-zero ratings for each movie
    for count in test_movie_ratings_count:
        # Calculate IUF using the formula: log10(total_users / number_of_users_rated)
        if count != 0:
            iuf_rating = math.log10(200/ count)
        else:
            # If no ratings, set IUF to 1 (to handle division by zero)
            iuf_rating = 1

        iuf_test_arr.append(iuf_rating)

    return iuf_test_arr
        

In [795]:
def applied_iuf_to_test_data(test_user_dict, iuf_for_test_arr):
    """
    Applies Inverse User Frequency (IUF) adjustments to the ratings data for each user in the test dataset.

    Parameters:
        test_user_dict (dict): Dictionary where keys are user IDs and values are lists of ratings.
        iuf_for_test_arr (list): List of IUF values calculated for each movie.
    
    Description:
        This function multiplies each non-zero rating by its corresponding IUF value to adjust the ratings based
        on the frequency of ratings across all users, focusing on movies that are less commonly rated.
    """
    applied_iuf_test_user_dict = {}
    for user_id, ratings in test_user_dict.items():
        # Adjust ratings by multiplying by the IUF value if the rating is non-zero
        adjusted_ratings = [rating * iuf_for_test_arr[movie_index] if rating != 0 else 0 
                           for movie_index, rating in enumerate(ratings)]
        applied_iuf_test_user_dict[user_id] = adjusted_ratings

    return applied_iuf_test_user_dict

In [797]:
def final_iuf_test_data(test_user_dict):
    test_movie_ratings_count = cal_count_test_non_zero(test_user_dict)
    iuf_for_test_arr = cal_iuf_test_arr(test_movie_ratings_count)

    test_user_dict_copy = test_user_dict.copy()

    iuf_adjusted_test_arr = applied_iuf_to_test_data(test_user_dict_copy, iuf_for_test_arr)

    print(test_movie_ratings_count)
    
    return iuf_adjusted_test_arr

iuf_adjusted_test_5_arr = final_iuf_test_data(test_5_user_dict)
iuf_adjusted_test_10_arr = final_iuf_test_data(test_10_user_dict)
iuf_adjusted_test_20_arr = final_iuf_test_data(test_20_user_dict)


[2, 0, 0, 2, 0, 1, 7, 1, 3, 2, 1, 3, 1, 4, 5, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 3, 2, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 5, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 5, 1, 1, 0, 2, 0, 3, 1, 0, 0, 0, 1, 2, 3, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 5, 0, 1, 2, 1, 0, 3, 0, 0, 0, 0, 0, 2, 3, 1, 1, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 4, 1, 0, 0, 0, 1, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 0, 1, 4, 0, 4, 0, 1, 1, 2, 0, 1, 2, 1, 1, 5, 2, 0, 0, 1, 0, 0, 1, 0, 0, 4, 6, 2, 2, 1, 1, 3, 2, 4, 0, 0, 0, 0, 0, 2, 2, 2, 0, 4, 1, 5, 4, 0, 1, 1, 3, 3, 1, 0, 0, 1, 0, 2, 4, 4, 2, 0, 1, 1, 3, 0, 0, 0, 2, 0, 7, 0, 2, 1, 0, 1, 6, 0, 1, 1, 1, 0, 1, 0, 5, 2, 0, 2, 2, 1, 0, 

In [799]:
iuf_pearson_adjusted_test_5_arr, iuf_pearson_adjusted_test_5_means = calculate_pearson_adjusted_ratings_for_test(iuf_adjusted_test_5_arr)
iuf_pearson_adjusted_test_10_arr, iuf_pearson_adjusted_test_10_means = calculate_pearson_adjusted_ratings_for_test(iuf_adjusted_test_10_arr)
iuf_pearson_adjusted_test_20_arr, iuf_pearson_adjusted_test_20_means = calculate_pearson_adjusted_ratings_for_test(iuf_adjusted_test_20_arr)

print("The length of iuf_pearson_adjusted_test_5_means: " + str(len(iuf_pearson_adjusted_test_5_means)))
print(iuf_pearson_adjusted_test_5_arr)

The length of iuf_pearson_adjusted_test_5_means: 100
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1.901029995663981, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1.4061799739838872, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [804]:
iuf_pearson_adjusted_test_5_matrix = create_cos_sim_matrix(iuf_pearson_adjusted_test_5_arr, iuf_pearson_adjusted_train_arr)
iuf_pearson_adjusted_test_10_matrix = create_cos_sim_matrix(iuf_pearson_adjusted_test_10_arr, iuf_pearson_adjusted_train_arr)
iuf_pearson_adjusted_test_20_matrix = create_cos_sim_matrix(iuf_pearson_adjusted_test_20_arr, iuf_pearson_adjusted_train_arr)

TypeError: list indices must be integers or slices, not list