In [10]:
import pandas as pd
import numpy as np

# Step 1: Load the dataset
dataset_path = '/content/dataset.csv'
df = pd.read_csv(dataset_path)

# Step 2: Extract track names (adjust the column name based on the dataset)
track_names = df['track_name']  # Replace 'track_name' with the actual column name

# Step 3: Generate random ratings (1-5) and some NaNs
num_users = 50
ratings = np.random.randint(1, 6, size=(len(track_names), num_users))  # Ratings from 1 to 5

# Step 4: Introduce NaNs based on a set probability (e.g., 20% chance of NaN)
nan_probability = 0.2  # 20% probability of NaN
nan_mask = np.random.rand(len(track_names), num_users) < nan_probability
ratings = ratings.astype(float)  # Convert to float to allow NaNs
ratings[nan_mask] = np.nan

# Step 5: Create a new DataFrame with ratings and track names
ratings_df = pd.DataFrame(ratings, columns=[f'user_{i+1}' for i in range(num_users)])
ratings_df['track_name'] = track_names.values

# Step 6: Rearrange columns to have track_name first
ratings_df = ratings_df[['track_name'] + [f'user_{i+1}' for i in range(num_users)]]

# Step 7: Save the new dataset
ratings_df.to_csv('spotify_tracks_with_random_ratings.csv', index=False)

print("New dataset created with track names and random ratings (with NaNs)!")


New dataset created with track names and random ratings (with NaNs)!


In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset (replace 'spotify_tracks_with_random_ratings.csv' with your actual file path)
ratings_df = pd.read_csv('spotify_tracks_with_random_ratings.csv')
ratings_df.set_index('track_name', inplace=True)

# Transpose the DataFrame for user-based filtering (users as rows, items as columns)
ratings_user_based = ratings_df.T


In [12]:
# Fill NaN with 0 for similarity calculation (optional - depends on method)
ratings_user_filled = ratings_user_based.fillna(0)
user_cosine_sim = cosine_similarity(ratings_user_filled)

# Convert to a DataFrame for easier access
user_cosine_sim_df = pd.DataFrame(user_cosine_sim, index=ratings_user_based.index, columns=ratings_user_based.index)


In [13]:
user_pearson_corr = ratings_user_based.corr(method='pearson')

In [14]:
# Fill NaN with 0 for item-based similarity calculation
ratings_item_filled = ratings_df.fillna(0)
item_cosine_sim = cosine_similarity(ratings_item_filled)

# Convert to a DataFrame for easier access
item_cosine_sim_df = pd.DataFrame(item_cosine_sim, index=ratings_df.index, columns=ratings_df.index)


In [15]:
item_pearson_corr = ratings_df.corr(method='pearson')

In [19]:
def predict_user_based(user_id, item_id, similarity_df, ratings_matrix):
    # Get similar users and their similarities
    sim_users = similarity_df[user_id]

    # Extract the ratings of similar users for the target item
    sim_ratings = ratings_matrix[item_id]

    # Weighted average of ratings with similarity weights, ignoring NaNs
    numerator = np.nansum(sim_users * sim_ratings)
    denominator = np.nansum(np.abs(sim_users))

    # Return the prediction or NaN if denominator is zero (no similar users)
    return numerator / denominator if denominator != 0 else np.nan

# Example: Predict rating for user 'user_1' on item 'track_name_1'
user_id = 'user_1'
item_id = 'Comedy'
user_based_prediction = predict_user_based(user_id, item_id, user_cosine_sim_df, ratings_user_based)
print(f"User-based prediction for user '{user_id}' on item '{item_id}':", user_based_prediction)


User-based prediction for user 'user_1' on item 'Comedy': 2.3057887992016752


In [20]:
def predict_item_based(user_id, item_id, similarity_df, ratings_matrix):
    # Get similar items and their similarities
    sim_items = similarity_df[item_id]

    # Extract the ratings for the target user across similar items
    sim_ratings = ratings_matrix.loc[user_id]

    # Weighted average of ratings with similarity weights, ignoring NaNs
    numerator = np.nansum(sim_items * sim_ratings)
    denominator = np.nansum(np.abs(sim_items))

    # Return the prediction or NaN if denominator is zero (no similar items)
    return numerator / denominator if denominator != 0 else np.nan

# Example: Predict rating for user 'user_1' on item 'track_name_1'
item_based_prediction = predict_item_based(user_id, item_id, item_cosine_sim_df, ratings_df.T)
print(f"Item-based prediction for user '{user_id}' on item '{item_id}':", item_based_prediction)


Item-based prediction for user 'user_1' on item 'Comedy': 2.4246244712137104
