In [None]:
%pip install pandas numpy scikit-learn gradio tensorboard >/dev/null 

In [60]:

import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
import gradio as gr
from sklearn.metrics import explained_variance_score
from tensorboardX import SummaryWriter

In [None]:
# Event files for TensorBoard are written below ./logs
writer = SummaryWriter(log_dir="./logs")

# MovieLens data set
This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.

Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.

The data are contained in the files links.csv, movies.csv, ratings.csv and tags.csv.

In [None]:
from zipfile import ZipFile
import os
from urllib.request import urlretrieve

# Download the ml-latest-small archive only when it is not already on disk, so
# re-running the cell does not fetch it again (and does not leave numbered
# duplicate copies behind, as repeated wget calls do).
dataset_url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
archive_path = 'ml-latest-small.zip'
if not os.path.exists(archive_path):
    urlretrieve(dataset_url, archive_path)

# Extract into the working directory; the archive unpacks to ./ml-latest-small/
with ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall()

# Load the dataset
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Check basic data
print("Movies dataset shape:", movies.shape)
print("Ratings dataset shape:", ratings.shape)
# display() renders each frame as a rich table; a bare tuple of frames would be
# shown as its plain-text repr instead.
display(movies.head(), ratings.head())


In [None]:
import pandas as pd
import numpy as np

# Check for missing values
print("Missing values in ratings:", ratings.isnull().sum())

# Remove duplicates if any
ratings = ratings.drop_duplicates()

# Create a user-item interaction matrix: one row per userId, one column per
# movieId that has at least one rating; pairs without a rating are filled with 0.
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

# Normalize the matrix (optional)
# Outlier detection needs to be done before normalization as it can affect the
# StandardScaler output.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
user_item_matrix_scaled = scaler.fit_transform(user_item_matrix)

# Wrap the scaled NumPy array in a DataFrame that keeps the userId / movieId
# labels, so rows and columns stay identifiable when inspected.
user_item_matrix_scaled_df = pd.DataFrame(
    user_item_matrix_scaled,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

# Preview the first rows only
display(user_item_matrix_scaled_df.head())

In [63]:
# Define the custom RMSE scoring function
def rmse_scorer(model, X, y):
    """Return the RMSE between ``y`` and the model's low-rank reconstruction of ``X``.

    Has the ``(estimator, X, y)`` call signature, which is how it is passed as
    ``scoring=`` to ``cross_val_score`` in this notebook.

    Args:
        model: Fitted estimator exposing ``transform`` and ``components_``.
        X: Matrix to project into the latent space.
        y: Target matrix the reconstruction is compared against.

    Returns:
        The root mean squared error (lower is better).
    """
    # Project into the latent space, then map back through the components
    latent = model.transform(X)
    reconstruction = np.dot(latent, model.components_)
    mse = mean_squared_error(y, reconstruction)
    return np.sqrt(mse)


In [None]:
# Factorize the standardized user-item matrix with a 50-component truncated SVD
svd = TruncatedSVD(random_state=42, n_components=50)
svd_matrix = svd.fit_transform(user_item_matrix_scaled)

# Keep the user ids as row labels of the latent representation
svd_df = pd.DataFrame(data=svd_matrix, index=user_item_matrix.index)

# Show the SVD results
display(svd_df)


In [None]:
# Perform cross-validation with RMSE scoring.
# cross_val_score does the splitting itself and fits a *clone* of `svd` on each
# training fold, so:
#   * every one of the 5 folds contributes one score (no value is overwritten), and
#   * the global `svd` fitted on the full matrix is left untouched for the
#     cells that follow.
# X and y are the same matrix because the score is the reconstruction error of
# the held-out users.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    svd,
    user_item_matrix_scaled,
    user_item_matrix_scaled,
    cv=kf,
    scoring=rmse_scorer,
)

# Show the results
print("Cross-validated RMSE scores:", cv_scores)
print("Mean RMSE:", np.mean(cv_scores))


In [None]:
# Latent representations taken from the fitted SVD model
item_factors = svd.components_.T                       # Shape: (n_items, n_components)
user_factors = svd.transform(user_item_matrix_scaled)  # Shape: (n_users, n_components)

print("User Factors (Reduced Representation):\n", user_factors)
print("Item Factors (Reduced Representation):\n", item_factors)

In [None]:
# New user vector: one row with one entry per column of user_item_matrix.
# The width is taken from the matrix instead of being hard-coded.
new_user_vector = np.zeros((1, user_item_matrix.shape[1]))  # Single new user with all zeros
# new_user_vector[0, [1, 3, 6, 10, 15]] = [4, 3, 5, 2, 1]  # Example ratings

# Filter movies by a specific genre, e.g., 'Action'
genre = 'Action'
filtered_movies = movies[movies['genres'].str.contains(genre, case=False, na=False)]

# Only movies that appear as a column of user_item_matrix can be placed in the
# vector; looking up any other movieId with get_loc would raise KeyError.
rated_genre_ids = filtered_movies.loc[
    filtered_movies['movieId'].isin(user_item_matrix.columns), 'movieId'
].to_numpy()

# Average rating per movie, computed once instead of re-filtering `ratings`
# for every movie in the genre.
mean_rating_by_movie = ratings.groupby('movieId')['rating'].mean()
genre_positions = user_item_matrix.columns.get_indexer(rated_genre_ids)
new_user_vector[0, genre_positions] = mean_rating_by_movie.reindex(rated_genre_ids).to_numpy()

# The SVD was fitted on the standardized matrix, so the new user is passed
# through the same fitted scaler before being projected into the reduced space.
new_user_factors = svd.transform(scaler.transform(new_user_vector))
print("New User Factors:\n", new_user_factors)
print("Shape of New User Factors:", new_user_factors.shape)


In [None]:
# Rebuild an approximation of the user-item matrix from the latent factors
reconstructed_matrix = user_factors @ item_factors.T

print("Reconstructed User-Item Matrix:\n", reconstructed_matrix)
print("Shape of Reconstructed User-Item Matrix:", reconstructed_matrix.shape)
display(
    pd.DataFrame(
        data=reconstructed_matrix,
        index=user_item_matrix.index,
        columns=user_item_matrix.columns,
    )
)

In [None]:
# Assume svd is already trained as shown in the previous part

def recommend_movies(user_input, top_n=15):
    """Recommend movie titles for a genre typed by the user.

    A synthetic user is built who gave every rated movie of the requested genre
    that movie's average rating. The user is projected through the fitted SVD
    model and the movies with the highest reconstructed scores are returned.

    Uses the notebook globals ``movies``, ``ratings``, ``user_item_matrix``,
    ``scaler`` and the fitted ``svd``.

    Args:
        user_input: Genre text. Matched as a literal, case-insensitive
            substring of ``movies['genres']``.
        top_n: Maximum number of titles to return. Defaults to 15, the number
            the slice in this function has always produced.

    Returns:
        A list of movie titles ordered from highest to lowest score, or a
        one-element list holding a message when no movie matches the input.
    """
    genre_query = (user_input or "").strip()
    # A blank query would match every genre string, so treat it as "no match".
    if not genre_query:
        return ["No movies found for this genre."]

    # Step 1: Filter movies by genre. Only movies that own a column in
    # user_item_matrix (i.e. have ratings) can be placed in the user vector.
    # regex=False keeps characters such as "(" or "+" from being interpreted
    # as a regular expression.
    rated_movie_ids = user_item_matrix.columns
    genre_mask = movies['genres'].str.contains(genre_query, case=False, na=False, regex=False)
    genre_movie_ids = movies.loc[
        genre_mask & movies['movieId'].isin(rated_movie_ids), 'movieId'
    ].to_numpy()

    # Step 2: Handle case when no movies are found for the genre
    if genre_movie_ids.size == 0:
        return ["No movies found for this genre."]

    # Step 3: Simulate a user who rated each genre movie with its average
    # rating; all other movies stay at 0. The per-movie means are computed in
    # a single groupby pass.
    mean_rating_by_movie = ratings.groupby('movieId')['rating'].mean()
    user_vector = np.zeros((1, user_item_matrix.shape[1]))
    genre_positions = rated_movie_ids.get_indexer(genre_movie_ids)
    user_vector[0, genre_positions] = mean_rating_by_movie.reindex(genre_movie_ids).to_numpy()

    # Step 4: Get predicted scores using the trained SVD model. The model was
    # fitted on the standardized matrix, so the query vector goes through the
    # same fitted scaler first; the scores are therefore in standardized units.
    scaled_vector = scaler.transform(user_vector)
    predicted_ratings = svd.transform(scaled_vector).dot(svd.components_)

    # Step 5: Take the top_n highest-scoring columns, best first. Reversing
    # before slicing keeps top_n == 0 meaning "no results".
    top_n = max(int(top_n), 0)
    top_positions = np.argsort(predicted_ratings[0])[::-1][:top_n]

    # Column positions refer to user_item_matrix, not to rows of `movies`, so
    # translate position -> movieId -> title rather than using movies.iloc.
    top_movie_ids = rated_movie_ids[top_positions]
    title_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']
    recommended_movie_titles = title_by_id.reindex(top_movie_ids).dropna().tolist()

    return recommended_movie_titles

# Wire recommend_movies to a simple text-in / text-out Gradio app
iface = gr.Interface(
    title="Movie Recommendation System",
    description="Enter a genre or movie preference and get personalized movie recommendations.",
    fn=recommend_movies,
    inputs="text",
    outputs="text",
)

# Start serving the interface
iface.launch()

# recommend_movies(user_input="ACTION")
