In [None]:
# !pip install gradio scikit-learn pandas numpy

In [1]:
%pip install pandas numpy scikit-learn gradio

import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, KFold
import gradio as gr


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
from zipfile import ZipFile
import os

# Upload and extract the ml-latest-small.zip file
# !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
with ZipFile('ml-latest-small.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Load the dataset
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# Check basic data
print("Movies dataset shape:", movies.shape)
print("Ratings dataset shape:", ratings.shape)
movies.head(), ratings.head()


Movies dataset shape: (9742, 3)
Ratings dataset shape: (100836, 4)


(   movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
    userId  movieId  rating  timestamp
 0       1        1     4.0  964982703
 1       1        3     4.0  964981247
 2       1        6     4.0  964982224
 3       1       47     5.0  964983815
 4       1       50     5.0  964982931)

In [16]:
import pandas as pd
import numpy as np

# Check for missing values
print("Missing values in ratings:", ratings.isnull().sum())

# Remove duplicates if any
ratings = ratings.drop_duplicates()

# Create a user-item interaction matrix
user_item_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating', fill_value=0)

# Normalize the matrix (optional)
#outlier detection needs to be done before normalization as it can affect the
#standardScaler function output
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
user_item_matrix_scaled = scaler.fit_transform(user_item_matrix)

# Convert the NumPy array to a pandas DataFrame
user_item_matrix_scaled_df = pd.DataFrame(user_item_matrix_scaled)

# Now you can use the head method
display(user_item_matrix_scaled_df.head())


Missing values in ratings: userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723
0,1.351355,-0.451377,3.877305,-0.102112,-0.282698,2.212582,-0.296949,-0.108158,-0.156937,-0.50648,...,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522
1,-0.713333,-0.451377,-0.289453,-0.102112,-0.282698,-0.437087,-0.296949,-0.108158,-0.156937,-0.50648,...,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522
2,-0.713333,-0.451377,-0.289453,-0.102112,-0.282698,-0.437087,-0.296949,-0.108158,-0.156937,-0.50648,...,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522
3,-0.713333,-0.451377,-0.289453,-0.102112,-0.282698,-0.437087,-0.296949,-0.108158,-0.156937,-0.50648,...,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522
4,1.351355,-0.451377,-0.289453,-0.102112,-0.282698,-0.437087,-0.296949,-0.108158,-0.156937,-0.50648,...,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522,-0.040522


In [5]:
# Define the custom RMSE scoring function
def rmse_scorer(model, X, y):
    # Predict ratings by multiplying user-item matrix with SVD components
    predicted_ratings = model.transform(X).dot(model.components_)
    # Compute RMSE
    return np.sqrt(mean_squared_error(y, predicted_ratings))


In [6]:
# Apply Truncated SVD for matrix factorization
svd = TruncatedSVD(n_components=50, random_state=42)
svd_matrix = svd.fit_transform(user_item_matrix_scaled)

# Convert the SVD results back into a DataFrame for easier understanding
svd_df = pd.DataFrame(svd_matrix, index=user_item_matrix.index)

# Show the SVD results
print(svd_df.head())


              0         1         2         3         4         5         6   \
userId                                                                         
1       4.562492 -0.149098  0.348287  2.473971 -3.384682  0.121423 -1.742425   
2      -8.023881  0.263295 -1.138552 -1.664633 -1.423625 -1.943960 -0.970332   
3      -8.242660  0.566647 -1.301136 -1.892789 -1.556872 -1.886818 -1.285589   
4       1.290247  4.620039 -1.927515  0.105165 -3.318396  4.467783 -1.593049   
5      -7.931517  1.262333 -1.744985 -1.600606 -2.213035 -1.417978 -1.248382   

              7         8         9   ...        40        41        42  \
userId                                ...                                 
1       0.807162 -4.754663 -7.146543  ... -2.072484  4.367784  6.695107   
2      -1.740035  2.736711  2.362845  ... -0.576224  0.302201 -0.208763   
3      -3.244104  2.664849  0.802229  ... -1.084267  1.882293  1.203937   
4      -3.927323  2.097055 -5.305602  ...  0.138131  0.404978  0

In [7]:
# Perform cross-validation with RMSE scoring
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation on the SVD model
cv_scores = cross_val_score(svd, user_item_matrix_scaled, y=None, cv=kf, scoring=rmse_scorer)

# Show the results
print("Cross-validated RMSE scores:", cv_scores)
print("Mean RMSE:", np.mean(cv_scores))


Traceback (most recent call last):
  File "C:\Users\Ramro\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\metrics\_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: rmse_scorer() missing 1 required positional argument: 'y'

Traceback (most recent call last):
  File "C:\Users\Ramro\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\metrics\_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: rmse_scorer() missing 1 required positional argument: 'y'

Traceback (most recent call last):
  File "C:\Users\Ramro\AppData\Local\Packages\PythonSoftwareFoundation.Python.3

Cross-validated RMSE scores: [nan nan nan nan nan]
Mean RMSE: nan


Traceback (most recent call last):
  File "C:\Users\Ramro\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\metrics\_scorer.py", line 143, in __call__
    score = scorer(estimator, *args, **routed_params.get(name).score)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: rmse_scorer() missing 1 required positional argument: 'y'



In [8]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
import gradio as gr

# Assume svd is already trained as shown in the previous part

def recommend_movies(user_input):
    # Step 1: Filter movies by genre based on user input
    user_input = user_input.lower()
    filtered_movies = movies[movies['genres'].str.contains(user_input, case=False, na=False)]

    print(filtered_movies.head())  # Add this for debugging


    # Step 2: Handle case when no movies are found for the genre
    if filtered_movies.empty:
        return ["No movies found for this genre."]

    # Step 3: Simulate a user vector with neutral ratings for simplicity
    user_vector = np.zeros((1, len(movies)))  # Initialize with zeros for all movies

    # Update the user vector with average ratings for movies in the selected genre
    for idx in filtered_movies.index:
        movie_id = filtered_movies.loc[idx, 'movieId']
        avg_rating = np.mean(ratings[ratings['movieId'] == movie_id]['rating'])
        user_vector[0, idx] = avg_rating

    # Step 4: Get predicted ratings using the trained SVD model
    predicted_ratings = svd.transform(user_vector).dot(svd.components_)

    # Step 5: Get top 5 recommended movie titles
    top_indices = np.argsort(predicted_ratings[0])[-5:][::-1]  # Sort and get top 5
    recommended_movie_titles = movies.iloc[top_indices]['title'].tolist()

    return recommended_movie_titles

# Create the Gradio interface
iface = gr.Interface(
    fn=recommend_movies,
    inputs="text",
    outputs="text",
    title="Movie Recommendation System",
    description="Enter a genre or movie preference and get personalized movie recommendations."
)

# Launch the interface
iface.launch()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Created dataset file at: .gradio\flagged\dataset1.csv
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
Empty DataFrame
Columns: [movieId, title, genres]
Index: []
