In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np

In [4]:
# Load the dataset
filtered_movies = pd.read_csv("Filtered_data.csv")
filtered_movies

Unnamed: 0.1,Unnamed: 0,Title,Release year,Genre,Duration,Rating,Viewership Certificate,User votes,Plot synopsis,Director,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,0,Godzilla vs. Mechagodzilla,1974,"Animation, Action, Adventure",1h 24m,6.2,PG,8000.0,An Okinawan prophecy appears to foretell Earth...,Jun Fukuda,...,0,0,0,0,0,0,0,0,0,0
1,1,Respect,2021,"Biography, Drama, Music",2h 25m,6.6,PG-13,18000.0,Following the rise of Aretha Franklin's career...,Liesl Tommy,...,0,0,0,0,0,0,0,0,0,0
2,2,A New Leaf,1971,"Comedy, Romance",1h 42m,7.3,G,7700.0,Henry Graham lives the life of a playboy. When...,Elaine May,...,0,0,0,0,0,0,0,0,0,0
3,3,Bamboozled,2000,"Comedy, Drama, Music",2h 15m,6.7,R,12000.0,A frustrated African-American TV writer propos...,Spike Lee,...,0,0,0,0,0,0,0,0,0,0
4,4,A Cowgirl's Story,2017,"Drama, Family",1h 38m,5.6,PG,706.0,Dusty Rhodes (Bailee Madison) & her grandfathe...,Timothy Armstrong,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29135,29135,The Wedding Invitation,1932,"Horror, Mystery",57m,4.1,Passed,1100.0,People in an old dark house on a stormy night ...,Frank R. Strayer,...,0,0,0,0,0,0,0,0,0,0
29136,29136,The Monster Walks,2001,"Action, Adventure, Comedy",1h 23m,2.8,R,982.0,An Antarctic drilling station is under attack ...,John Carl Buechler,...,0,0,0,0,0,0,0,0,0,0
29137,29137,Deep Freeze,2020,Documentary,1h 40m,8.3,PG-13,1900.0,Former Chief Official White House Photographer...,Dawn Porter,...,0,0,0,0,0,0,0,0,0,0
29138,29138,Under the Influencer,1947,Western,3h,6.3,Approved,171.0,"On the run from Missouri, outlaw Jesse James a...",Fred C. Brannon,...,0,0,0,0,0,0,0,0,0,1


In [5]:
filtered_movies.rename(columns={
    'Release year': 'Release_year',
    'User votes': 'User_votes',
    'Plot synopsis': 'Plot_synopsis',
    'Poster Link': 'Poster_Link'
}, inplace=True)

In [6]:
filtered_movies.columns

Index(['Unnamed: 0', 'Title', 'Release_year', 'Genre', 'Duration', 'Rating',
       'Viewership Certificate', 'User_votes', 'Plot_synopsis', 'Director',
       'Poster_Link', 'Action', 'Adventure', 'Animation', 'Biography',
       'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy',
       'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western'],
      dtype='object')

In [7]:
movie_nlp = filtered_movies[['Title','Release_year','Genre','Duration','Rating','User_votes','Plot_synopsis','Director','Poster_Link']]
movie_nlp.to_csv("sql_data.csv")

In [8]:
movie_nlp['combined_features'] = movie_nlp['Title'] + ' ' + movie_nlp['Plot_synopsis'] + ' ' + movie_nlp['Genre'] + ' ' + ' ' + movie_nlp['Genre'] + ' '+ ' ' + movie_nlp['Genre'] + ' ' + movie_nlp['Director']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_nlp['combined_features'] = movie_nlp['Title'] + ' ' + movie_nlp['Plot_synopsis'] + ' ' + movie_nlp['Genre'] + ' ' + ' ' + movie_nlp['Genre'] + ' '+ ' ' + movie_nlp['Genre'] + ' ' + movie_nlp['Director']


In [9]:
# Vectorize combined features using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_nlp['combined_features'])

In [10]:
# Convert to sparse matrix (csr_matrix)
tfidf_matrix_sparse = csr_matrix(tfidf_matrix)

In [11]:
import pickle
with open ("model.pkl","wb") as file:
    pickle.dump((tfidf_matrix_sparse, tfidf, movie_nlp), file)

In [12]:
# # Compute cosine similarity
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# cosine_sim

In [13]:
def get_recommendations(title, min_rating=0):

    # Check if the movie title exists
    if title not in movie_nlp['Title'].values:
        return "Movie not found in the database."

    # Get the index of the movie that matches the title
    idx = movie_nlp[movie_nlp['Title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie on-the-fly
    sim_scores = cosine_similarity(tfidf_matrix_sparse[idx], tfidf_matrix_sparse).flatten()

    # Sort the movies based on similarity scores
    sim_scores_indices = np.argsort(sim_scores)[::-1]

    # Get the scores of the 15 most similar movies
    sim_scores_indices = sim_scores_indices[1:16]

    # Filter movies based on the minimum rating
    similar_movies = movie_nlp.iloc[sim_scores_indices]
    similar_movies = similar_movies[similar_movies['Rating'] >= min_rating]

    # Return the titles of the top recommendations
    return similar_movies[['Title','Rating','Genre','Director','User_votes','Plot_synopsis','Duration','Release_year','Poster_Link']]


In [14]:
# Example usage
recommended_movies = (get_recommendations('The Avengers', min_rating=2))
print(recommended_movies)

                              Title  Rating                      Genre  \
17899       Avengers: Age of Ultron     7.3  Action, Adventure, Sci-Fi   
18843                      Serenity     7.8  Action, Adventure, Sci-Fi   
23395                  The Avengers     3.8  Action, Adventure, Sci-Fi   
14668                      Gangland     2.8             Action, Sci-Fi   
21537                      Spectral     6.3  Action, Adventure, Sci-Fi   
12690  Rogue Warrior: Robot Fighter     3.1             Action, Sci-Fi   
18585                Captain Marvel     6.8  Action, Adventure, Sci-Fi   
18162             War of the Worlds     6.5  Action, Adventure, Sci-Fi   
18308                      Superman     7.4  Action, Adventure, Sci-Fi   
28390          Rain Without Thunder     4.2     Action, Horror, Sci-Fi   
27823                      Fall Guy     5.1  Action, Adventure, Sci-Fi   
17683                      Godzilla     6.4  Action, Adventure, Sci-Fi   
18054    Captain America: Civil War   

In [15]:
for index, row in recommended_movies.iterrows():
    print(row["Title"], row["Genre"])

Avengers: Age of Ultron Action, Adventure, Sci-Fi
Serenity Action, Adventure, Sci-Fi
The Avengers Action, Adventure, Sci-Fi
Gangland Action, Sci-Fi
Spectral Action, Adventure, Sci-Fi
Rogue Warrior: Robot Fighter Action, Sci-Fi
Captain Marvel Action, Adventure, Sci-Fi
War of the Worlds Action, Adventure, Sci-Fi
Superman Action, Adventure, Sci-Fi
Rain Without Thunder Action, Horror, Sci-Fi
Fall Guy Action, Adventure, Sci-Fi
Godzilla Action, Adventure, Sci-Fi
Captain America: Civil War Action, Sci-Fi
Avengers: Infinity War Action, Adventure, Sci-Fi
Zone Troopers Action, Adventure, Sci-Fi
