In [1]:
import pandas as pd
import ast

file_path = "movies.csv"
movies_data = pd.read_csv(file_path)

def preprocess_movies_data(data):

    data = data.dropna(subset=['genres', 'title', 'vote_average', 'vote_count'])

    def extract_genres(genre_str):
        try:
            genres = ast.literal_eval(genre_str)
            return [genre['name'] for genre in genres]
        except (ValueError, KeyError, TypeError):
            return []

    data['genres'] = data['genres'].apply(extract_genres)

    # Filter movies with significant vote count
    data = data[data['vote_count'] > 50]
    return data

# Apply preprocessing
cleaned_movies_data = preprocess_movies_data(movies_data)

# View cleaned data
print(cleaned_movies_data.head())


      budget                                         genres  \
0  237000000  [Action, Adventure, Fantasy, Science Fiction]   
1  300000000                   [Adventure, Fantasy, Action]   
2  245000000                     [Action, Adventure, Crime]   
3  250000000               [Action, Crime, Drama, Thriller]   
4  260000000           [Action, Adventure, Science Fiction]   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a user-item matrix
def create_user_item_matrix(data):
    user_item_matrix = data.pivot_table(index='id', columns='title', values='vote_average').fillna(0)
    return user_item_matrix

# Calculate user similarity
def calculate_user_similarity(matrix):
    user_similarity = cosine_similarity(matrix)
    return pd.DataFrame(user_similarity, index=matrix.index, columns=matrix.index)

# Generate user-item matrix and calculate similarity
user_item_matrix = create_user_item_matrix(cleaned_movies_data)
user_similarity_matrix = calculate_user_similarity(user_item_matrix)

# View user similarity for the first 5 users
print(user_similarity_matrix.iloc[:5, :5])


id   5    11   12   13   14
id                         
5   1.0  0.0  0.0  0.0  0.0
11  0.0  1.0  0.0  0.0  0.0
12  0.0  0.0  1.0  0.0  0.0
13  0.0  0.0  0.0  1.0  0.0
14  0.0  0.0  0.0  0.0  1.0


In [None]:
# Calculate item-based similarity
def calculate_item_similarity(matrix):
    item_similarity = cosine_similarity(matrix.T)
    return pd.DataFrame(item_similarity, index=matrix.columns, columns=matrix.columns)

# Generate item similarity matrix
item_similarity_matrix = calculate_item_similarity(user_item_matrix)

# Recommend movies
def recommend_movies(movie_title, similarity_matrix, num_recommendations=5):
    if movie_title not in similarity_matrix.columns:
        return f"Movie '{movie_title}' not found in the dataset."
    similarity_scores = similarity_matrix[movie_title]
    similar_movies = similarity_scores.sort_values(ascending=False)
    return similar_movies.index[1:num_recommendations+1].tolist()

# Get recommendations for a movie
recommendations = recommend_movies("Avatar", item_similarity_matrix)
print(recommendations)



['#Horror', 'Spy Kids 3-D: Game Over', 'Spirited Away', 'Splash', 'Splice']


In [None]:
from sklearn.decomposition import TruncatedSVD

# Apply SVD
def apply_svd(matrix, n_components=20):
    svd = TruncatedSVD(n_components=n_components)
    reduced_matrix = svd.fit_transform(matrix)
    return reduced_matrix, svd

# Generate reduced matrix
reduced_matrix, svd_model = apply_svd(user_item_matrix)
print(f"Shape of reduced matrix: {reduced_matrix.shape}")


Shape of reduced matrix: (3639, 20)


In [None]:
from sklearn.metrics import mean_squared_error

# Example: Calculate RMSE for dummy predictions
def evaluate_rmse(actual, predicted):
    return mean_squared_error(actual, predicted, squared=False)

# Example usage with mock data
actual = [4, 3, 5]
predicted = [3.8, 3.2, 4.9]
rmse = evaluate_rmse(actual, predicted)
print(f"RMSE: {rmse}")


RMSE: 0.17320508075688779




In [None]:
def user_interface():
    print("Welcome to the Movie Recommendation System!")
    while True:
        movie_title = input("Enter a movie title you like (or 'exit' to quit): ")
        if movie_title.lower() == 'exit':
            print("Thank you for using the system!")
            break
        recommendations = recommend_movies(movie_title, item_similarity_matrix)
        if isinstance(recommendations, str):
            print(recommendations)
        else:
            print(f"Recommendations for '{movie_title}': {', '.join(recommendations)}")

# Run the interface
user_interface()


Welcome to the Movie Recommendation System!
Enter a movie title you like (or 'exit' to quit): Avatar
Recommendations for 'Avatar': #Horror, Spy Kids 3-D: Game Over, Spirited Away, Splash, Splice
Enter a movie title you like (or 'exit' to quit): exit
Thank you for using the system!


In [None]:
# Display all unique movie titles
movie_titles = cleaned_movies_data['title'].unique()
print(f"Number of movies: {len(movie_titles)}")
print(movie_titles[:10])  # Display the first 10 movie titles


Number of movies: 3637
['Avatar' "Pirates of the Caribbean: At World's End" 'Spectre'
 'The Dark Knight Rises' 'John Carter' 'Spider-Man 3' 'Tangled'
 'Avengers: Age of Ultron' 'Harry Potter and the Half-Blood Prince'
 'Batman v Superman: Dawn of Justice']


In [None]:
import pandas as pd
import ast
from sklearn.metrics.pairwise import cosine_similarity

# Function to recommend movies
def recommend_movies(movie_title, similarity_matrix):
    if movie_title not in similarity_matrix.columns:
        return f"Movie '{movie_title}' not found in the dataset."
    similarity_scores = similarity_matrix[movie_title]
    similar_movies = similarity_scores.sort_values(ascending=False)
    return similar_movies.index[1:11].tolist()  # Top 10 recommendations

# User Interface Function
def user_interface():
    print("Welcome to the Movie Recommendation System!")
    print("You can filter your recommendations by genre, release year, and rating.")

    while True:
        movie_title = input("\nEnter a movie title you like (or 'exit' to quit): ")
        if movie_title.lower() == 'exit':
            print("Thank you for using the Movie Recommendation System. Goodbye!")
            break

        # Get user filters
        genre_filter = input("Enter a genre to filter by (or press Enter to skip): ").strip()
        year_filter = input("Enter a release year to filter by (or press Enter to skip): ").strip()
        min_rating = input("Enter minimum rating (0-10, or press Enter to skip): ").strip()

        # Convert filters to proper types
        year_filter = int(year_filter) if year_filter.isdigit() else None
        min_rating = float(min_rating) if min_rating.replace('.', '', 1).isdigit() else 0

        # Get recommendations
        recommendations = recommend_movies(movie_title, item_similarity_matrix)
        if isinstance(recommendations, str):  # If error message is returned
            print(recommendations)
            continue

        # Filter recommendations based on user inputs
        filtered_recommendations = []
        for movie in recommendations:
            movie_data = cleaned_movies_data[cleaned_movies_data['title'] == movie]
            if movie_data.empty:
                continue
            movie_data = movie_data.iloc[0]

            # Check genre filter
            if genre_filter and genre_filter.lower() not in [g.lower() for g in movie_data['genres']]:
                continue


            if year_filter and str(year_filter) != movie_data['release_date'][:4]:
                continue

            if movie_data["vote_average"] < min_rating:
                continue

            filtered_recommendations.append(movie)

        if filtered_recommendations:
            print(f"\nRecommendations for '{movie_title}':")
            print(", ".join(filtered_recommendations))
        else:
            print(f"No recommendations found for '{movie_title}' with your specified filters.")

user_interface()


Welcome to the Movie Recommendation System!
You can filter your recommendations by genre, release year, and rating.

Enter a movie title you like (or 'exit' to quit): Avatar
Enter a genre to filter by (or press Enter to skip): Action
Enter a release year to filter by (or press Enter to skip): 2009
Enter minimum rating (0-10, or press Enter to skip): 8
No recommendations found for 'Avatar' with your specified filters.

Enter a movie title you like (or 'exit' to quit): exit
Thank you for using the Movie Recommendation System. Goodbye!


In [None]:
# Extract unique genres from the dataset
unique_genres = set(genre for genres_list in cleaned_movies_data['genres'] for genre in genres_list)
print(f"Number of unique genres: {len(unique_genres)}")
print(unique_genres)


Number of unique genres: 19
{'TV Movie', 'Thriller', 'Adventure', 'Family', 'Crime', 'Music', 'Romance', 'Mystery', 'Drama', 'Western', 'Animation', 'Comedy', 'History', 'Science Fiction', 'Fantasy', 'Documentary', 'Action', 'War', 'Horror'}


In [None]:
def get_movie_genres(movie_titles):

    movie_genre_dict = {}
    for title in movie_titles:
        movie_data = cleaned_movies_data[cleaned_movies_data['title'] == title]
        if not movie_data.empty:
            genres = movie_data['genres'].iloc[0]
            movie_genre_dict[title] = genres
        else:
            movie_genre_dict[title] = "Movie not found in the dataset"
    return movie_genre_dict

movie_list = ["Avatar", "The Dark Knight", "Iron Man", "Movie not in dataset"] # Example movie titles
movie_genres = get_movie_genres(movie_list)
for title, genres in movie_genres.items():
  print(f"{title}: {genres}")

Avatar: ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
The Dark Knight: ['Drama', 'Action', 'Crime', 'Thriller']
Iron Man: ['Action', 'Science Fiction', 'Adventure']
Movie not in dataset: Movie not found in the dataset


In [None]:
print(cleaned_movies_data.head())


      budget                                         genres  \
0  237000000  [Action, Adventure, Fantasy, Science Fiction]   
1  300000000                   [Adventure, Fantasy, Action]   
2  245000000                     [Action, Adventure, Crime]   
3  250000000               [Action, Crime, Drama, Thriller]   
4  260000000           [Action, Adventure, Science Fiction]   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "name": "spy"}, {"id": 818, 

In [None]:
print(cleaned_movies_data[cleaned_movies_data['title'] == 'Avatar']['genres'])


0    [Action, Adventure, Fantasy, Science Fiction]
Name: genres, dtype: object


In [None]:
print(cleaned_movies_data[cleaned_movies_data['title'] == 'Avatar']['release_date'])


0    10-12-2009
Name: release_date, dtype: object


In [None]:
print(cleaned_movies_data[cleaned_movies_data['title'] == 'Avatar']['vote_average'])


0    7.2
Name: vote_average, dtype: float64


In [None]:
print(cleaned_movies_data[cleaned_movies_data['title'] == 'Avatar']['vote_average'])



0    7.2
Name: vote_average, dtype: float64


In [None]:
['Action', 'Adventure', 'Sci-Fi']


['Action', 'Adventure', 'Sci-Fi']

In [None]:
print(movies_data[movies_data['title'] == 'Avatar'])



      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                      homepage     id  \
0  http://www.avatarmovie.com/  19995   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   

  original_title                                           overview  \
0         Avatar  In the 22nd century, a paraplegic Marine is di...   

   popularity                               production_companies  \
0  150.437577  [{"name": "Ingenious Film Partners", "id": 289...   

                                production_countries release_date     revenue  \
0  [{"iso_3166_1": "US", "name": "United States o...   10-12-2009  2787965087   

   runtime                                   spoken_languages    status  \
0    162.0  [{"iso_639_1": "en", "name": "English"}, {"iso...  Released   

                       tagline   title 

In [None]:
print(movies_data.shape)
print(movies_data.columns)


(4803, 20)
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')


In [None]:


import pandas as pd
import ast
from sklearn.metrics.pairwise import cosine_similarity

try:
    movies_data = pd.read_csv("movies.csv")
except FileNotFoundError:
    print("Error: 'movies.csv' not found. Please make sure the file is in the same directory as the script.")
    exit()

def recommend_movies(movie_title, similarity_matrix, genre_filter=None, year_filter=None, min_rating=0):
    if movie_title not in similarity_matrix.columns:
        return f"Movie '{movie_title}' not found in the dataset."

    similarity_scores = similarity_matrix[movie_title]
    similar_movies = similarity_scores.sort_values(ascending=False)

    recommendations = []
    for movie in similar_movies.index[1:11]:  # Top 10 recommendations
      movie_data = cleaned_movies_data[cleaned_movies_data['title'] == movie]

      if movie_data.empty:
          continue

      movie_data = movie_data.iloc[0]

      if genre_filter and genre_filter.lower() not in [g.lower() for g in movie_data['genres']]:
          continue

      if year_filter and str(year_filter) != str(movie_data['release_date'])[:4]:
          continue

      if movie_data["vote_average"] < min_rating:
          continue

      recommendations.append(movie)

    return recommendations[:10]  # Return at most 10 recommendations


def user_interface():
    print("Welcome to the Movie Recommendation System!")

    while True:
        movie_title = input("\nEnter a movie title you like (or 'exit' to quit): ").strip()
        if movie_title.lower() == 'exit':
            print("Thank you for using the Movie Recommendation System. Goodbye!")
            break

        genre_filter = input("Enter a genre to filter by (or press Enter to skip): ").strip()
        year_filter = input("Enter a release year to filter by (or press Enter to skip): ").strip()
        min_rating_str = input("Enter minimum rating (0-10, or press Enter to skip): ").strip()
        min_rating = float(min_rating_str) if min_rating_str and min_rating_str.replace('.', '', 1).isdigit() else 0


        recommendations = recommend_movies(movie_title, item_similarity_matrix, genre_filter, year_filter, min_rating)

        if isinstance(recommendations, str):
            print(recommendations)
        elif recommendations:
            print(f"\nRecommendations for '{movie_title}':")
            for movie in recommendations:
                print(movie)
        else:
            print(f"No recommendations found for '{movie_title}' with your specified filters.")


cleaned_movies_data = preprocess_movies_data(movies_data)
user_item_matrix = create_user_item_matrix(cleaned_movies_data)
item_similarity_matrix = calculate_item_similarity(user_item_matrix)


user_interface()

Welcome to the Movie Recommendation System!

Enter a movie title you like (or 'exit' to quit): Avatar
Enter a genre to filter by (or press Enter to skip): Action
Enter a release year to filter by (or press Enter to skip): 2009
Enter minimum rating (0-10, or press Enter to skip): 8
No recommendations found for 'Avatar' with your specified filters.

Enter a movie title you like (or 'exit' to quit): The Lion King
Enter a genre to filter by (or press Enter to skip): Adventure
Enter a release year to filter by (or press Enter to skip): 1994
Enter minimum rating (0-10, or press Enter to skip): 8.5
No recommendations found for 'The Lion King' with your specified filters.

Enter a movie title you like (or 'exit' to quit): exit
Thank you for using the Movie Recommendation System. Goodbye!


In [None]:
print(movies_data[movies_data['title'] == 'Avatar'])
print(movies_data[movies_data['title'] == 'The Lion King'])


      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                      homepage     id  \
0  http://www.avatarmovie.com/  19995   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   

  original_title                                           overview  \
0         Avatar  In the 22nd century, a paraplegic Marine is di...   

   popularity                               production_companies  \
0  150.437577  [{"name": "Ingenious Film Partners", "id": 289...   

                                production_countries release_date     revenue  \
0  [{"iso_3166_1": "US", "name": "United States o...   10-12-2009  2787965087   

   runtime                                   spoken_languages    status  \
0    162.0  [{"iso_639_1": "en", "name": "English"}, {"iso...  Released   

                       tagline   title 

In [None]:
movies_data['title'] = movies_data['title'].str.strip().str.lower()


In [None]:
title = title.strip().lower()


In [None]:
print(movies_data['genres'].head())  # Check the format of genres


0    [{"id": 28, "name": "Action"}, {"id": 12, "nam...
1    [{"id": 12, "name": "Adventure"}, {"id": 14, "...
2    [{"id": 28, "name": "Action"}, {"id": 12, "nam...
3    [{"id": 28, "name": "Action"}, {"id": 80, "nam...
4    [{"id": 28, "name": "Action"}, {"id": 12, "nam...
Name: genres, dtype: object


In [None]:
import ast

def process_genres(genre_data):
    try:
        genres = ast.literal_eval(genre_data)
        return genres if isinstance(genres, list) else []
    except:
        return []

movies_data['genres'] = movies_data['genres'].apply(process_genres)


In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/movies.csv'
movies_data = pd.read_csv(file_path)

# Inspect the first few rows and columns
print("Columns in the dataset:", movies_data.columns)
print(movies_data.head())


Columns in the dataset: Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count'],
      dtype='object')
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647  

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
from sklearn.model_selection import train_test_split

user_item_matrix = movies_data.pivot_table(index='id', columns='title', values='vote_average').fillna(0)


train_data, test_data = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

movie_avg_ratings = train_data.mean()
predictions = test_data.copy()

for column in predictions.columns:
    predictions[column] = movie_avg_ratings[column]
test_data = test_data.fillna(0)

rmse = np.sqrt(mean_squared_error(test_data, predictions))
mae = mean_absolute_error(test_data, predictions)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 0.08965467827314373
MAE: 0.0025368472173846472
