# Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Load the dataset

In [None]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [None]:
# view first 2 rows of movies
movies.head(2)

In [None]:
# view first 2 rows of credits
credits.head(2)

In [None]:
# check how many rows and columns are available in the movies dataset
movies.shape

In [None]:
# check how many rows and columns are available in the credits dataset
credits.shape

# Merge Datasets

In [None]:
# Join on the movie id rather than the title: distinct films can share a
# title, and a title join would cross-pair their rows. The credits 'title'
# column is dropped first so the result keeps one unsuffixed 'title' column.
# NOTE(review): assumes the movies file names its id column 'id' and the
# credits file names it 'movie_id' — confirm against the CSV headers.
movies = movies.merge(credits.drop(columns='title'), left_on='id', right_on='movie_id')

In [None]:
# view first 2 rows of new movies dataset
movies.head(2)

In [None]:
# check how many rows and columns are available in the new movies dataset
movies.shape

# Preprocess the data

In [None]:
# check which columns are in the new movies dataset
movies.columns

In [None]:
# keep only the needed columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'vote_count']]

In [None]:
# print new movie dataset
movies

In [None]:
# check no of colomns and rows
movies.shape

Remove missing values

In [None]:
# check missing values
movies.isnull().sum()

In [None]:
# drop missing values, then renumber the rows so that index labels equal row
# positions again; later cells read `.index[0]` and use it to address rows of
# a NumPy similarity matrix and `.iloc`, both of which are positional
movies.dropna(inplace=True)
movies.reset_index(drop=True, inplace=True)

In [None]:
# check again missing values in here
movies.isnull().sum()

In [None]:
# check no of colomns and rows (without missing values)
movies.shape

Remove duplicate values

In [None]:
# check duplicated movies in here
movies.duplicated().sum()

Convert Genres

In [None]:
# get first genres
movies.iloc[0]['genres']

In [None]:
# check type of genres
type(movies.iloc[0]['genres'])

In [None]:
import ast # it can convert string to list

# create convert function
def convert(text):
    """Parse a stringified list of dicts and return the 'name' of each entry.

    `text` is evaluated with `ast.literal_eval`; the names are returned in
    the order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(text)]


In [None]:
# apply the function of genres
movies['genres'] = movies['genres'].apply(convert)

In [None]:
# view first 2 rows of movies
movies.head(2)

Convert Keywords

In [None]:
# get first keywords
movies.iloc[0]['keywords']

In [None]:
# apply the function of keywords
movies['keywords'] = movies['keywords'].apply(convert)

In [None]:
# view first 2 rows of movies
movies.head(2)

Convert Cast

In [None]:
# get first cast
movies.iloc[0]['cast']

In [None]:
# create convert_cast function
def convert_cast(text, limit=3):
    """Return the names of the first `limit` entries of a stringified cast list.

    Args:
        text: String holding a Python-literal list of dicts, each with a
            'name' key; parsed with `ast.literal_eval`.
        limit: Maximum number of names to return (non-negative). Defaults to
            3, the number of entries the original implementation kept.

    Returns:
        A list with at most `limit` names, in their original order.
    """
    # Slice before iterating so entries past the limit are never visited.
    return [member['name'] for member in ast.literal_eval(text)[:limit]]

In [None]:
# apply the function of cast
movies['cast'] = movies['cast'].apply(convert_cast)

In [None]:
# view first 2 rows of movies
movies.head(2)

Convert Crew

In [None]:
# get first crew
movies.iloc[0]['crew']

In [None]:
# create fetch_directory function
def fetch_directory(text):
    """Return a one-element list with the first crew member whose job is 'Director'.

    `text` is a stringified list of dicts parsed with `ast.literal_eval`.
    An empty list is returned when no entry has the 'Director' job.
    """
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            # only the first director is kept
            return [member['name']]
    return []

In [None]:
# apply the function of crew
movies['crew'] = movies['crew'].apply(fetch_directory)

In [None]:
# view first 2 rows of movies
movies.head(2)

Data splitting

In [None]:
# get first overview
movies.iloc[0]['overview']

In [None]:
#split data from the 'overview' column
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [None]:
# view first 2 rows of movies
movies.head(2)

In [None]:
# get first overview
movies.iloc[0]['overview']

In [None]:
# view first few rows
movies.head()

Remove Spaces 

In [None]:
# create remove_space function
def remove_space(word):
    """Return a new list in which every space has been removed from each item of `word`."""
    return [item.replace(" ", "") for item in word]

In [None]:
# call remove_space function
movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

In [None]:
# view first few rows
movies.head()

In [None]:
#Concatenate the columns & create a new column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
# view first few rows
movies.head()

In [None]:
movies.iloc[0]['tags']

In [None]:
# create new data frame as an explicit copy, so the column assignments made
# on it in later cells do not raise SettingWithCopyWarning against `movies`
new_df = movies[['movie_id', 'title', 'tags', 'vote_count']].copy()

In [None]:
# view fist few rows of new data frame
new_df.head()

In [None]:
# convert the tags column to string format
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(map(str, x)) if isinstance(x, list) else str(x))

In [None]:
# view first few rows
new_df.head()

In [None]:
# access the tags colomn in the new_df data frame
new_df.iloc[0]['tags']

In [None]:
# convert the tags colomn to the lowercase
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

In [None]:
# view first few rows
new_df.head()

In [None]:
# again access the tags colomn in the new_df data frame
new_df.iloc[0]['tags']

In [None]:
# veiw first few rows of new_df
new_df.head()

# Generating Embeddings

In [None]:
# import libraries 
import nltk
from nltk.stem import PorterStemmer

In [None]:
# create an instance of the Porter Stemmer(tool of stemming words)
ps = PorterStemmer()

In [None]:
# create stem function
def stem(text):
    """Stem every whitespace-separated token of `text` with `ps` and re-join them with single spaces."""
    return " ".join(ps.stem(token) for token in text.split())

In [None]:
# applying stemming function
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
# access the tags colomn in the new_df data frame
new_df.iloc[0]['tags']

In [None]:
# call CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')  # create instanse 

In [None]:
# convert 'tags' into a numerical vector
vector = cv.fit_transform(new_df['tags']).toarray()

In [None]:
# call the vector
vector

In [None]:
# representing the number of rows and columns in the array
vector.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# calculate the cosine similarity between vectors
similary = cosine_similarity(vector)

In [None]:
similary

In [None]:
# returns a tuple of (number of rows, number of columns)
similary.shape

In [None]:
# retrieve the index of the row
new_df[new_df['title'] == 'Spider-Man'].index[0]

In [None]:
# create pickle files
# import pickle

# pickle.dump(new_df, open('artificats/movie_list.pkl', 'wb'))
# pickle.dump(similary, open('artificats/similarity.pkl', 'wb'))

# View Recommended Movies

In [None]:
# initialize an empty list 'viewing_history'
viewing_history = []

In [None]:
# function to recommend movies based on a title
def recommend(movie):
    """Print the five movies most similar to `movie` and record them in `viewing_history`.

    Args:
        movie: Title to look up in `new_df['title']`. When several rows share
            the title, the first one is used.

    Returns:
        None. The recommended titles are printed one per line and appended to
        `viewing_history` as a dict with the keys 'query_movie' and
        'recommended_movies'. If the title is unknown, a message is printed
        and nothing is recorded.
    """
    # Work with the row *position*: `similary` and `new_df.iloc` are both
    # positional, while `.index` holds labels that need not equal positions
    # (e.g. after rows were dropped from the frame).
    positions = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if positions.size == 0:
        print(f"The movie '{movie}' is not found in the database.")
        return
    position = positions[0]

    # order all rows by similarity score, highest first (stable: ties keep row order)
    ranked = np.argsort(-np.asarray(similary[position]), kind='stable')
    # drop the query row explicitly instead of assuming it is ranked first
    top_rows = ranked[ranked != position][:5]

    recommended_movies = new_df['title'].iloc[top_rows].tolist()
    for recommended_movie in recommended_movies:
        print(recommended_movie)

    # Add recommended movies
    viewing_history.append({
        'query_movie': movie,
        'recommended_movies': recommended_movies
    })

Call the recommend function

In [None]:
recommend('Spider-Man')

In [None]:
recommend('The Dark Knight Rises')

In [None]:
recommend('Harry Potter and the Half-Blood Prince')

In [None]:
recommend('The Avengers')

In [None]:
recommend('Robin Hood')

# Get Accuracy of Recommended Movies

In [None]:
# create calculate accuracy function
def calculate_accuracy(ground_truth, recommendations):    # ground_truth and recommendations are lists of recommended movies
    """Return the fraction of `recommendations` that also appear in `ground_truth`.

    Args:
        ground_truth: Iterable of reference titles.
        recommendations: Sized iterable of recommended titles.

    Returns:
        len(distinct common titles) / len(recommendations), or 0 when
        `recommendations` is empty (instead of raising ZeroDivisionError).
    """
    common_recommendations = set(ground_truth).intersection(recommendations)
    # an empty recommendation list has nothing to score
    if len(recommendations) == 0:
        return 0
    return len(common_recommendations) / len(recommendations)

In [None]:
# create a function to compile ground truth movies from viewing history
def view_history_ground_truth_movies():
    """Flatten the 'recommended_movies' lists of all `viewing_history` records into one list."""
    return [
        title
        for entry in viewing_history
        for title in entry['recommended_movies']
    ]

# get the ground truth movies from the viewing history
ground_truth_movies = view_history_ground_truth_movies()

# Calculate accuracy using ground truth movies
# NOTE(review): ground_truth_movies is built from every record in
# viewing_history, including the last one, so each title passed below as the
# recommendations is also in the ground truth. This is therefore not a
# comparison against independent relevance data.
# viewing_history[-1] raises IndexError if recommend() has not recorded anything yet.
accuracy = calculate_accuracy(ground_truth_movies, viewing_history[-1]['recommended_movies'])  # Pass the last set of recommended movies
print(f"Accuracy: {accuracy}")

Also get Precision, Recall and F1_Score.

In [None]:
# function to calculate precision, recall, and F1 score
def calculate_metrics(ground_truth, recommendations):
    """Return (precision, recall, f1_score) of `recommendations` against `ground_truth`.

    Both inputs are treated as sets of items, so a title that is repeated in
    either iterable is counted once.

    Args:
        ground_truth: Iterable of reference (relevant) titles.
        recommendations: Iterable of recommended titles.

    Returns:
        A 3-tuple of floats. Any metric whose denominator is zero is 0.0.
    """
    # Deduplicate first so every count below is over distinct items; mixing a
    # set intersection with raw list lengths lets repeated titles inflate the
    # false positives / false negatives.
    relevant = set(ground_truth)
    suggested = set(recommendations)

    true_positives = len(relevant & suggested)
    false_positives = len(suggested) - true_positives
    false_negatives = len(relevant) - true_positives

    # calculate precision
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0

    # calculate recall
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0

    # calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1_score

# calculate precision, recall, and F1 score
# NOTE(review): ground_truth_movies is compiled from viewing_history itself,
# and the recommendations passed here are its last record, so every
# recommended title is already in the ground truth. These numbers are not
# measured against independent relevance data.
precision, recall, f1_score = calculate_metrics(ground_truth_movies, viewing_history[-1]['recommended_movies'])

# print the results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


# View History of the recommended movies

In [None]:
# create function to view history
def view_history():
    """Print each `viewing_history` record: the query movie, then its recommendations as a bulleted list."""
    for entry in viewing_history:
        header = f"Query Movie: {entry['query_movie']}"
        print(header, "Recommended Movies:", sep="\n")
        for title in entry['recommended_movies']:
            print(f" - {title}")
        # blank separator between records
        print("\n")


In [None]:
# create function to view history
def view_history():
    """Print each `viewing_history` record: the query movie followed by its recommended titles."""
    # NOTE: a function with this same name is also defined in the preceding
    # cell; running this cell rebinds the name to this definition.
    for record in viewing_history:
        print(f"Query Movie: {record['query_movie']}")
        print("Recommended Movies:")
        for movie in record['recommended_movies']:
            print(f" - {movie}")
        # blank separator between records
        print("\n")

In [None]:
# call the viewing history function
view_history()

# Recommended Movies based on preferences

In [None]:
def get_user_preferences():
    """Prompt for comma-separated movie preferences and return them as a list.

    Returns:
        The entries typed by the user with surrounding whitespace removed.
        Blank entries (empty input, doubled or trailing commas) are dropped,
        so the result may be an empty list.
    """
    # user input of preferred movies
    print("Please enter your movie preferences (comma-separated):")
    user_input = input()  # get user input

    # An empty string matches every tag in a substring or regex search, so
    # blank entries must not be passed on as preferences.
    stripped = (preference.strip() for preference in user_input.split(','))
    return [preference for preference in stripped if preference]

In [None]:
def recommend_by_preferences(preferences, similarity):  # Pass similarity as an argument
    """Recommend movies similar to those whose tags contain any of `preferences`.

    Args:
        preferences: Iterable of free-text terms. Each one is stripped,
            lower-cased and stemmed with `ps`; blank terms are ignored.
        similarity: Square similarity matrix whose row/column order matches
            the row order of `new_df`.

    Returns:
        A list of unique titles, in first-seen order: for every row of
        `new_df` whose 'tags' contain at least one term (literal substring
        match), its five most similar other rows. Empty when no usable term
        is given or nothing matches.
    """
    # Convert to lowercase and stem, skipping blank entries (an empty term
    # would match every row)
    terms = []
    for preference in preferences:
        cleaned = preference.strip().lower()
        if cleaned:
            terms.append(ps.stem(cleaned))
    if not terms:
        return []

    # Find movies that match the user preferences. Terms are matched literally
    # (regex=False) so characters such as '+', '(' or '|' typed by the user
    # are not interpreted as a regular expression.
    tags = new_df['tags']
    matches = np.zeros(len(new_df), dtype=bool)
    for term in terms:
        matches |= tags.str.contains(term, regex=False, na=False).to_numpy()

    # Get recommendations based on the matching movies. Row positions are used
    # throughout because `similarity` and `.iloc` are positional, whereas
    # index labels need not equal positions.
    scores = np.asarray(similarity)
    titles = new_df['title']
    recommended_movies = []
    for position in np.flatnonzero(matches):
        # highest score first; stable so ties keep row order
        ranked = np.argsort(-scores[position], kind='stable')
        # exclude the matching movie itself, then keep the top five
        top_rows = ranked[ranked != position][:5]
        recommended_movies.extend(titles.iloc[top_rows].tolist())

    # Remove duplicates while keeping a deterministic (first-seen) order
    return list(dict.fromkeys(recommended_movies))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vector)

# Ask the user for their movie preferences
user_preferences = get_user_preferences()

# Get recommendations based on user preferences
recommended_movies = recommend_by_preferences(user_preferences, similarity)

print()
print()

# Print the recommended movies based on preferences
print("Recommended Movies based on preferences:")
for movie in recommended_movies:
    print(movie)

# Get Accuracy of Recommended Movies based on preferences.

In [None]:
# function to calculate accuracy of based on preferences
def calculate_paccuracy(ground_truth, recommendations):
    """Return the share of `recommendations` also present in `ground_truth`, or 0 if there are none."""
    shared = set(ground_truth).intersection(recommendations)  # titles found in both
    total = len(recommendations)
    if total > 0:
        return len(shared) / total
    return 0

# Ask the user for their movie preferences
user_preferences = get_user_preferences()

# Get recommendations based on user preferences
recommended_movies = recommend_by_preferences(user_preferences, similarity)

# Calculate accuracy
# NOTE(review): the same list is passed as both the ground truth and the
# recommendations, so every recommended title is trivially "correct". The
# value is not measured against independent relevance data.
paccuracy = calculate_paccuracy(recommended_movies, recommended_movies)

# Print the accuracy
print(f"Accuracy: {paccuracy}")


Also get Precision, Recall, F1_Score.

In [None]:
# function to calculate precision, recall, and F1 score on preferences
def calculate_metrics(ground_truth, recommendations):
    """Return (precision, recall, f1_score) of `recommendations` against `ground_truth`.

    Both inputs are treated as sets of items, so a title that is repeated in
    either iterable is counted once.

    Args:
        ground_truth: Iterable of reference (relevant) titles.
        recommendations: Iterable of recommended titles.

    Returns:
        A 3-tuple of floats. Any metric whose denominator is zero is 0.0.
    """
    # Deduplicate first so every count below is over distinct items; mixing a
    # set intersection with raw list lengths lets repeated titles inflate the
    # false positives / false negatives.
    relevant = set(ground_truth)
    suggested = set(recommendations)

    true_positives = len(relevant & suggested)
    false_positives = len(suggested) - true_positives
    false_negatives = len(relevant) - true_positives

    # calculate precision
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0

    # calculate recall
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0

    # calculate f1_score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1_score

# Ask the user for their movie preferences
user_preferences = get_user_preferences()

# Get recommendations based on user preferences
recommended_movies = recommend_by_preferences(user_preferences, similarity)

# calculate precision, recall, and F1 score
# NOTE(review): the same list is passed as both the ground truth and the
# recommendations, so the metrics compare the list with itself rather than
# with independent relevance data.
precision, recall, f1_score = calculate_metrics(recommended_movies, recommended_movies)

# Print the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


# View history based on preferences

Save to the History of Recommended Movies based on preferences.

In [None]:
phistory = []  # Initialize an empty list to store the history of preference based recommended movies

In [None]:
def save_to_phistory(recommended_movies, phistory):
    """Add every item of `recommended_movies` to the end of `phistory`, in place."""
    add_all = phistory.extend
    add_all(recommended_movies)

# Ask the user for their movie preferences
user_preferences = get_user_preferences()

# Get recommendations based on user preferences
recommended_movies = recommend_by_preferences(user_preferences, similarity)

print()
print()

# Print the recommended movies based on preferences
print("Recommended Movies based on preferences:")
for movie in recommended_movies:
    print(movie)

# Save recommended movies to preferences history
save_to_phistory(recommended_movies, phistory)


# Recommended Movies based on credits

In [None]:
def recommend_by_credits(movie, similarity):
    """Return the five titles most similar to `movie` according to `similarity`.

    Args:
        movie: Title to look up in `new_df['title']`. When several rows share
            the title, the first one is used.
        similarity: Square similarity matrix whose row/column order matches
            the row order of `new_df`.

    Returns:
        A list of up to five titles, most similar first. If `movie` is not in
        `new_df`, a message is printed and an empty list is returned.
    """
    # Look up the row *position*: `similarity` and `.iloc` are positional,
    # while index labels need not equal positions (e.g. after rows were
    # dropped from the frame).
    positions = np.flatnonzero((new_df['title'] == movie).to_numpy())

    # Check if the movie is in the DataFrame
    if positions.size == 0:
        print(f"The movie '{movie}' is not found in the database.")
        return []
    position = positions[0]

    # order all rows by similarity score, highest first (stable: ties keep row order)
    ranked = np.argsort(-np.asarray(similarity[position]), kind='stable')
    # drop the query row explicitly instead of assuming it is ranked first
    top_rows = ranked[ranked != position][:5]

    return new_df['title'].iloc[top_rows].tolist()  # return recommendation movies based on credits

In [None]:
# user input for enter a movie title
movie_to_get_credit_recommendations = input("Please enter a movie title to get credit-based recommendations: ")

# Get recommendations based on movie credits
recommended_movies_by_credits = recommend_by_credits(movie_to_get_credit_recommendations, similarity)

if recommended_movies_by_credits:
    # Print the recommended movies based on credits
    print(f"\nRecommended Movies based on credits for {movie_to_get_credit_recommendations}:")
    for movie in recommended_movies_by_credits:
        print(movie)

# View history based on credits

Save to the History of Recommended Movies based on credits

In [None]:
# initialize an empty list to store the history of credits based recommended movies 
credit_history = []

In [None]:
def creditHistory(recommended_movies):
    """Add every item of `recommended_movies` to the end of the `credit_history` list."""
    add_all = credit_history.extend
    add_all(recommended_movies)

# user input for enter a movie title
movie_to_get_credit_recommendations = input("Please enter a movie title to get credit-based recommendations: ")

# Get recommendations based on movie credits
recommended_movies_by_credits = recommend_by_credits(movie_to_get_credit_recommendations, similarity)

if recommended_movies_by_credits:
    # Print the recommended movies based on credits
    print(f"\nRecommended Movies based on credits for {movie_to_get_credit_recommendations}:")
    for movie in recommended_movies_by_credits:
        print(movie)
    
    # Save recommended movies to credit history
    creditHistory(recommended_movies_by_credits)
    

In [None]:
# display the credit_history list
credit_history

# Get Accuracy of Recommended Movies based on credits.

In [None]:
# prompt the user to enter a movie title
movie_to_get_credit_recommendations = input("Please enter a movie title to get credit-based recommendations: ")

# Get recommendations based on movie credits
# (recommend_by_credits returns an empty list when the title is not found)
recommended_movies_by_credits = recommend_by_credits(movie_to_get_credit_recommendations, similarity)

# Calculate accuracy
# NOTE(review): the same list is passed as both the ground truth and the
# recommendations, so the value compares the list with itself rather than
# with independent relevance data.
accuracy = calculate_accuracy(recommended_movies_by_credits, recommended_movies_by_credits)

# Print the accuracy
print(f"Accuracy: {accuracy}")


In [None]:
# function to calculate precision, recall, and F1 score
def calculate_metrics(ground_truth, recommendations):
    """Return (precision, recall, f1_score) of `recommendations` against `ground_truth`.

    Both inputs are treated as sets of items, so a title that is repeated in
    either iterable is counted once.

    Args:
        ground_truth: Iterable of reference (relevant) titles.
        recommendations: Iterable of recommended titles.

    Returns:
        A 3-tuple of floats. Any metric whose denominator is zero is 0.0.
    """
    # Deduplicate first so every count below is over distinct items; mixing a
    # set intersection with raw list lengths lets repeated titles inflate the
    # false positives / false negatives.
    relevant = set(ground_truth)
    suggested = set(recommendations)

    true_positives = len(relevant & suggested)
    false_positives = len(suggested) - true_positives
    false_negatives = len(relevant) - true_positives

    # calculate precision
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0

    # calculate recall
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0

    # calculate F1_score
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1_score


# Ask the user for a movie title
movie_to_get_credit_recommendations = input("Please enter a movie title to get credit-based recommendations: ")

# Get recommendations based on movie credits
# (recommend_by_credits returns an empty list when the title is not found)
recommended_movies_by_credits = recommend_by_credits(movie_to_get_credit_recommendations, similarity)

# Calculate metrics
# NOTE(review): the same list is passed as both the ground truth and the
# recommendations, so the metrics compare the list with itself rather than
# with independent relevance data.
precision, recall, f1_score = calculate_metrics(recommended_movies_by_credits, recommended_movies_by_credits)

# Print the metrics
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")


# Recommended Movies based on Vote_Count

In [None]:
# function of recommended movies based on vote count
def recommend_movies_by_vote_count(movie_title, movies, top_n=5):
    """Print the `top_n` most-voted movies, other than `movie_title` itself.

    Args:
        movie_title: Title the user asked about; it must exist in
            `movies['title']`, otherwise a "not found" message is printed.
        movies: DataFrame with at least the columns 'title' and 'vote_count'.
        top_n: Number of titles to print. Defaults to 5.

    Returns:
        None; the result is printed.
    """
    # Filter movies based on the entered title
    selected_movie = movies[movies['title'] == movie_title]
    if selected_movie.empty:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return

    # The queried movie must not be recommended to itself
    candidates = movies[movies['title'] != movie_title]

    # Sort by descending vote_count and keep the top entries
    recommended_by_vote_count = candidates.sort_values(by='vote_count', ascending=False).head(top_n)

    print()

    # Print the recommended movies based on vote_count
    print(f"Recommended Movies based on vote_count for {movie_title}:")
    # a distinct loop name keeps the `movie_title` parameter from being overwritten
    for recommended_title in recommended_by_vote_count['title']:
        print(recommended_title)

movie_title_input = input("Enter a movie title: ")
recommend_movies_by_vote_count(movie_title_input, movies)
