In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer()

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df["overview"])

# Calculate the cosine similarity between all movies
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index, :]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


MemoryError: Unable to allocate 7.04 GiB for an array with shape (1891078041,) and data type int32

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Drop rows with missing values in the 'overview' column
movies_df.dropna(subset=['overview'], inplace=True)

# Subset the dataset to include only the necessary columns
movies_subset = movies_df[['original_title', 'overview']]

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the 'overview' text into TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_subset['overview'])

# Calculate the cosine similarity between all movies
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title, cosine_similarities, movies_subset):
    # Get the index of the movie in the dataset
    movie_index = movies_subset[movies_subset['original_title'] == movie_title].index[0]
    
    # Get the similarity scores for the movie
    similarity_scores = cosine_similarities[movie_index]
    
    # Get the indices of the most similar movies
    similar_movie_indices = similarity_scores.argsort()[::-1][1:]
    
    # Get the titles of the similar movies
    similar_movies = movies_subset.loc[similar_movie_indices, 'original_title']
    
    return similar_movies

# Example usage
movie_title = "Toy Story"
recommendations = get_recommendations(movie_title, cosine_similarities, movies_subset)
print(recommendations)


MemoryError: Unable to allocate 4.07 GiB for an array with shape (546860044,) and data type float64

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Load the dataset
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str})

# Drop rows with missing values in the 'overview' column
movies_df.dropna(subset=['overview'], inplace=True)

# Subset the dataset to include only the necessary columns
movies_subset = movies_df[['original_title', 'overview']]

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the 'overview' text into TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_subset['overview'])

# Reduce the dimensionality of the TF-IDF matrix using Truncated SVD
svd = TruncatedSVD(n_components=100)
svd_matrix = svd.fit_transform(tfidf_matrix)

# Calculate the cosine similarity between all movies
cosine_similarities = cosine_similarity(svd_matrix, svd_matrix)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title, cosine_similarities, movies_subset):
    # Get the index of the movie in the dataset
    movie_index = movies_subset[movies_subset['original_title'] == movie_title].index[0]
    
    # Get the similarity scores for the movie
    similarity_scores = cosine_similarities[movie_index]
    
    # Get the indices of the most similar movies
    similar_movie_indices = similarity_scores.argsort()[::-1][1:]
    
    # Get the titles of the similar movies
    similar_movies = movies_subset.loc[similar_movie_indices, 'original_title']
    
    return similar_movies

# Example usage
movie_title = "Toy Story"
recommendations = get_recommendations(movie_title, cosine_similarities, movies_subset)
print(recommendations)


  movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str})


MemoryError: Unable to allocate 14.8 GiB for an array with shape (44512, 44512) and data type float64

In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Fit the TfidfVectorizer on the movie overviews
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df["overview"])

# Calculate the cosine similarity between all movies
cosine_similarities = cosine_similarity(tfidf_matrix)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


MemoryError: Unable to allocate 2.04 GiB for an array with shape (546860044,) and data type int32

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty cosine similarity matrix
num_movies = len(movies_df)
cosine_similarities = None

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, num_movies, batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
    
    if cosine_similarities is None:
        cosine_similarities = cosine_similarity(tfidf_matrix)
    else:
        cosine_similarities = vstack((cosine_similarities, cosine_similarity(tfidf_matrix)))

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: blocks must be 2-D

In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty cosine similarity matrix
num_movies = len(movies_df)
cosine_similarities = None

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, num_movies, batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
    
    if cosine_similarities is None:
        cosine_similarities = cosine_similarity(tfidf_matrix)
    else:
        cosine_similarities = vstack((cosine_similarities, cosine_similarity(tfidf_matrix)))

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: blocks must be 2-D

In [16]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
    cosine_similarities = cosine_similarity(tfidf_matrix)
    cosine_similarities_list.append(cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = np.concatenate(cosine_similarities_list, axis=0)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1000 and the array at index 45 has size 466

In [17]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
    cosine_similarities = cosine_similarity(tfidf_matrix)
    cosine_similarities_list.append(cosine_similarities)

# Handle the last batch separately
last_batch_movies = movies_df["overview"][i+batch_size:]
last_batch_tfidf_matrix = tfidf_vectorizer.fit_transform(last_batch_movies)
last_batch_cosine_similarities = cosine_similarity(last_batch_tfidf_matrix)
cosine_similarities_list.append(last_batch_cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = np.concatenate(cosine_similarities_list, axis=0)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: empty vocabulary; perhaps the documents only contain stop words

In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        cosine_similarities_list.append(cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = np.concatenate(cosine_similarities_list, axis=0)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index, :]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1000 and the array at index 45 has size 466

In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        cosine_similarities_list.append(cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = np.concatenate(cosine_similarities_list, axis=0)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 1000 and the array at index 45 has size 466

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        cosine_similarities_list.append(cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = np.concatenate(cosine_similarities_list, axis=1)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index, :]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1000 and the array at index 45 has size 466

In [21]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        cosine_similarities_list.append(cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = vstack(cosine_similarities_list)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index, :]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: blocks[:,0] has incompatible row dimensions. Got blocks[45,0].shape[1] == 466, expected 1000.

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        cosine_similarities_list.append(cosine_similarities)

# Concatenate the cosine similarity matrices
cosine_similarities = np.concatenate(cosine_similarities_list, axis=1)

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index, :]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 1000 and the array at index 45 has size 466

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        
        # Pad the cosine similarity matrix with zeros to match the maximum number of columns
        if cosine_similarities_list:
            max_columns = max(cosine_similarities_list, key=lambda x: x.shape[1]).shape[1]
        else:
            max_columns = 0
        
        if cosine_similarities.shape[1] < max_columns:
            padding = np.zeros((cosine_similarities.shape[0], max_columns - cosine_similarities.shape[1]))
            cosine_similarities = np.hstack((cosine_similarities, padding))
        
        cosine_similarities_list.append(cosine_similarities)

# Stack the cosine similarity matrices vertically along the rows
cosine_similarities = vstack(cosine_similarities_list, format='csr')

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Get the index of the movie in the dataset
    movie_index = movies_df[movies_df["original_title"] == movie_title].index[0]
    
    # Get the cosine similarities for the movie
    movie_similarities = cosine_similarities[movie_index, :]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(movie_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

# Load the movie dataset, specifying the data type for the "overview" column and setting low_memory to False
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str}, low_memory=False)

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        
        # Pad the cosine similarity matrix with zeros to match the maximum number of columns
        if cosine_similarities_list:
            max_columns = max(cosine_similarities_list, key=lambda x: x.shape[1]).shape[1]
        else:
            max_columns = 0
        
        if cosine_similarities.shape[1] < max_columns:
            padding = np.zeros((cosine_similarities.shape[0], max_columns - cosine_similarities.shape[1]))
            cosine_similarities = np.hstack((cosine_similarities, padding))
        
        cosine_similarities_list.append(cosine_similarities)

# Stack the cosine similarity matrices vertically along the rows
cosine_similarities = vstack(cosine_similarities_list, format='csr')

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Preprocess the movie title
    processed_title = tfidf_vectorizer.transform([movie_title])
    
    # Get the cosine similarities for the movie title
    title_similarities = cosine_similarity(processed_title, tfidf_matrix)[0]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(title_similarities)[::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Toy Story"
recommendations = get_recommendations("Toy Story")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])


Rent-a-Kid
Circle of Friends
Heaven & Earth
Nadja
The Amazing Panda Adventure
The Browning Version
Love Affair
Guilty as Sin
صمت القصور
Babe


In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import vstack

# Load the movie dataset, specifying the data type for the "overview" and "genres" columns
movies_df = pd.read_csv("movies_metadata.csv", dtype={"overview": str, "genres": str}, low_memory=False)

# Load the keywords dataset
keywords_df = pd.read_csv("keywords.csv", dtype={"id": str, "keywords": str}, low_memory=False)

# Merge the movie dataset with the keywords dataset based on movie ID
movies_df = pd.merge(movies_df, keywords_df, on="id", how="left")

# Create a TfidfVectorizer object with stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the movie descriptions
movies_df["overview"] = movies_df["overview"].fillna("")  # Handling missing values

# Extract genre information
movies_df["genres"] = movies_df["genres"].fillna("[]")  # Handling missing values
movies_df["genres"] = movies_df["genres"].apply(lambda x: [genre["name"] for genre in eval(x)])

# Extract keywords information
movies_df["keywords"] = movies_df["keywords"].fillna("[]")  # Handling missing values
movies_df["keywords"] = movies_df["keywords"].apply(lambda x: [keyword["name"] for keyword in eval(x)])

# Remove movies with missing or invalid genre information
movies_df = movies_df[movies_df["genres"].apply(lambda x: len(x) > 0)]

# Initialize an empty list to store cosine similarity matrices
cosine_similarities_list = []

# Batch size for memory-efficient computation
batch_size = 1000

# Calculate the cosine similarity between all movies in batches
for i in range(0, len(movies_df), batch_size):
    batch_movies = movies_df["overview"][i:i+batch_size]
    
    # Check if the batch has non-empty documents
    if not batch_movies.empty:
        tfidf_matrix = tfidf_vectorizer.fit_transform(batch_movies)
        cosine_similarities = cosine_similarity(tfidf_matrix)
        
        # Pad the cosine similarity matrix with zeros to match the maximum number of columns
        if cosine_similarities_list:
            max_columns = max(cosine_similarities_list, key=lambda x: x.shape[1]).shape[1]
        else:
            max_columns = 0
        
        if cosine_similarities.shape[1] < max_columns:
            padding = np.zeros((cosine_similarities.shape[0], max_columns - cosine_similarities.shape[1]))
            cosine_similarities = np.hstack((cosine_similarities, padding))
        
        cosine_similarities_list.append(cosine_similarities)

# Stack the cosine similarity matrices vertically along the rows
cosine_similarities = vstack(cosine_similarities_list, format='csr')

# Get the movie recommendations for a given movie
def get_recommendations(movie_title):
    """Returns a list of movie recommendations for a given movie title."""
    # Preprocess the movie title
    processed_title = tfidf_vectorizer.transform([movie_title])
    
    # Get the cosine similarities for the movie title
    title_similarities = cosine_similarity(processed_title, tfidf_matrix)[0]
    
    # Sort the movies by their cosine similarity
    sorted_movies = np.argsort(title_similarities)[::-1]
    
    # Filter movies by genre similarity (consider top 50 similar movies)
    top_similar_movies = sorted_movies[:50]
    movie_genre = movies_df.loc[movies_df["original_title"] == movie_title, "genres"].values[0]
    
    genre_similarities = []
    for movie in top_similar_movies:
        if isinstance(movies_df.loc[movie, "genres"], list):
            movie_genre_similarity = len(set(movie_genre) & set(movies_df.loc[movie, "genres"])) / max(1, len(set(movie_genre)))
            genre_similarities.append(movie_genre_similarity)
    
    genre_similarities = np.array(genre_similarities)
    
    # Sort the movies by their genre similarity (descending order)
    genre_sorted_movies = top_similar_movies[np.argsort(genre_similarities)][::-1]
    
    # Filter movies by keyword similarity (consider top 20 similar movies)
    top_similar_movies = genre_sorted_movies[:20]
    movie_keywords = movies_df.loc[movies_df["original_title"] == movie_title, "keywords"].values[0]
    
    keyword_similarities = []
    for movie in top_similar_movies:
        if isinstance(movies_df.loc[movie, "keywords"], list) and movie_keywords:
            movie_keyword_similarity = len(set(movie_keywords) & set(movies_df.loc[movie, "keywords"])) / max(1, len(set(movie_keywords)))
            keyword_similarities.append(movie_keyword_similarity)
    
    keyword_similarities = np.array(keyword_similarities)
    
    # Sort the movies by their keyword similarity (descending order)
    keyword_sorted_movies = top_similar_movies[np.argsort(keyword_similarities)][::-1]
    
    # Get the top 10 movie recommendations (excluding the movie itself)
    recommendations = keyword_sorted_movies[1:11]
    
    return recommendations

# Example usage:
# Get the movie recommendations for "Lucy"
recommendations = get_recommendations("Lucy")

# Print the movie recommendations
for movie in recommendations:
    print(movies_df["original_title"][movie])
