Content-based movie recommendations

In [6]:
import os
import pandas as pd
import numpy as np

# All data paths are resolved against the current working directory.
script_dir = os.getcwd()
print(f"Current working directory: {script_dir}")

# Read the cleaned, tab-separated movie table.
movies_file = os.path.join(script_dir, "Cleaned Datasets", "Final_Movie_Data.tsv")
df_movie = pd.read_csv(movies_file, sep='\t')

# Preview the first five rows.
display(df_movie.head(n=5))

Current working directory: c:\Users\willi\OneDrive\Documents\GitHub\Movie-Recommendations


Unnamed: 0,tconst,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,tags
0,tt0004972,0,1915,195,"Drama,War",6.1,nm0000428,"nm0228746,nm0000428,nm0940488,nm0934306,nm1628...",
1,tt0006333,0,1916,105,"Action,Adventure,Sci-Fi",6.1,nm0665737,"nm0894523,nm0665737",
2,tt0006864,0,1916,163,"Drama,History",7.7,nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640...",
3,tt0010040,0,1919,85,"Comedy,Drama",6.6,nm0624714,"nm0916914,nm0426515",
4,tt0010323,0,1920,67,"Horror,Mystery,Thriller",8.0,nm0927468,"nm0562346,nm0417917",


In [None]:
# Overview of the loaded table: dimensions, column names and a short preview.
print("Dataset shape:", df_movie.shape)
print("\nColumns:", list(df_movie.columns))
print("\nSample data:")
display(df_movie.head())

# Number of null entries in each column.
print("\nMissing values per column:")
print(df_movie.isna().sum())

Dataset shape: (8902, 9)

Columns: ['tconst', 'isAdult', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'directors', 'writers', 'tags']

Sample data:


Unnamed: 0,tconst,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,tags
0,tt0004972,0,1915,195,"Drama,War",6.1,nm0000428,"nm0228746,nm0000428,nm0940488,nm0934306,nm1628...",
1,tt0006333,0,1916,105,"Action,Adventure,Sci-Fi",6.1,nm0665737,"nm0894523,nm0665737",
2,tt0006864,0,1916,163,"Drama,History",7.7,nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640...",
3,tt0010040,0,1919,85,"Comedy,Drama",6.6,nm0624714,"nm0916914,nm0426515",
4,tt0010323,0,1920,67,"Horror,Mystery,Thriller",8.0,nm0927468,"nm0562346,nm0417917",



Missing values per column:
tconst               0
isAdult              0
startYear            0
runtimeMinutes       0
genres               0
averageRating        0
directors            0
writers            115
tags              7378
dtype: int64


In [None]:
# Build the single text "document" that represents one movie's metadata
def create_content_features(row):
    """Combine one movie's metadata into a whitespace-separated string.

    Parameters:
    - row: mapping-like record (e.g. one DataFrame row) providing the keys
      'directors', 'writers', 'genres', 'startYear', 'runtimeMinutes' and
      'averageRating'

    Returns:
    - str: "<genres> <genres> <directors> <writers> <year> <runtime> <rating>".
      Genres appear twice so they carry more weight. A missing value
      contributes an empty string, so consecutive spaces can occur.
    """
    def as_text(field):
        # Stringify a field, mapping missing values (NaN/None) to ''.
        value = row[field]
        return '' if pd.isna(value) else str(value)

    directors = as_text('directors')
    writers = as_text('writers')
    genres = as_text('genres')
    year = as_text('startYear')

    # Bucket the runtime: over 120 min is long, over 90 min is medium, else short.
    minutes = row['runtimeMinutes']
    if pd.isna(minutes):
        runtime = ''
    elif minutes > 120:
        runtime = 'long'
    elif minutes > 90:
        runtime = 'medium'
    else:
        runtime = 'short'

    # Bucket the rating: 7.5+ is highly rated, 6.5+ is moderately rated, else average.
    score = row['averageRating']
    if pd.isna(score):
        rating = ''
    elif score >= 7.5:
        rating = 'highly_rated'
    elif score >= 6.5:
        rating = 'moderately_rated'
    else:
        rating = 'average_rated'

    return ' '.join((genres, genres, directors, writers, year, runtime, rating))

# Build the combined feature string for every movie (row by row) and store it as a new column.
df_movie['content_features'] = df_movie.apply(create_content_features, axis='columns')

# Preview the movie IDs next to their generated feature strings.
display(df_movie.loc[:, ['tconst', 'content_features']].head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize each movie's combined feature string with TF-IDF (English stop words removed).
print("Creating TF-IDF matrix...")
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(
    df_movie['content_features'].fillna('')  # a missing document is treated as empty text
)

# Report the matrix dimensions and the size of the learned vocabulary.
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {len(tfidf.get_feature_names_out())}")

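Calculate the cosine similarity between every pair of movies.
Note: For large datasets this is memory-intensive, because the similarity matrix has one entry per pair of movies.
If memory becomes a problem, use the on-demand approach implemented in a later cell (`find_similar_movies`), which compares one movie against the rest without building the full matrix.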

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cap on how many movies get a full pairwise similarity matrix.
# The result has sample_size x sample_size entries, so lower the cap on low-memory machines.
sample_size = min(10000, len(df_movie))

if len(df_movie) > sample_size:
    # Too many movies for a full matrix: draw a random sample without replacement.
    # A fixed seed keeps the sample, and every recommendation derived from it,
    # identical across kernel restarts.
    print(f"Using a sample of {sample_size} movies for similarity calculation")
    sample_rng = np.random.default_rng(42)
    sample_indices = sample_rng.choice(len(df_movie), size=sample_size, replace=False)
    tfidf_sample = tfidf_matrix[sample_indices]
    # Row labels of the sampled movies, in the same order as the matrix rows.
    movie_indices = df_movie.iloc[sample_indices].index
    cosine_sim = cosine_similarity(tfidf_sample, tfidf_sample)
else:
    print("Calculating similarity for all movies...")
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
    movie_indices = df_movie.index

print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

Recommends movies similar to the given movie_id.

Parameters:
- movie_id (str): The ID of the movie to find recommendations for
- cosine_sim_matrix (numpy.ndarray): Cosine similarity matrix
- df (pandas.DataFrame): DataFrame containing movie data
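- indices (pandas.Index or numpy.ndarray, optional): Row labels of `df` for the movies in cosine_sim_matrix, in matrix row order; pass it only when the matrix was built from a sample of `df`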
- top_n (int): Number of recommendations to return
    
Returns:
- pandas.DataFrame: Top N recommended movies


In [None]:
# Function to get movie recommendations
def get_recommendations(movie_id, cosine_sim_matrix, df, indices=None, top_n=10):
    """Return the movies most similar to ``movie_id`` from a precomputed matrix.

    Parameters:
    - movie_id (str): 'tconst' value of the movie to find recommendations for
    - cosine_sim_matrix (numpy.ndarray): square similarity matrix; row i holds
      the similarities of movie i to every movie covered by the matrix
    - df (pandas.DataFrame): movie data containing at least a 'tconst' column
    - indices (array-like, optional): row labels of ``df`` for the movies in
      the matrix, in matrix row order. Leave as None when the matrix covers
      every row of ``df`` in order.
    - top_n (int): maximum number of recommendations to return

    Returns:
    - pandas.DataFrame: up to ``top_n`` rows ordered by decreasing similarity
      (ties keep matrix order), never including the query movie itself.
      An empty DataFrame is returned, after printing a message, when the
      movie is not covered by the matrix.
    """
    if indices is not None:
        # Matrix row i describes the movie whose df label is indices[i],
        # so select by label to line the rows up with the matrix.
        candidates = df.loc[pd.Index(indices)]
        not_found_message = f"Movie {movie_id} not in the sample. Try another movie ID."
    else:
        # The matrix covers every row of df, in order.
        candidates = df
        not_found_message = f"Movie {movie_id} not found in the dataset."

    # Locate the movie by row position so non-default df indexes work too.
    matches = np.flatnonzero((candidates['tconst'] == movie_id).to_numpy())
    if matches.size == 0:
        print(not_found_message)
        return pd.DataFrame()
    idx_pos = int(matches[0])

    # Rank every movie by similarity to the target, highest first.
    sim_scores = np.asarray(cosine_sim_matrix[idx_pos]).ravel()
    ranked = np.argsort(-sim_scores, kind='stable')

    # Drop the query movie by position. It is not guaranteed to sort first,
    # because another movie with identical features ties at similarity 1.0.
    ranked = ranked[ranked != idx_pos][:max(top_n, 0)]

    # Return the top N movies, limited to the columns that exist in df.
    columns_to_return = ['tconst', 'genres', 'startYear', 'averageRating', 'runtimeMinutes', 'directors']
    columns_to_return = [col for col in columns_to_return if col in df.columns]

    return candidates.iloc[ranked][columns_to_return]

In [None]:
# Demo: recommend movies for the first title in the table.
example_movie_id = df_movie['tconst'].iloc[0]
print(f"Finding recommendations for movie: {example_movie_id}")

# Summarise the query movie before listing its neighbours.
movie_info = df_movie.loc[df_movie['tconst'] == example_movie_id].iloc[0]
print("\nSelected movie details:")
print(f"Title genres: {movie_info.get('genres', 'N/A')}")
print(f"Year: {movie_info.get('startYear', 'N/A')}")
print(f"Rating: {movie_info.get('averageRating', 'N/A')}")

# The sampled row labels are only needed when the similarity matrix was built
# from a sample; otherwise the matrix covers the whole table.
recommendations = get_recommendations(
    example_movie_id,
    cosine_sim,
    df_movie,
    movie_indices if len(df_movie) > sample_size else None,
)

print("\nTop recommended movies:")
display(recommendations)

In [None]:
# Create a function to find similar movies for any user input
def find_similar_movies(movie_id, df=df_movie, tfidf_matrix=tfidf_matrix, top_n=10):
    """
    Find the movies most similar to the given movie ID, computing similarity on demand.

    Only one row of similarities is computed (the selected movie against all
    others), so no full pairwise matrix is needed.

    Parameters:
    - movie_id (str): 'tconst' value of the movie to find recommendations for
    - df (pandas.DataFrame): movie data; row i must correspond to row i of tfidf_matrix
    - tfidf_matrix: TF-IDF matrix with one row per row of df
    - top_n (int): number of recommendations to return

    Returns:
    - pandas.DataFrame: up to top_n rows ordered by decreasing similarity
      (ties keep row order), never including the selected movie itself.
      An empty DataFrame is returned, after printing a message, when the
      movie is not found.
    """
    # Locate the movie by row position so it lines up with tfidf_matrix rows
    # even when df does not carry a default 0..n-1 index.
    matches = np.flatnonzero((df['tconst'] == movie_id).to_numpy())
    if matches.size == 0:
        print(f"Movie {movie_id} not found in the dataset.")
        return pd.DataFrame()
    idx = int(matches[0])

    # Get the TF-IDF vector for the selected movie (kept 2-D for cosine_similarity)
    movie_vector = tfidf_matrix[idx:idx+1]

    # Calculate similarity with all movies
    sim_scores = cosine_similarity(movie_vector, tfidf_matrix).flatten()

    # Rank by decreasing similarity and drop the selected movie by position
    # BEFORE truncating. This returns exactly top_n rows and never the movie
    # itself, even when another movie ties with it at similarity 1.0.
    ranked = np.argsort(-sim_scores, kind='stable')
    sim_indices = ranked[ranked != idx][:max(top_n, 0)]

    # Display information about the selected movie
    movie_info = df.iloc[idx]
    print(f"\nSelected movie details:")
    print(f"Genres: {movie_info.get('genres', 'N/A')}")
    print(f"Year: {movie_info.get('startYear', 'N/A')}")
    print(f"Rating: {movie_info.get('averageRating', 'N/A')}")

    # Return the top N movies, limited to the columns that exist in df
    columns_to_return = ['tconst', 'genres', 'startYear', 'averageRating', 'runtimeMinutes', 'directors']
    columns_to_return = [col for col in columns_to_return if col in df.columns]

    return df.iloc[sim_indices][columns_to_return]

# Demo of the on-demand lookup; swap in any 'tconst' value from the dataset.
another_movie_id = 'tt0010323'
print(f"Finding recommendations for movie: {another_movie_id}")

# Similarities are computed for this one movie only.
recommendations = find_similar_movies(movie_id=another_movie_id)

print("\nTop recommended movies:")
display(recommendations)

In [None]:
# Analyze what features are most important for a specific movie
def analyze_movie_features(movie_id, df=df_movie, tfidf=tfidf, tfidf_matrix=tfidf_matrix, top_n=10):
    """
    Print the highest-weighted TF-IDF features of a specific movie.

    Parameters:
    - movie_id (str): 'tconst' value of the movie to analyze
    - df (pandas.DataFrame): movie data; row i must correspond to row i of tfidf_matrix
    - tfidf: fitted vectorizer providing get_feature_names_out()
    - tfidf_matrix: sparse TF-IDF matrix with one row per row of df
    - top_n (int): number of features to print

    Returns:
    - None. Results are printed; a message is printed instead when the
      movie is not found.
    """
    # Locate the movie by row position so it lines up with tfidf_matrix rows
    # even when df does not carry a default 0..n-1 index.
    matches = np.flatnonzero((df['tconst'] == movie_id).to_numpy())
    if matches.size == 0:
        print(f"Movie {movie_id} not found in the dataset.")
        return
    idx = int(matches[0])

    # Get the TF-IDF vector for the selected movie as a dense 1-D array
    movie_vector = tfidf_matrix[idx].toarray()[0]

    # Get feature names (position i names column i of the matrix)
    feature_names = tfidf.get_feature_names_out()

    # Keep only features with positive weight, then order them by decreasing
    # weight; the stable sort leaves ties in vocabulary order.
    active = np.flatnonzero(movie_vector > 0)
    ranked = active[np.argsort(-movie_vector[active], kind='stable')]

    # Display movie info
    movie_info = df.iloc[idx]
    print(f"Feature analysis for movie: {movie_id}")
    print(f"Genres: {movie_info.get('genres', 'N/A')}")
    print(f"Year: {movie_info.get('startYear', 'N/A')}")
    print(f"Rating: {movie_info.get('averageRating', 'N/A')}")

    # Print the top features
    print(f"\nTop {top_n} most important features:")
    for feature_pos in ranked[:top_n]:
        print(f"  {feature_names[feature_pos]}: {movie_vector[feature_pos]:.4f}")

# Demo: inspect the TF-IDF terms of one movie; swap in any 'tconst' value from the dataset.
analyze_movie_id = 'tt0004972'
analyze_movie_features(movie_id=analyze_movie_id)