Content-based filtering

In [73]:
import os
import pandas as pd
import numpy as np

# Paths are resolved relative to the directory the notebook is launched from
script_dir = os.getcwd()
print(f"Current working directory: {script_dir}")

# Load the cleaned movie metadata (tab-separated file)
movies_file = os.path.join(script_dir, "Cleaned Datasets", "Final_Movie_Data.tsv")
df_movie = pd.read_csv(movies_file, sep='\t')

# Preview the first five rows
display(df_movie.head(5))

Current working directory: c:\Users\willi\OneDrive\Documents\GitHub\Movie-Recommendations


Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,tags
0,tt0004972,The Birth of a Nation,0,1915,195,"Drama,War",6.1,nm0000428,"nm0228746,nm0000428,nm0940488,nm0934306,nm1628...",
1,tt0006333,"20,000 Leagues Under the Sea",0,1916,105,"Action,Adventure,Sci-Fi",6.1,nm0665737,"nm0894523,nm0665737",
2,tt0006864,Intolerance,0,1916,163,"Drama,History",7.7,nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640...",
3,tt0010040,Daddy-Long-Legs,0,1919,85,"Comedy,Drama",6.6,nm0624714,"nm0916914,nm0426515",
4,tt0010323,The Cabinet of Dr. Caligari,0,1920,67,"Horror,Mystery,Thriller",8.0,nm0927468,"nm0562346,nm0417917",


In [74]:
# Overview of the loaded frame: dimensions, column names and a short preview
print("Dataset shape:", df_movie.shape)
print("\nColumns:", list(df_movie.columns))
print("\nSample data:")
display(df_movie.head())

# Number of missing entries in each column
print("\nMissing values per column:")
print(df_movie.isna().sum())

Dataset shape: (8902, 10)

Columns: ['tconst', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'directors', 'writers', 'tags']

Sample data:


Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,tags
0,tt0004972,The Birth of a Nation,0,1915,195,"Drama,War",6.1,nm0000428,"nm0228746,nm0000428,nm0940488,nm0934306,nm1628...",
1,tt0006333,"20,000 Leagues Under the Sea",0,1916,105,"Action,Adventure,Sci-Fi",6.1,nm0665737,"nm0894523,nm0665737",
2,tt0006864,Intolerance,0,1916,163,"Drama,History",7.7,nm0000428,"nm0048512,nm0115218,nm0000428,nm0002616,nm0640...",
3,tt0010040,Daddy-Long-Legs,0,1919,85,"Comedy,Drama",6.6,nm0624714,"nm0916914,nm0426515",
4,tt0010323,The Cabinet of Dr. Caligari,0,1920,67,"Horror,Mystery,Thriller",8.0,nm0927468,"nm0562346,nm0417917",



Missing values per column:
tconst               0
primaryTitle         0
isAdult              0
startYear            0
runtimeMinutes       0
genres               0
averageRating        0
directors            0
writers            115
tags              7378
dtype: int64


The function below creates a single text-based feature for each movie by combining several attributes: genres, directors, writers, release year, a runtime category, a rating category and tags. It is applied to every row and builds a string that represents the movie in a form that can be used for content-based filtering.

In [75]:
def create_content_features(row):
    """Build the space-separated text "document" that represents one movie.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys ``directors``, ``writers``, ``genres``,
        ``startYear``, ``tags``, ``runtimeMinutes`` and ``averageRating``.

    Returns
    -------
    str
        ``genres directors writers year runtime rating tags`` joined by single
        spaces. A missing value contributes an empty string, so consecutive
        spaces can appear in the result. Each field appears exactly once.
    """
    def _as_text(column):
        # Missing values become '' instead of the literal text "nan"
        value = row[column]
        return str(value) if pd.notna(value) else ''

    directors = _as_text('directors')
    writers = _as_text('writers')
    genres = _as_text('genres')
    year = _as_text('startYear')
    tags = _as_text('tags')

    # Bucket the runtime: > 120 min is long, > 90 min is medium, otherwise short
    minutes = row['runtimeMinutes']
    if pd.isna(minutes):
        runtime = ''
    elif minutes > 120:
        runtime = 'long'
    elif minutes > 90:
        runtime = 'medium'
    else:
        runtime = 'short'

    # Bucket the rating: >= 7.5 is high, >= 6.5 is moderate, otherwise average
    score = row['averageRating']
    if pd.isna(score):
        rating = ''
    elif score >= 7.5:
        rating = 'highly_rated'
    elif score >= 6.5:
        rating = 'moderately_rated'
    else:
        rating = 'average_rated'

    return ' '.join([genres, directors, writers, year, runtime, rating, tags])

# Build the combined text feature for every movie (row-wise apply)
df_movie['content_features'] = df_movie.apply(create_content_features, axis='columns')

# Preview the generated text next to each movie ID
display(df_movie.loc[:, ['tconst', 'content_features']].head(5))

Unnamed: 0,tconst,content_features
0,tt0004972,"Drama,War nm0000428 nm0228746,nm0000428,nm0940..."
1,tt0006333,"Action,Adventure,Sci-Fi nm0665737 nm0894523,nm..."
2,tt0006864,"Drama,History nm0000428 nm0048512,nm0115218,nm..."
3,tt0010040,"Comedy,Drama nm0624714 nm0916914,nm0426515 191..."
4,tt0010323,"Horror,Mystery,Thriller nm0927468 nm0562346,nm..."


Convert the ```content_features``` column into numerical vectors using TF-IDF (Term Frequency-Inverse Document Frequency), which weights each term by how important it is to a movie's description relative to the whole dataset, using the ```TfidfVectorizer``` class from the scikit-learn library.

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vocabulary on the combined text and vectorise every movie
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movie['content_features'].fillna(''))

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features (unique words): {len(tfidf.get_feature_names_out())}")

# Dense copy with one column per vocabulary term, used for the preview below.
# NOTE(review): this materialises the entire matrix in memory just to show a
# few rows; consider slicing the sparse matrix first if nothing else needs tfidf_df.
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf.get_feature_names_out())
print("\nTF-IDF matrix:")
display(tfidf_df.head())

TF-IDF matrix shape: (8902, 14501)
Number of features (unique words): 14501

TF-IDF matrix:


Unnamed: 0,06,1900s,1915,1916,1919,1920,1920s,1921,1922,1923,...,wrongful,wry,york,younger,zellweger,zither,zoe,zombie,zombies,zooey
0,0.0,0.0,0.31671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.386115,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.294707,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.488328,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.474645,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculate the cosine similarity between all movies using the ```cosine_similarity``` function from the scikit-learn library, as was done in the Collaborative Filtering notebook.

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
movie_indices = df_movie.index

print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

# Label rows and columns with the DataFrame index and preview the first rows
cosine_sim_df = pd.DataFrame(data=cosine_sim, index=movie_indices, columns=movie_indices)
print("\nCosine similarity to display how similar movies are:")
display(cosine_sim_df.head(5))

Cosine similarity matrix shape: (8902, 8902)

Cosine similarity to display how similar movies are:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8892,8893,8894,8895,8896,8897,8898,8899,8900,8901
0,1.0,0.005378,0.457403,0.004611,0.0,0.004276,0.0,0.014581,0.0,0.0,...,0.003002,0.0,0.0,0.0,0.005066,0.009712,0.0,0.0,0.008097,0.0
1,0.005378,1.0,0.113791,0.0,0.0,0.0,0.004427,0.0,0.0,0.008957,...,0.00307,0.003465,0.00791,0.00719,0.0,0.0,0.003176,0.006698,0.034431,0.0045
2,0.457403,0.113791,1.0,0.004484,0.011433,0.015066,0.011041,0.014179,0.009966,0.007184,...,0.002919,0.0,0.019729,0.0,0.004927,0.01852,0.007923,0.063943,0.0,0.0
3,0.004611,0.0,0.004484,1.0,0.019343,0.033812,0.0,0.014234,0.024868,0.017927,...,0.01093,0.014058,0.0,0.014758,0.040314,0.007292,0.0,0.0,0.0,0.009237
4,0.0,0.0,0.011433,0.019343,1.0,0.034748,0.043583,0.023995,0.052266,0.037677,...,0.015766,0.0,0.030405,0.0,0.021253,0.013986,0.01221,0.025746,0.030131,0.029838


Function to return the most similar movies for a given movie ID, where the parameters are:
- ```movie_id``` - the unique ID of the movie to find recommendations for
- ```cosine_sim_matrix``` - the computed cosine similarity matrix
- ```df``` - dataFrame containing the movie dataset
- ```top_n``` - number of recommendations to return 

In [None]:
# Function to get movie recommendations
def get_recommendations(movie_id, cosine_sim_matrix, df, top_n=10):
    """Return the ``top_n`` movies most similar to ``movie_id``.

    Parameters
    ----------
    movie_id : str
        ``tconst`` identifier of the query movie.
    cosine_sim_matrix : 2-D array-like
        Pairwise similarity scores. Row/column ``i`` must correspond to the
        ``i``-th row of ``df`` by position.
    df : pandas.DataFrame
        Movie table containing at least a ``tconst`` column.
    top_n : int, default 10
        Maximum number of recommendations. Values <= 0 yield no rows.

    Returns
    -------
    pandas.DataFrame
        The recommended rows, most similar first, restricted to whichever of
        the output columns exist in ``df``. An empty DataFrame is returned
        (and a message printed) when ``movie_id`` is not present.
    """
    # Locate the movie by row *position*. The similarity matrix and ``iloc``
    # are positional, so using index labels would break for any DataFrame
    # whose index is not the default 0..n-1 range (e.g. after filtering).
    is_query = (df['tconst'] == movie_id).to_numpy()
    if not is_query.any():
        print(f"Movie {movie_id} not found in the dataset.")
        return pd.DataFrame()
    query_pos = int(is_query.argmax())  # first matching row

    # Drop the query movie explicitly instead of assuming it sorts first:
    # another movie with an identical feature vector ties at 1.0 and could
    # otherwise push the query movie into its own recommendation list.
    scored = [
        (pos, score)
        for pos, score in enumerate(cosine_sim_matrix[query_pos])
        if pos != query_pos
    ]

    # Stable sort, highest similarity first (ties keep dataset order)
    scored.sort(key=lambda pair: pair[1], reverse=True)
    recommended_positions = [pos for pos, _ in scored[:max(top_n, 0)]]

    # Keep only the output columns that actually exist, in the DataFrame's order
    wanted = {'tconst', 'primaryTitle', 'genres', 'startYear', 'averageRating',
              'runtimeMinutes', 'directors', 'tags'}
    columns_to_return = [col for col in df.columns if col in wanted]

    return df.iloc[recommended_positions][columns_to_return]

Retrieve and display recommendations for the first movie in the dataset.

In [None]:
# Example: use the first movie in the dataset as the query
example_movie_id = df_movie.iloc[0]['tconst']
print(f"Finding recommendations for movie: {example_movie_id}")

# Show the query movie's own metadata so the results can be compared against it
movie_info = df_movie.loc[df_movie['tconst'] == example_movie_id].iloc[0]
print(f"\nSelected movie details:")
print(f"- Title: {movie_info.get('primaryTitle', 'N/A')}")
print(f"- Genres: {movie_info.get('genres', 'N/A')}")
print(f"- Year: {movie_info.get('startYear', 'N/A')}")
print(f"- Rating: {movie_info.get('averageRating', 'N/A')}")
print(f"- Runtime: {movie_info.get('runtimeMinutes', 'N/A')} minutes")
print(f"- Directors: {movie_info.get('directors', 'N/A')}")
print(f"- Tags: {movie_info.get('tags', 'N/A')}")

# Look up the most similar movies using the precomputed similarity matrix
recommendations = get_recommendations(example_movie_id, cosine_sim, df_movie)

print("\nTop recommended movies:")
display(recommendations)

Finding recommendations for movie: tt0004972

Selected movie details:
- Title: The Birth of a Nation
- Genres: Drama,War
- Year: 1915
- Rating: 6.1
- Runtime: 195 minutes
- Directors: nm0000428
- Tags: nan

Top recommended movies:


Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,directors,tags
2,tt0006864,Intolerance,1916,163,"Drama,History",7.7,nm0000428,
6924,tt0887912,The Hurt Locker,2008,131,"Drama,Thriller,War",7.5,nm0000941,war
4231,tt0120815,Saving Private Ryan,1998,169,"Drama,War",8.6,nm0000229,world war ii
6509,tt0460989,The Wind that Shakes the Barley,2006,127,"Drama,War",7.5,nm0516360,
844,tt0059711,Shenandoah,1965,105,"Drama,War,Western",7.3,nm0572132,civil war
7531,tt1372686,Coriolanus,2011,123,"Drama,Thriller,War",6.1,nm0000146,
6226,tt0418763,Jarhead,2005,125,"Biography,Drama,War",7.0,nm0005222,"jake gyllenhaal, modern war"
5002,tt0245562,Windtalkers,2002,134,"Action,Drama,War",6.1,nm0000247,
4997,tt0245171,Invincible,2001,133,"Drama,War",6.4,nm0001348,
1020,tt0066206,Patton,1970,172,"Biography,Drama,War",7.9,nm0769874,world war ii


Analyses the most important features for a specific movie.

In [84]:
# Analyse what features are most important for a specific movie
def analyse_movie_features(movie_id, df=df_movie, tfidf=tfidf, tfidf_matrix=tfidf_matrix, top_n=10):
    """Print the highest-weighted TF-IDF terms for one movie.

    Parameters
    ----------
    movie_id : str
        ``tconst`` identifier of the movie to inspect.
    df : pandas.DataFrame
        Movie table containing a ``tconst`` column; row ``i`` (by position)
        must correspond to row ``i`` of ``tfidf_matrix``.
    tfidf : fitted vectoriser
        Object exposing ``get_feature_names_out()``.
    tfidf_matrix : sparse matrix
        TF-IDF weights, one row per movie (must support ``.toarray()``).
    top_n : int, default 10
        Number of terms to print.

    Returns
    -------
    None
        Results are printed. A message is printed and nothing else happens
        when ``movie_id`` is not found.

    Notes
    -----
    The default arguments are bound when this cell is executed, so re-run the
    cell after rebuilding ``df_movie``, ``tfidf`` or ``tfidf_matrix``.
    """
    # Locate the movie by row *position*: the TF-IDF matrix is positional, so
    # an index label would select the wrong row for a non-default index.
    is_query = (df['tconst'] == movie_id).to_numpy()
    if not is_query.any():
        print(f"Movie {movie_id} not found in the dataset.")
        return
    row_pos = int(is_query.argmax())  # first matching row

    # Dense weight vector of the selected movie and the matching term names
    movie_vector = tfidf_matrix[row_pos].toarray()[0]
    feature_names = tfidf.get_feature_names_out()

    # Visit only the terms with a positive weight instead of looping over the
    # whole vocabulary in Python
    present = (movie_vector > 0).nonzero()[0]
    feature_importance = [(feature_names[i], movie_vector[i]) for i in present]

    # Highest weight first (stable: ties keep vocabulary order)
    sorted_features = sorted(feature_importance, key=lambda x: x[1], reverse=True)

    # Display movie info
    movie_info = df.iloc[row_pos]
    print(f"Feature analysis for movie: {movie_id}")
    print(f"Genres: {movie_info.get('genres', 'N/A')}")
    print(f"Year: {movie_info.get('startYear', 'N/A')}")
    print(f"Rating: {movie_info.get('averageRating', 'N/A')}")

    # Print the top features
    print(f"\nTop {top_n} most important features:")
    for feature, importance in sorted_features[:top_n]:
        print(f"  {feature}: {importance:.4f}")

# Example: list the strongest TF-IDF terms for one title
# (swap in any other tconst from the dataset)
analyse_movie_id = 'tt0004972'
analyse_movie_features(movie_id=analyse_movie_id)

Feature analysis for movie: tt0004972
Genres: Drama,War
Year: 1915
Rating: 6.1

Top 10 most important features:
  nm0000428: 0.6061
  1915: 0.3167
  nm0228746: 0.3167
  nm0934306: 0.3167
  nm16280870: 0.3167
  nm16280871: 0.3167
  nm0940488: 0.3031
  war: 0.1546
  long: 0.0904
  average_rated: 0.0650
