Content based

In [1]:
import os
import pandas as pd
import numpy as np

# Resolve the dataset location relative to the current working directory
script_dir = os.getcwd()
print(f"Current working directory: {script_dir}")

# Read the tab-separated movie dataset into a DataFrame
movies_file = os.path.join(script_dir, "Cleaned Datasets", "Final_Movie_Data.tsv")
df_movie = pd.read_csv(movies_file, sep='\t')

# Preview the first five rows
display(df_movie.head(5))

Current working directory: c:\Users\willi\Documents\GitHub\Movie-Recommendation-System


Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,tags
0,tt0000574,The Story of the Kelly Gang,0,1906,70.0,"Action,Adventure,Biography",6.0,nm0846879,nm0846879,"partially lost film, silent film"
1,tt0002130,Dante's Inferno,0,1911,71.0,"Adventure,Drama,Fantasy",7.1,"nm0078205,nm0655824,nm0209738",nm0019604,"youtube, based on song, poem or rhyme, hell"
2,tt0002423,Passion,0,1919,113.0,"Biography,Drama,Romance",6.7,nm0523932,"nm0266183,nm0473134",
3,tt0002844,Fantômas: In the Shadow of the Guillotine,0,1913,54.0,"Crime,Drama",6.9,nm0275421,"nm0019855,nm0275421,nm0816232",
4,tt0003014,Ingeborg Holm,0,1913,96.0,Drama,7.0,nm0803705,"nm0472236,nm0803705","poverty, workhouse, bankruptcy, broken heart, ..."


In [2]:
# Summarise the dataset: dimensions, column names and a five-row preview
print("Dataset shape:", df_movie.shape)
print("\nColumns:", list(df_movie.columns))
print("\nSample data:")
display(df_movie.head(5))

# Count the NaN entries in every column
print("\nMissing values per column:")
print(df_movie.isna().sum())

Dataset shape: (38178, 10)

Columns: ['tconst', 'primaryTitle', 'isAdult', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'directors', 'writers', 'tags']

Sample data:


Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers,tags
0,tt0000574,The Story of the Kelly Gang,0,1906,70.0,"Action,Adventure,Biography",6.0,nm0846879,nm0846879,"partially lost film, silent film"
1,tt0002130,Dante's Inferno,0,1911,71.0,"Adventure,Drama,Fantasy",7.1,"nm0078205,nm0655824,nm0209738",nm0019604,"youtube, based on song, poem or rhyme, hell"
2,tt0002423,Passion,0,1919,113.0,"Biography,Drama,Romance",6.7,nm0523932,"nm0266183,nm0473134",
3,tt0002844,Fantômas: In the Shadow of the Guillotine,0,1913,54.0,"Crime,Drama",6.9,nm0275421,"nm0019855,nm0275421,nm0816232",
4,tt0003014,Ingeborg Holm,0,1913,96.0,Drama,7.0,nm0803705,"nm0472236,nm0803705","poverty, workhouse, bankruptcy, broken heart, ..."



Missing values per column:
tconst               0
primaryTitle         0
isAdult              0
startYear            0
runtimeMinutes       3
genres               2
averageRating        0
directors            4
writers            797
tags              7404
dtype: int64


This function creates a single text-based feature that combines multiple movie attributes (genres, directors, writers, year, tags, plus a runtime category and a rating category). It processes each row and builds a string that represents the movie in a way that can be used for content-based filtering.

In [3]:
# Function to combine relevant features into a single text representation
def create_content_features(row):
    """Build the combined text description of a single movie.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys ``genres``, ``directors``, ``writers``,
        ``startYear``, ``tags``, ``runtimeMinutes`` and ``averageRating``.

    Returns
    -------
    str
        Space-separated string in the order: genres, directors, writers,
        year, runtime bucket, rating bucket, tags. A missing value
        contributes an empty string, so consecutive spaces can appear.
    """
    def as_text(column):
        # Missing values become '' rather than the literal text "nan"
        value = row[column]
        return str(value) if pd.notna(value) else ''

    # Runtime bucket: > 120 is long, > 90 is medium, anything else is short
    minutes = row['runtimeMinutes']
    if pd.isna(minutes):
        runtime = ''
    elif minutes > 120:
        runtime = 'long'
    elif minutes > 90:
        runtime = 'medium'
    else:
        runtime = 'short'

    # Rating bucket: >= 7.5 highly, >= 6.5 moderately, otherwise average
    score = row['averageRating']
    if pd.isna(score):
        rating = ''
    elif score >= 7.5:
        rating = 'highly_rated'
    elif score >= 6.5:
        rating = 'moderately_rated'
    else:
        rating = 'average_rated'

    # Each attribute appears exactly once, joined by single spaces
    parts = [
        as_text('genres'),
        as_text('directors'),
        as_text('writers'),
        as_text('startYear'),
        runtime,
        rating,
        as_text('tags'),
    ]
    return ' '.join(parts)

# Build the combined text feature for every movie (row-wise apply)
df_movie['content_features'] = df_movie.apply(create_content_features, axis='columns')

# Preview the generated feature strings next to their movie IDs
display(df_movie.loc[:, ['tconst', 'content_features']].head(5))

Unnamed: 0,tconst,content_features
0,tt0000574,"Action,Adventure,Biography nm0846879 nm0846879..."
1,tt0002130,"Adventure,Drama,Fantasy nm0078205,nm0655824,nm..."
2,tt0002423,"Biography,Drama,Romance nm0523932 nm0266183,nm..."
3,tt0002844,"Crime,Drama nm0275421 nm0019855,nm0275421,nm08..."
4,tt0003014,"Drama nm0803705 nm0472236,nm0803705 1913 mediu..."


Converts the ```content_features``` column into numerical vectors using TF-IDF (Term Frequency-Inverse Document Frequency), which measures how important each word is to a movie's description, using scikit-learn's ```TfidfVectorizer``` class.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on the combined text features and vectorise every movie
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_movie['content_features'].fillna(''))

# Fetch the vocabulary once; it is reused for the count and the sample below
vocabulary = tfidf.get_feature_names_out()
total_cells = tfidf_matrix.shape[0] * tfidf_matrix.shape[1]

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features (unique words): {len(vocabulary)}")
print(f"Sparsity: {(1 - tfidf_matrix.nnz / total_cells) * 100:.2f}%")

# Densify only a 5 x 10 corner of the sparse matrix for display
print("\nSample TF-IDF values (first 5 rows, first 10 features):")
sample_features = vocabulary[:10]
sample_array = tfidf_matrix[:5, :10].toarray()
tfidf_sample = pd.DataFrame(data=sample_array, columns=sample_features)
display(tfidf_sample)

TF-IDF matrix shape: (38178, 95125)
Number of features (unique words): 95125
Sparsity: 99.96%

Sample TF-IDF values (first 5 rows, first 10 features):


Unnamed: 0,000,007,009,01,02,03,04,05,06,07
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculate the cosine similarity between all movies using the ```cosine_similarity``` function from the scikit-learn library, as was done in the Collaborative Filtering notebook.

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF vectors of all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(f"Cosine similarity matrix shape: {cosine_sim.shape}")

# Label rows and columns with the movie DataFrame's index for display
movie_indices = df_movie.index
cosine_sim_df = pd.DataFrame(data=cosine_sim, columns=movie_indices, index=movie_indices)

print("\nCosine similarity to display how similar movies are:")
display(cosine_sim_df.head(5))

Cosine similarity matrix shape: (38178, 38178)

Cosine similarity to display how similar movies are:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38168,38169,38170,38171,38172,38173,38174,38175,38176,38177
0,1.0,0.019049,0.027762,0.007625,0.045803,0.007625,0.00756,0.009686,0.012301,0.053541,...,0.01371,0.035023,0.010127,0.0,0.018219,0.00179,0.005625,0.0,0.01086,0.004008
1,0.019049,1.0,0.011605,0.016249,0.005442,0.016249,0.016111,0.023862,0.011444,0.013638,...,0.01881,0.01202,0.0,0.011434,0.004315,0.007965,0.0,0.009208,0.006976,0.003351
2,0.027762,0.011605,1.0,0.013078,0.011115,0.013078,0.012966,0.004157,0.00528,0.007983,...,0.012809,0.0,0.0,0.023353,0.012672,0.002164,0.006801,0.018806,0.0,0.009839
3,0.007625,0.016249,0.013078,1.0,0.098534,1.0,0.991461,0.134212,0.02487,0.006038,...,0.006056,0.031762,0.011311,0.012886,0.004863,0.0,0.0,0.010376,0.007861,0.003776
4,0.045803,0.005442,0.011115,0.098534,1.0,0.098534,0.097692,0.078873,0.002476,0.029534,...,0.006007,0.0,0.0,0.010951,0.005942,0.001015,0.003189,0.008819,0.0,0.004614


Function to return the most similar movies for a given movie ID, where the parameters are:
- ```movie_id``` - the unique ID of the movie to find recommendations for
- ```cosine_sim_matrix``` - the computed cosine similarity matrix
- ```df``` - dataFrame containing the movie dataset
- ```top_n``` - number of recommendations to return 

In [6]:
# Function to get movie recommendations
def get_recommendations(movie_id, cosine_sim_matrix, df, top_n=10):
    """Return the ``top_n`` movies most similar to ``movie_id``.

    Parameters
    ----------
    movie_id : str
        Value looked up in ``df['tconst']``.
    cosine_sim_matrix : numpy.ndarray or pandas.DataFrame
        Square similarity matrix. Row ``i`` is treated as the scores of
        ``df.iloc[i]`` against every row of ``df`` (positional alignment).
    df : pandas.DataFrame
        Movie dataset; must contain a ``tconst`` column.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The recommended rows in descending similarity order, restricted to
        the display columns that exist in ``df`` (in ``df``'s column order).
        If ``movie_id`` is not present, a message is printed and an empty
        DataFrame is returned.
    """
    # Find the movie by *position*, not by index label: the similarity matrix
    # and ``df.iloc`` are positional, so a non-default index must not matter.
    matches = np.flatnonzero((df['tconst'] == movie_id).to_numpy())
    if matches.size == 0:
        print(f"Movie {movie_id} not found in the dataset.")
        return pd.DataFrame()
    position = int(matches[0])

    # Similarity of the selected movie to every movie, as a flat array
    if hasattr(cosine_sim_matrix, 'iloc'):
        row = cosine_sim_matrix.iloc[position]
    else:
        row = cosine_sim_matrix[position]
    scores = np.asarray(row).ravel()

    # Stable descending order: ties keep their original dataset order
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie explicitly. It is not guaranteed to sort first:
    # another movie can tie at similarity 1.0, and a movie with an all-zero
    # feature vector has similarity 0 with itself.
    ranked = ranked[ranked != position][:max(int(top_n), 0)]

    # Keep only the display columns that are actually present in df
    wanted = {'tconst', 'primaryTitle', 'genres', 'startYear',
              'averageRating', 'runtimeMinutes', 'directors', 'tags'}
    columns_to_return = [col for col in df.columns if col in wanted]

    return df.iloc[ranked][columns_to_return]

Retrieve and display recommendations for the first movie in the dataset.

In [7]:
# Use the first movie in the dataset as the example query
example_movie_id = df_movie['tconst'].iloc[0]
print(f"Finding recommendations for movie: {example_movie_id}")

# Look up the query movie's row and print its key attributes, one per line
movie_info = df_movie.loc[df_movie['tconst'] == example_movie_id].iloc[0]
print(
    f"\nSelected movie details:",
    f"- Title: {movie_info.get('primaryTitle', 'N/A')}",
    f"- Genres: {movie_info.get('genres', 'N/A')}",
    f"- Year: {movie_info.get('startYear', 'N/A')}",
    f"- Rating: {movie_info.get('averageRating', 'N/A')}",
    f"- Runtime: {movie_info.get('runtimeMinutes', 'N/A')} minutes",
    f"- Directors: {movie_info.get('directors', 'N/A')}",
    f"- Tags: {movie_info.get('tags', 'N/A')}",
    sep="\n",
)

# Rank every other movie by similarity to the query (default top_n)
recommendations = get_recommendations(example_movie_id, cosine_sim, df_movie)

print("\nTop recommended movies:")
display(recommendations)

Finding recommendations for movie: tt0000574

Selected movie details:
- Title: The Story of the Kelly Gang
- Genres: Action,Adventure,Biography
- Year: 1906
- Rating: 6.0
- Runtime: 70.0 minutes
- Directors: nm0846879
- Tags: partially lost film, silent film

Top recommended movies:


Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes,genres,averageRating,directors,tags
17234,tt0359715,My Best Friend's Birthday,1987,69.0,Comedy,5.5,nm0000233,"birthday gifts, call girl, cocaine, cult film,..."
22,tt0006206,Les Vampires,1915,421.0,"Action,Adventure,Crime",7.3,nm0275421,"gang, long, master criminals, paris, silent, 1..."
354,tt0022080,Limit,1931,114.0,"Drama,Romance",7.0,nm0670643,"adrift, arms, avant garde, barge, boat, brazil..."
195,tt0018192,Napoleon,1927,330.0,"Biography,Drama,History",8.2,nm0304098,"abel gance, cinematography, ethics, music, rom..."
250,tt0019729,The Broadway Melody,1929,100.0,"Drama,Musical,Romance",5.5,nm0064600,"prospect preferred, oscar (best picture), bd-r..."
132,tt0015772,The Eagle,1925,73.0,"Action,Adventure,Comedy",6.6,nm0113284,silent film
130,tt0015634,Body and Soul,1925,102.0,"Crime,Drama,Thriller",6.2,nm0584778,"director poc, african american, based on novel..."
26423,tt1655442,The Artist,2011,100.0,"Comedy,Drama,Romance",7.8,nm0371890,"actor's life, dogs, memasa's movies, netflix f..."
216,tt0018806,The Crowd,1928,98.0,"Drama,Romance",8.0,nm0896542,"national film registry, tumey's dvds, usa film..."
207,tt0018528,The Unknown,1927,68.0,"Drama,Horror,Romance",7.7,nm0115218,"tod browning, circus, amputation, black and wh..."


Analyses the most important features for a specific movie.

In [None]:
# Analyse what features are most important for a specific movie
def analyse_movie_features(movie_id, df=df_movie, tfidf=tfidf, tfidf_matrix=tfidf_matrix, top_n=10):
    """Print the highest-weighted TF-IDF terms of one movie.

    Parameters
    ----------
    movie_id : str
        Value looked up in ``df['tconst']``.
    df : pandas.DataFrame
        Movie dataset; must contain a ``tconst`` column. Defaults to the
        ``df_movie`` object that exists when this function is defined.
    tfidf : fitted vectorizer
        Object providing ``get_feature_names_out()``.
    tfidf_matrix : sparse matrix
        Row ``i`` is treated as the TF-IDF vector of ``df.iloc[i]``; rows
        must support ``.toarray()``.
    top_n : int, default 10
        Number of top-weighted terms to print.

    Returns
    -------
    None
        Results are printed. If ``movie_id`` is not present, a message is
        printed and nothing else happens.
    """
    # Find the movie by *position*: tfidf_matrix rows are positional, so the
    # DataFrame's index labels must not be used to address them.
    matches = np.flatnonzero((df['tconst'] == movie_id).to_numpy())
    if matches.size == 0:
        print(f"Movie {movie_id} not found in the dataset.")
        return
    position = int(matches[0])

    # Dense TF-IDF vector of the selected movie and the matching term names
    movie_vector = tfidf_matrix[position].toarray()[0]
    feature_names = tfidf.get_feature_names_out()

    # Only terms with a positive weight matter; select them vectorised rather
    # than looping over the whole vocabulary in Python.
    present = np.flatnonzero(movie_vector > 0)
    # Stable descending sort: equal weights keep vocabulary order
    ranked = present[np.argsort(-movie_vector[present], kind='stable')]

    # Display movie info
    movie_info = df.iloc[position]
    print(f"Feature analysis for movie: {movie_id}")
    print(f"Genres: {movie_info.get('genres', 'N/A')}")
    print(f"Year: {movie_info.get('startYear', 'N/A')}")
    print(f"Rating: {movie_info.get('averageRating', 'N/A')}")

    # Print top features
    print(f"\nTop {top_n} most important features:")
    for term_idx in ranked[:top_n]:
        print(f"  {feature_names[term_idx]}: {movie_vector[term_idx]:.4f}")

# Example: inspect the top-weighted terms of one movie (swap in any tconst ID)
analyse_movie_id = 'tt0004972'
analyse_movie_features(movie_id=analyse_movie_id)

Feature analysis for movie: tt0004972
Genres: Drama,War
Year: 1915
Rating: 6.1

Top 10 most important features:
  nm0000428: 0.3145
  theory: 0.2243
  criticism: 0.2206
  racist: 0.2136
  film: 0.1994
  civil: 0.1979
  afi: 0.1955
  interracial: 0.1902
  nm0228746: 0.1846
  nm0934306: 0.1846
