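This project implements a content-based movie recommendation system: given a movie title, it recommends the movies whose overview text is most similar, using TF-IDF features compared with a sigmoid kernel.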

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the two TMDB 5000 CSV exports; both files are expected in the working directory.
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [None]:
# movies.head()

In [4]:
# Show (rows, columns) for each raw table before merging.
print(f"Credits: {credits.shape}")
print(f"Movies Dataframe: {movies.shape}")

Credits: (4803, 4)
Movies Dataframe: (4803, 20)


In [5]:
# Align the key name with the movies table: credits calls it 'movie_id', movies calls it 'id'.
# Only the column is renamed; the row index is left as-is because the merge below
# joins on the 'id' column and never looks at the index.
credits_column_renamed = credits.rename(columns={"movie_id": "id"})

# Join movies and credits on the shared 'id' column (pandas' default inner join).
movies_merge = movies.merge(credits_column_renamed, on='id')

# Shape check: compare the row count with the input shapes printed earlier to spot
# ids that failed to pair up or paired up more than once.
print(movies_merge.shape)


(4803, 23)


In [6]:
# Columns the rest of this notebook does not use.
# NOTE(review): 'title_x' / 'title_y' are presumably the suffixed copies of a 'title'
# column present in both merged tables - confirm against the raw CSVs.
unused_columns = ['homepage', 'title_x', 'title_y', 'status', 'production_countries']
movies_cleaned = movies_merge.drop(unused_columns, axis='columns')

# Peek at the overview text of the first movie.
print(movies_cleaned['overview'].head(1))


0    In the 22nd century, a paraplegic Marine is di...
Name: overview, dtype: object


In [7]:

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration for the overview text:
#   - word-level tokens matched by r'\w{1,}' (one or more word characters)
#   - accents stripped with the 'unicode' method
#   - English stop words removed
#   - unigrams, bigrams and trigrams all become features
#   - terms that appear in fewer than 3 documents are ignored
#   - no cap on the number of features
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_features=None,
)


In [8]:
# Missing overviews become empty strings so the vectorizer receives text for every row.
overview_text = movies_cleaned['overview'].fillna('')
movies_cleaned['overview'] = overview_text

# Learn the vocabulary and build the TF-IDF matrix (one row per overview) in one call.
tfv_matrix = tfv.fit_transform(overview_text)

# (number of documents, number of features)
print(tfv_matrix.shape)


(4803, 10417)


In [9]:

from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity of every overview against every other overview.
# With the second argument omitted, sigmoid_kernel compares the matrix with itself.
sig = sigmoid_kernel(tfv_matrix)

# Row 0: similarity of the first movie to all movies. The printed values sit in a
# very narrow band, so only their relative order is informative.
print(sig[0])


[0.76163447 0.76159416 0.76159416 ... 0.76159416 0.76159416 0.76159416]


In [15]:
# Reverse lookup: movie title -> row label in movies_cleaned.
indices = pd.Series(movies_cleaned.index, index=movies_cleaned['original_title'])

# Keep only the first row for each title. Series.drop_duplicates() compares the
# *values* (the row labels, which are already unique), so it never removes a repeated
# title; a repeated title would make `indices[title]` return a Series instead of a
# single label.
indices = indices[~indices.index.duplicated(keep='first')]

# Row label of the movie 'Newlyweds'
newlyweds_idx = indices['Newlyweds']
print(newlyweds_idx)

# Similarity scores of 'Newlyweds' against every movie (row looked up, not hard-coded)
print(sig[newlyweds_idx])

# Pair each score with its position so the position survives sorting
enumerated_scores = list(enumerate(sig[newlyweds_idx]))

# Highest similarity first
sorted_scores = sorted(enumerated_scores, key=lambda x: x[1], reverse=True)

# Print only the first 10 (position, score) pairs; the first is 'Newlyweds' itself
print(sorted_scores[:10])



4799
[0.76159416 0.76159416 0.76159438 ... 0.76159432 0.76159416 0.76159478]
[(4799, 0.7616344692549826), (616, 0.7616048159533783), (2689, 0.7616040118828756), (869, 0.7616023446645636), (3969, 0.7615999241031715), (1576, 0.761599897054374), (2290, 0.7615997916001525), (1032, 0.7615997293504287), (3145, 0.7615995818321376), (2531, 0.7615992277356394)]


In [13]:
def give_recomendations(title, sig=sig, n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value of the 'original_title' column to look up in ``indices``.
    sig : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the notebook-level ``sig`` as it was
        when this function was defined.
    n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        'original_title' values of the ``n`` highest-scoring *other* movies,
        best match first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Row position of the requested movie
    idx = indices[title]

    # A title that occurs more than once yields several matches; use the first one
    # so that `sig[idx]` stays a single row of scores.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    scores = np.asarray(sig[idx])

    # Descending order; the stable sort keeps tied scores in their original order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the queried movie by position instead of assuming it is ranked first:
    # when scores tie (e.g. an empty overview) it is not guaranteed to lead the list.
    ranked = ranked[ranked != idx][:max(n, 0)]

    # Titles of the top matches
    return movies_cleaned['original_title'].iloc[ranked]

In [14]:
# Demo: the movies recommended for 'Newlyweds'.
newlyweds_recommendations = give_recomendations('Newlyweds')
print(newlyweds_recommendations)

616                       Ted 2
2689         Our Family Wedding
869          You, Me and Dupree
3969           Something Wicked
1576                 Bride Wars
2290               Just Married
1032      America's Sweethearts
3145                      Amour
2531     Why Did I Get Married?
504     The Secret Life of Pets
Name: original_title, dtype: object
