In [6]:
import pandas as pd

movies_df = pd.read_csv('tmdb_5000_movies.csv')
credits_df = pd.read_csv('tmdb_5000_credits.csv')

merged_df = movies_df.merge(credits_df, on='title')


In [7]:
import ast

# Function to convert JSON-like string to a list of names
def parse_names(data):
    try:
        return [item['name'] for item in ast.literal_eval(data)]
    except (ValueError, SyntaxError):
        return []

# Function to get the top N cast members
def get_top_cast(data, n=3):
    try:
        return [item['name'] for item in ast.literal_eval(data)[:n]]
    except (ValueError, SyntaxError):
        return []

# Function to get the director's name from crew
def get_director(data):
    try:
        for item in ast.literal_eval(data):
            if item['job'] == 'Director':
                return item['name']
        return ''
    except (ValueError, SyntaxError):
        return ''

merged_df['genres'] = merged_df['genres'].apply(parse_names)
merged_df['keywords'] = merged_df['keywords'].apply(parse_names)
merged_df['cast'] = merged_df['cast'].apply(lambda x: get_top_cast(x, n=3))
merged_df['director'] = merged_df['crew'].apply(get_director)

merged_df['overview'] = merged_df['overview'].fillna('')
merged_df['director'] = merged_df['director'].fillna('')

merged_df['combined_features'] = merged_df.apply(
    lambda x: ' '.join(x['genres']) + ' ' +
              ' '.join(x['keywords']) + ' ' +
              ' '.join(x['cast']) + ' ' +
              x['director'] + ' ' +
              x['overview'],
    axis=1
)


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorizing the combined features
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(merged_df['combined_features'])
#Calculating the cosine similarity
cosine_sim = cosine_similarity(count_matrix, count_matrix)


In [9]:
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
merged_df['encoded_title'] = label_encoder.fit_transform(merged_df['title'])

# Neural network model
input_layer = Input(shape=(count_matrix.shape[1],))
dense_layer_1 = Dense(512, activation='relu')(input_layer)
dense_layer_2 = Dense(256, activation='relu')(dense_layer_1)
dense_layer_3 = Dense(128, activation='relu')(dense_layer_2)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(dense_layer_3)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(count_matrix.toarray(), merged_df['encoded_title'], epochs=10, batch_size=64, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7907a416e440>

In [15]:
indices = pd.Series(merged_df.index, index=merged_df['title']).drop_duplicates()

def get_cb_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    if title not in indices:
        return []
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]
    return merged_df['title'].iloc[movie_indices].tolist()

def get_nn_recommendations(title, model=model, top_n=5):
    if title not in indices:
        return []
    idx = indices[title]
    input_vec = count_matrix[idx].toarray()
    preds = model.predict(input_vec)
    recommended_indices = preds.argsort()[0][-top_n:][::-1]
    recommended_titles = label_encoder.inverse_transform(recommended_indices)
    return recommended_titles.tolist()

# Combining the both model
def get_combined_recommendations(title, top_n=10):
    cb_recs = get_cb_recommendations(title, top_n=top_n//2)
    nn_recs = get_nn_recommendations(title, top_n=top_n//2)
    combined_recs = list(dict.fromkeys(cb_recs + nn_recs))
    return combined_recs

print(get_combined_recommendations('Cars 2'))


['Cars', 'Herbie Fully Loaded', 'The Final Destination', 'Furious 7', 'The Fast and the Furious: Tokyo Drift', 'Cars 2', 'Jonah: A VeggieTales Movie', 'Made of Honor', 'Cheaper by the Dozen', 'Without a Paddle']
