In [19]:
import numpy as np
import pandas as pd
import ast
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load datasets
movie = pd.read_csv('data/tmdb_5000_movies.csv', engine='python')
credits = pd.read_csv("data/tmdb_5000_credits.csv", engine='python')

# Merge on title, keep only the columns used to build tags, and drop rows with
# any null in those columns.
# NOTE(review): the shapes printed further down (4803 rows per input, 4809 after
# the merge) suggest some titles match more than one row -- confirm whether a
# unique key should be used for the join instead.
#
# Built as one chain instead of `dropna(inplace=True)` on a column slice, which
# can trigger SettingWithCopyWarning. The index is reset so that index labels
# equal row positions again after rows are dropped; Recommendation_system uses
# the label of the matched row to index the similarity matrix.
movies = (
    movie.merge(credits, on='title')
    .loc[:, ['id', 'title', 'overview', 'keywords', 'genres', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

In [20]:
# Row/column counts of the two raw inputs, one per line.
print(movie.shape, credits.shape, sep="\n")

(4803, 20)
(4803, 4)


In [21]:
# Inspect the size of the raw title merge under its own name. Rebinding
# `movies` here would throw away the column selection and dropna() done in the
# loading cell, so every later cell would run on the unfiltered merge when the
# notebook is executed top to bottom.
movies_merged = movie.merge(credits, on='title')
print(movies_merged.shape)

(4809, 23)


In [12]:
# Turn each overview string into a list of whitespace-separated tokens
movies['overview'] = movies['overview'].apply(lambda text: text.split())

# Process 'keywords' column
def convert_keywords(obj):
    """Parse a stringified list of dicts and return each entry's ``'name'`` value.

    ``obj`` is evaluated with ``ast.literal_eval``; the names are returned in
    the order the entries appear.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

# Overwrites the column in place with lists of names. Not safe to re-run on its
# own: a second run would pass those lists (not strings) to ast.literal_eval
# inside convert_keywords, which raises.
movies['keywords'] = movies['keywords'].apply(convert_keywords)

In [13]:
# Process 'genres' column
def convert_genres(obj):
    """Return the ``'name'`` of every dict in the list literal held by ``obj``.

    The string is parsed with ``ast.literal_eval`` and the entry order is kept.
    """
    parsed = ast.literal_eval(obj)
    return [genre['name'] for genre in parsed]

# Replaces the raw genre strings with lists of genre names. Re-running this
# line alone fails because convert_genres expects the original string form.
movies['genres'] = movies['genres'].apply(convert_genres)

In [14]:
# Process 'cast' column (top 3 only)
def extract_cast(obj, top_n=3):
    """Return the names of the first ``top_n`` entries in a stringified cast list.

    Parameters
    ----------
    obj : str
        String holding a Python/JSON-style list of dicts, each with a
        ``'name'`` key; parsed with ``ast.literal_eval``.
    top_n : int, default 3
        Maximum number of names to return. The default keeps the previous
        hard-coded behaviour, so ``Series.apply(extract_cast)`` is unchanged.

    Returns
    -------
    list
        Up to ``top_n`` names in their original order (fewer when the list is
        shorter, empty when ``top_n`` is zero or negative).
    """
    names = []
    for member in ast.literal_eval(obj):
        # Stop before reading entries beyond the limit.
        if len(names) >= top_n:
            break
        names.append(member['name'])
    return names

# Replaces the raw cast strings with short lists of names. Like the other
# parsing cells, this expects the column to still hold the original strings.
movies['cast'] = movies['cast'].apply(extract_cast)

In [None]:
# Process 'crew' column (get director only)
def extract_director(obj):
    """Return a one-element list with the first director's name, or ``[]``.

    ``obj`` is parsed with ``ast.literal_eval``; the first entry whose
    ``'job'`` equals ``'Director'`` wins and later entries are not inspected.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

# Replace the raw crew strings with a list holding the director's name (or []).
movies['crew'] = movies['crew'].apply(extract_director)

# Create 'tags' column by combining overview + cast + crew + keywords + genres.
# Every operand is a column of lists, so `+` concatenates them row by row.
# 'genres' was previously left out even though it is parsed and listed as part
# of the tags.
movies['tags'] = (
    movies['overview']
    + movies['cast']
    + movies['crew']
    + movies['keywords']
    + movies['genres']
)

# Final dataset with relevant columns. Take an explicit copy so the later
# in-place assignments to movies['tags'] write to an independent frame rather
# than to a slice of the wider one (avoids SettingWithCopyWarning).
movies = movies[['id', 'title', 'tags']].copy()

In [None]:
# Delete the space characters inside every tag token
movies['tags'] = movies['tags'].apply(
    lambda tokens: [token.replace(" ", "") for token in tokens]
)

# Shared stemmer instance used by stemming()
ps = PorterStemmer()

def stemming(text):
    """Stem each token of ``text`` with the module-level ``ps`` and join them with single spaces."""
    stems = [ps.stem(token) for token in text]
    return " ".join(stems)

# Converts each row's token list into one space-joined string of stems.
# After this line 'tags' holds strings, so re-running it would iterate over
# characters instead of tokens.
movies['tags'] = movies['tags'].apply(stemming)

In [None]:
# Bag-of-words counts over the tag strings: vocabulary capped at 500 terms,
# scikit-learn's built-in English stop-word list removed, result densified.
vectorizer = CountVectorizer(stop_words='english', max_features=500)
vectors = vectorizer.fit_transform(movies['tags']).toarray()

# Pairwise cosine similarity between all rows of `vectors`
similarity = cosine_similarity(vectors)

In [None]:
# Recommendation function
def Recommendation_system(movie_title, top_k=19, movies_df=None, similarity_matrix=None):
    """Print the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact value to look up in the ``title`` column.
    top_k : int, default 19
        Number of titles to print. 19 matches the former hard-coded slice
        ``distances[1:20]``.
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column whose row order matches the rows of
        ``similarity_matrix``. Defaults to the notebook-level ``movies``.
    similarity_matrix : 2-D array-like, optional
        Pairwise similarity scores indexed by row position. Defaults to the
        notebook-level ``similarity``.

    Raises
    ------
    IndexError
        If no row carries the requested title (same exception type as before,
        now with an explanatory message).
    """
    data = movies if movies_df is None else movies_df
    scores = similarity if similarity_matrix is None else similarity_matrix

    # Look up the row *position* of the first match. The index label only
    # equals the position while the index is a fresh 0..n-1 range, which stops
    # being true once rows have been dropped or filtered.
    matches = np.flatnonzero((data['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError("No movie titled {!r} in the dataset".format(movie_title))
    movie_pos = int(matches[0])

    # sorted() is stable, so rows with equal scores keep their original order.
    ranked = sorted(enumerate(scores[movie_pos]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query row explicitly rather than assuming it sorts first;
    # another row with the same score (e.g. a duplicate entry) could precede it.
    neighbours = [pos for pos, _ in ranked if pos != movie_pos][:top_k]

    titles = data['title']
    for pos in neighbours:
        print(titles.iloc[pos])

# Example usage
# Recommendation_system('Avatar')

In [None]:
# Save the model and data.
# Context managers close (and flush) each file as soon as its dump finishes;
# the bare open(...) calls used before left the handles open.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(movies, model_file)

with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [23]:
# Test data_loader module
from src.data_loader import load_data

# Smoke test: call the project loader and print the first rows of its result
# as plain text.
df = load_data()
print(df.head())

      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
3  250000000  [{"id": 28, "name": "Action"}, {"id": 80, "nam...   
4  260000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
2  [{"id": 470, "nam

In [22]:
# Test preprocess module
from src.data_loader import load_data
from src.preprocess import preprocess_movies

# Load the frame and report its shape before preprocessing.
df = load_data()
print("original data", df.shape)
# Number of rows whose title repeats an earlier row's title.
print(df['title'].duplicated().sum())

# Run the project preprocessing step and inspect the three parsed list columns,
# then the resulting shape and column names for comparison with the input.
df2 = preprocess_movies(df)
print(df2[['genres','cast','crew']].head())
print("after prepare", df2.shape)
print(df2.columns)

original data (4809, 23)
9
                                         genres  \
0  [Action, Adventure, Fantasy, ScienceFiction]   
1                  [Adventure, Fantasy, Action]   
2                    [Action, Adventure, Crime]   
3              [Action, Crime, Drama, Thriller]   
4           [Action, Adventure, ScienceFiction]   

                                            cast                crew  
0  [SamWorthington, ZoeSaldana, SigourneyWeaver]      [JamesCameron]  
1     [JohnnyDepp, OrlandoBloom, KeiraKnightley]     [GoreVerbinski]  
2      [DanielCraig, ChristophWaltz, LéaSeydoux]         [SamMendes]  
3      [ChristianBale, MichaelCaine, GaryOldman]  [ChristopherNolan]  
4    [TaylorKitsch, LynnCollins, SamanthaMorton]     [AndrewStanton]  
after prepare (4809, 23)
Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'run

In [None]:
# Test feature_builder module
from src.data_loader import load_data
from src.preprocess import preprocess_movies
from src.feature_builder import build_tags

# Full module pipeline: load -> preprocess -> build tags.
df = load_data()
df2 = preprocess_movies(df)
# NOTE: this rebinds the notebook-level `movies` that the inline pipeline
# cells above also assign, so later cells see this frame instead.
movies = build_tags(df2)

print(movies.shape)
# Last expression of the cell, so the notebook renders the preview.
movies.head()

(4809, 3)


Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."


In [25]:
# Test vectorizer module
from src.vectorizer import build_vectors

# Vectorise the tag strings with the project module. This rebinds `vectors`
# and `vectorizer`, replacing the values set by the inline vectorization cell.
vectors, vectorizer = build_vectors(movies["tags"])
print(vectors.shape)

(4809, 5000)


In [26]:
# Test similarity_model module
from src.similarity_model import compute_similarity

# Compute the similarity matrix from `vectors` with the project module. This
# rebinds `similarity`, replacing the value set by the inline cosine-similarity cell.
similarity = compute_similarity(vectors)
print(similarity.shape)

(4809, 4809)


In [27]:
# Test recommender module
from src.recommender import recommend
# Ask the project recommender for 10 results for "Avatar". The call is the
# cell's last expression, so its return value is what the notebook displays.
recommend("Avatar", movies, similarity, top_k=10)

Unnamed: 0,title
1214,Aliens vs Predator: Requiem
2405,Aliens
3729,Falcon Rising
507,Independence Day
539,Titan A.E.
582,Battle: Los Angeles
1202,Predators
1192,Small Soldiers
61,Jupiter Ascending
778,Meet Dave
