In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("preprocessed_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,popularity,release_date,revenue,runtime,title,vote_average,vote_count,tags_vector
0,0,19995,150.437577,2009-12-10,2787965087,162.0,Avatar,7.2,11800,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,285,139.082615,2007-05-19,961000000,169.0,Pirates of the Caribbean: At World's End,6.9,4500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,206647,107.376788,2015-10-26,880674609,148.0,Spectre,6.3,4466,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,49026,112.31295,2012-07-16,1084939099,165.0,The Dark Knight Rises,7.6,9106,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,49529,43.926995,2012-03-07,284139100,132.0,John Carter,6.1,2124,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [6]:
df.drop('Unnamed: 0' , axis = 1 , inplace = True)

In [7]:
df.head()

Unnamed: 0,id,popularity,release_date,revenue,runtime,title,vote_average,vote_count,tags_vector
0,19995,150.437577,2009-12-10,2787965087,162.0,Avatar,7.2,11800,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,285,139.082615,2007-05-19,961000000,169.0,Pirates of the Caribbean: At World's End,6.9,4500,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,206647,107.376788,2015-10-26,880674609,148.0,Spectre,6.3,4466,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,49026,112.31295,2012-07-16,1084939099,165.0,The Dark Knight Rises,7.6,9106,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,49529,43.926995,2012-03-07,284139100,132.0,John Carter,6.1,2124,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [9]:
def create_movie_recommender(df):
    """
    Creates a movie recommendation system based on multiple factors:
    - Popularity
    - Vote average
    - Vote count
    - Release date recency
    - Revenue
    - Runtime similarity
    - Tags similarity
    """
    # Create copy to avoid modifying original
    movies = df.copy()
    
    # Create feature matrix
    scaler = MinMaxScaler()
    
    # Scale numerical features
    numerical_features = ['popularity', 'vote_average', 'vote_count', 'revenue', 'runtime']
    movies[numerical_features] = scaler.fit_transform(movies[numerical_features])
    
    # Convert release_date to recency score
    movies['release_date'] = pd.to_datetime(movies['release_date'])
    max_date = movies['release_date'].max()
    movies['recency'] = (max_date - movies['release_date']).dt.days
    movies['recency'] = scaler.fit_transform(movies[['recency']])
    
    # Convert tags to matrix if string
    if isinstance(movies['tags_vector'].iloc[0], str):
        movies['tags_vector'] = movies['tags_vector'].apply(eval)
    
    def get_recommendations(movie_id, n=5):
        """Get movie recommendations based on similarity to given movie."""
        movie_idx = movies[movies['id'] == movie_id].index[0]
        
        # Calculate similarities
        tags_sim = cosine_similarity([movies.iloc[movie_idx]['tags_vector']], 
                                   movies['tags_vector'].tolist())[0]
        
        # Combine features with weights
        feature_matrix = np.column_stack((
            movies['popularity'],        # 0.2
            movies['vote_average'],      # 0.2
            movies['vote_count'],        # 0.1
            movies['recency'],           # 0.1
            movies['revenue'],           # 0.1
            movies['runtime'],           # 0.1
            tags_sim                     # 0.2
        ))
        
        weights = np.array([0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.2])
        weighted_scores = np.dot(feature_matrix, weights)
        
        # Get top recommendations
        movie_indices = np.argsort(weighted_scores)[::-1]
        movie_indices = movie_indices[movie_indices != movie_idx][:n]
        
        return movies.iloc[movie_indices][['title', 'vote_average', 'popularity']]
    
    return get_recommendations

In [12]:
recommender = create_movie_recommender(df)
recommendations = recommender(movie_id=19995)  # Avatar

In [13]:
recommendations

Unnamed: 0,title,vote_average,popularity
95,Interstellar,0.81,0.827162
546,Minions,0.64,1.0
94,Guardians of the Galaxy,0.79,0.549462
788,Deadpool,0.74,0.587689
28,Jurassic World,0.65,0.478206


Calculates similarities using:
Content (tags) similarity: 20%
Popularity: 20%
Vote average: 20%
Other factors (revenue, runtime, recency, vote count): 40%