# In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error


def recommend_movies(input_titles, data_path='cleaned_movies.csv', top_n=20, weights=None):
    """
    Recommend movies similar to the given titles using weighted content features.

    The CSV at ``data_path`` must provide the columns ``title``, ``keywords``,
    ``director``, ``user_rating``, ``adult`` and one numeric column per genre
    listed in ``genre_columns`` below.

    Scoring, per candidate movie:
      1. TF-IDF cosine similarity on title, keywords and director, each
         multiplied by its weight and added together.
      2. That textual score is blended with genre cosine similarity:
         ``textual * (1 - weights['genres']) + genre * weights['genres']``.
      3. Scores are summed over all matched input movies.
      4. The sum is multiplied by
         ``(user_rating / max(user_rating)) ** weights['user_rating']``.
      5. If none of the input movies is flagged ``adult``, scores of adult
         movies are halved.

    Args:
        input_titles: Iterable of movie titles (exact match against the
            ``title`` column). A single string is treated as one title.
            Titles that are not found are ignored.
        data_path: Path (or file-like object) passed to ``pandas.read_csv``.
        top_n: Maximum number of recommendations to return.
        weights: Optional dict overriding any of the default weights
            (``title``, ``user_rating``, ``keywords``, ``director``,
            ``adult``, ``genres``). Missing keys fall back to the defaults.
            NOTE: ``weights['adult']`` is accepted for compatibility but is
            not used; the adult down-weighting factor is fixed at 0.5.

    Returns:
        List of recommended titles, best first. Input movies and candidates
        whose score is not strictly positive (including NaN scores caused by
        missing ratings) are excluded. Returns ``[]`` (after printing a
        message) when no input title matches.
    """
    default_weights = {
        'title': 0.5,
        'user_rating': 0.1,
        'keywords': 3.7,
        'director': 0.2,
        'adult': 1.0,
        'genres': 0.6,
    }
    # Merge overrides onto the defaults so a partial dict does not raise KeyError.
    weights = {**default_weights, **(weights or {})}

    # A bare string would otherwise be iterated character by character.
    if isinstance(input_titles, str):
        input_titles = [input_titles]

    movies_df = pd.read_csv(data_path)

    # Resolve titles to *positional* row numbers first: it is cheap, and it
    # lets us bail out before any expensive vectorisation when nothing matches.
    movie_indices = []
    for title in input_titles:
        matches = np.flatnonzero((movies_df['title'] == title).to_numpy())
        if matches.size:
            # Several rows may share a title; use the first one.
            movie_indices.append(int(matches[0]))

    if not movie_indices:
        print("No valid movie indices found. Check input titles.")
        return []

    def similarity_rows(matrix):
        # Only the rows belonging to the input movies are ever needed, so
        # compute a (len(inputs) x N) block instead of the full N x N matrix.
        return cosine_similarity(matrix[movie_indices], matrix)

    textual_similarity = 0
    for column in ('title', 'keywords', 'director'):
        tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(
            movies_df[column].fillna('')
        )
        textual_similarity = textual_similarity + similarity_rows(tfidf_matrix) * weights[column]

    genre_columns = [
        'Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary',
        'Drama', 'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery',
        'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'Unknown', 'War', 'Western'
    ]
    genre_similarity = similarity_rows(movies_df[genre_columns].values)

    total_similarity = (
        textual_similarity * (1 - weights['genres']) +
        genre_similarity * weights['genres']
    )

    # Aggregate over all input movies -> one score per candidate.
    scores = np.asarray(total_similarity.sum(axis=0), dtype=float)

    # Favour well-rated movies; the exponent controls how strongly.
    normalized_rating = movies_df['user_rating'] / movies_df['user_rating'].max()
    scores = scores * (normalized_rating ** weights['user_rating']).to_numpy()

    # Down-weight adult movies unless the user asked about one.
    if not any(movies_df['adult'].iloc[movie_indices]):
        adult_mask = movies_df['adult'].eq(True).to_numpy()
        scores[adult_mask] *= 0.5

    # Filter BEFORE ranking: NaN scores (e.g. missing ratings) compare False
    # against everything and would corrupt a comparison-based sort.
    with np.errstate(invalid='ignore'):
        eligible = scores > 0
    eligible[movie_indices] = False  # never recommend the inputs themselves
    candidates = np.flatnonzero(eligible)

    # Stable sort on the negated score: descending score, ties broken by
    # ascending row position.
    ranked = candidates[np.argsort(-scores[candidates], kind='stable')]

    return movies_df['title'].iloc[ranked[:top_n]].tolist()


