# Netflix Movie Recommendation Model

In [19]:
# Import packages
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np
from scipy.sparse import hstack

In [20]:
# Load the three source tables (movies, users, reviews) from the local Dataset/ folder.
# The files are read in the order listed and bound to the matching names.
movies_df, users_df, reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Dataset/movies.csv',
        'Dataset/users.csv',
        'Dataset/reviews.csv',
    )
)

In [21]:
# Group together limited series and TV Show to TV
movies_df['content_type'] = movies_df['content_type'].replace({'Limited Series': 'TV', 'TV Show': 'TV'})

# Divide movies.csv by each 'content_type' ( Stand-up Comedy, Movie, Documentary, TV )
# NOTE: these per-type frames are sliced before the rating column is converted below,
# so they hold whatever `rating` values movies_df had at this point.
content_types = movies_df['content_type'].unique()
media_dict = {ctype: movies_df[movies_df['content_type'] == ctype] for ctype in content_types}

# Turn rating into a numerical value, group together similar TV/ movie ratings
rating_map = {
    'TV-Y': 1, 'TV-Y7': 1,
    'TV-G': 2, 'G': 2,
    'TV-PG': 3, 'PG': 3,
    'TV-14': 4, 'PG-13': 4,
    'TV-MA': 5, 'R': 5, 'NC-17': 5
}

# Only apply the mapping while the column still holds the raw rating labels.
# Mapping an already-numeric column would turn every value into NaN (none of the
# numbers are keys of rating_map) and the fillna below would then wipe all ratings
# to 0, so re-running this cell must skip the map step.
if not pd.api.types.is_numeric_dtype(movies_df["rating"]):
    movies_df["rating"] = movies_df["rating"].map(rating_map)

# Labels missing from rating_map (and missing ratings) are encoded as 0.
movies_df["rating"] = movies_df["rating"].fillna(0)

In [41]:
# TODO: add weight to each feature, not all features should be the same

# TODO: add new features created from review/recommendation_logs data

# Merge the categorical text columns into a single field per row.
# Each column contributes its value (empty string when missing) followed by one space,
# in the order listed here.
text_feature_columns = [
    "genre_primary",
    "genre_secondary",
    "language",
    "country_of_origin",
    "content_type",
]

text_feature_pieces = [movies_df[column].fillna('') + " " for column in text_feature_columns]

combined_text = text_feature_pieces[0]
for piece in text_feature_pieces[1:]:
    combined_text = combined_text + piece

movies_df["text_features"] = combined_text

In [42]:
# Create TF-IDF matrix from the combined text field (English stop words removed)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df["text_features"])

# Take numeric features and normalize.
# Missing values are replaced with the column median rather than 0: a literal 0 for
# release_year or imdb_rating lies far outside the real value range and would skew the
# mean/variance that StandardScaler fits. The trailing fillna(0) only applies to a
# column with no observed values at all (its median is NaN).
numeric_raw = movies_df[["imdb_rating", "release_year", "rating"]]
numeric_features = numeric_raw.fillna(numeric_raw.median()).fillna(0)
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(numeric_features)

# Combine text and numeric features side by side (one row per title).
# CSR is requested so the stacked matrix supports row slicing.
combined_features = hstack([tfidf_matrix, scaled_numeric], format="csr")

# Compute the pairwise cosine similarity between all rows
similarity_matrix = cosine_similarity(combined_features, combined_features)

In [43]:
# TODO: Split this function into separate movie and TV versions. The TV version should incorporate features such as the number of seasons/episodes.

# Recommendation function

# Params: title - the title of the input movie. n - number of movies to recommend. 
def recommend(title, n=5):
    """Return the ``n`` rows of ``movies_df`` most similar to ``title``.

    The first row whose ``title`` column equals ``title`` is used as the query.
    Every other row is ranked by its score in the query's row of
    ``similarity_matrix`` (highest first) and the top ``n`` are returned.

    Params:
        title: the title of the input movie.
        n: number of movies to recommend. Values <= 0 yield no recommendations.

    Returns:
        A DataFrame with the descriptive columns of the recommended rows, or an
        empty list (after printing a message) when ``title`` is not in the dataset.
    """
    title_matches = np.flatnonzero(
        (movies_df["title"] == title).to_numpy(dtype=bool, na_value=False)
    )
    if title_matches.size == 0:
        print(f"'{title}' not found in dataset.")
        return []

    # Use the row *position*, not the index label: similarity_matrix rows and
    # .iloc below are both positional.
    query_pos = int(title_matches[0])
    scores = np.asarray(similarity_matrix[query_pos]).ravel()

    # Stable descending sort, so tied scores keep their original row order.
    ranked = np.argsort(-scores, kind="stable")

    # Remove the query row explicitly. Dropping "the first result" is not safe:
    # a row with an equal or higher score can sort ahead of the query itself,
    # which would then be recommended back to the caller.
    ranked = ranked[ranked != query_pos]

    top_indices = ranked[:max(n, 0)]
    return movies_df.iloc[top_indices][["title", "genre_primary", "genre_secondary", "language", "country_of_origin", "imdb_rating", "release_year", "rating", "content_type"]]


In [44]:
# Example usage: request recommendations for one title using the default n=5.
# The call is the cell's last expression, so the notebook displays the returned value.
recommend("Fire Family")

Unnamed: 0,title,genre_primary,genre_secondary,language,country_of_origin,imdb_rating,release_year,rating,content_type
484,Bright Queen,Drama,,French,USA,7.8,1998,5,Movie
899,Ice Love,Drama,,Japanese,USA,9.5,2001,5,Documentary
158,Battle Family,Drama,Mystery,Korean,India,7.7,2001,5,Movie
896,First Storm,Western,,English,USA,9.5,1995,5,Movie
661,Bright Family,Documentary,,English,USA,7.9,1997,5,Movie
