In [12]:
# Import dependencies
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import hstack
from sklearn.metrics import pairwise_distances

# NOTE:  Before running code below, download the following:  nltk.download('punkt')
# nltk.download('punkt')


# Features = genres, overview, creator names

## KNN Model

In [13]:
# Load the dataframe
shows = pd.read_csv('../csv/tv_shows_data_cleaned.csv')

# Preprocessing
shows = shows.drop(["id", "original_name", "number_of_seasons", "first_air_date", "last_air_date",
                        "in_production", "origin_country", "original_language", "tagline",
                        "production_names", "production_logo_paths", "production_origin_countries", "poster_path"], axis=1)

# Fill missing values with an empty string
shows["overview"].fillna("", inplace=True)  
shows["genre_name"].fillna("", inplace=True) 
shows["creator_names"].fillna("", inplace=True) 

# Text vectorization
overview_vectorizer = TfidfVectorizer(stop_words="english")
overview_vectors = overview_vectorizer.fit_transform(shows["overview"])

genre_vectorizer = TfidfVectorizer()
genre_vectors = genre_vectorizer.fit_transform(shows["genre_name"])

creator_vectorizer = TfidfVectorizer()
creator_vectors = creator_vectorizer.fit_transform(shows["creator_names"])

# Concatenate feature vectors
combined_vectors = hstack((overview_vectors, genre_vectors, creator_vectors))

# Convert sparse matrix to dense array
combined_vectors = combined_vectors.toarray()

# Build the similarity calculation
knn_model = NearestNeighbors(metric="correlation")
knn_model.fit(combined_vectors)


# Define recommendation function
def knn_recommend_similar_shows(show_title, num_recommendations=5):
    show_index = shows[shows["name"] == show_title].index[0]
    # Convert the query vector to dense array
    query_vector = combined_vectors[show_index].reshape(1, -1)
    # Calculate pairwise distances using correlation metric
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=num_recommendations+1)
    recommended_show_indices = indices.flatten()[1:]
    recommended_shows = shows.loc[recommended_show_indices, "name"]
    return recommended_shows


### Recommendations

In [14]:
# KNN: Make recommendation
input_show = "Game of Thrones"
recommendations = knn_recommend_similar_shows(input_show)
print(f"KNN Model recommended shows similar to '{input_show}':")
print(recommendations)

KNN Model recommended shows similar to 'Game of Thrones':
3356                                Fate/Apocrypha
1595                      The Chronicles of Narnia
821                              Man from Atlantis
3204    Marvel's Agents of S.H.I.E.L.D.: Slingshot
2435                        H+: The Digital Series
Name: name, dtype: object


# Features = genres, overview, creator names & POPULARITY

## KNN Model

In [15]:
# Load the dataframe
shows_2 = pd.read_csv('../csv/tv_shows_data_cleaned.csv')

# Preprocessing
shows_2 = shows_2.drop(["id", "original_name", "number_of_seasons", "first_air_date", "last_air_date",
                        "in_production", "origin_country", "original_language", "tagline",
                        "production_names", "production_logo_paths", "production_origin_countries", "poster_path"], axis=1)

# Fill missing values with an empty string
shows_2["overview"].fillna("", inplace=True)  
shows_2["genre_name"].fillna("", inplace=True) 
shows_2["creator_names"].fillna("", inplace=True) 
shows_2["popularity"].fillna("", inplace=True) 

# Text vectorization
overview_vectorizer = TfidfVectorizer(stop_words="english")
overview_vectors = overview_vectorizer.fit_transform(shows_2["overview"])

genre_vectorizer = TfidfVectorizer()
genre_vectors = genre_vectorizer.fit_transform(shows_2["genre_name"])

creator_vectorizer = TfidfVectorizer()
creator_vectors = creator_vectorizer.fit_transform(shows_2["creator_names"])

# Add popularity as a separate feature vector
popularity_vector = shows_2["popularity"].values.reshape(-1, 1)

# Concatenate feature vectors
combined_vectors = hstack((overview_vectors, genre_vectors, creator_vectors, popularity_vector))

# Convert sparse matrix to dense array
combined_vectors = combined_vectors.toarray()

# Build the similarity calculation
knn_model = NearestNeighbors(metric="correlation")
knn_model.fit(combined_vectors)


# Define recommendation function
def knn_recommend_similar_shows_2(show_title, num_recommendations=5):
    show_index = shows_2[shows_2["name"] == show_title].index[0]
    # Convert the query vector to dense array
    query_vector = combined_vectors[show_index].reshape(1, -1)
    # Calculate pairwise distances using correlation metric
    distances, indices = knn_model.kneighbors(query_vector, n_neighbors=num_recommendations+1)
    recommended_show_indices = indices.flatten()[1:]
    recommended_shows = shows_2.loc[recommended_show_indices, "name"]
    return recommended_shows


### Recommendations v2

In [16]:
# KNN: Make recommendation v2
input_show = "Game of Thrones"
recommendations = knn_recommend_similar_shows_2(input_show)
print(f"KNN Model recommended shows similar to '{input_show}':")
print(recommendations)

KNN Model recommended shows similar to 'Game of Thrones':
3760                 The Mandalorian
2670                       The Flash
4472                  Binge Reloaded
4065               Dog's Most Wanted
4461    Palmashow - Very Bad Blagues
Name: name, dtype: object
