In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Location of the TV-show catalogue CSV read by this notebook.
DATASET_PATH = "C:/Users/Sergiu/Desktop/Big Data Project/dataset/dataset.csv"
df = pd.read_csv(DATASET_PATH)

# Replace missing text with empty strings so the columns can be concatenated.
for text_column in ("genres", "summary"):
    df[text_column] = df[text_column].fillna("")

# Build the combined free-text column only when the CSV does not already have one.
content_present = "content" in df.columns
if not content_present:
    df["content"] = df["genres"].add(" ").add(df["summary"])

# Preview the first rows (last expression, so the notebook renders it).
df.head()


Unnamed: 0,name,genres,summary,rating_value,release_year,content
0,Carol Burnett & Company,Comedy,"Music, songs, and comedy sketches.",,1979.0,"Comedy Music, songs, and comedy sketches."
1,Carla Cametti PD,,This six-part Australian crime series is cente...,,2009.0,This six-part Australian crime series is cent...
2,The Carol Burnett Show,"Comedy, Music",CBS brought back The Carol Burnett Show for an...,,1991.0,"Comedy, Music CBS brought back The Carol Burne..."
3,Carrier,,A character-driven immersion in the high-stake...,,2008.0,A character-driven immersion in the high-stak...
4,Carnival Cravings with Anthony Anderson,Food,There isn't much you can't wrap with bacon or ...,,2015.0,Food There isn't much you can't wrap with baco...


In [None]:
# Collect the viewer's preferences interactively, in this order:
# genres, earliest year, minimum rating, keywords.
# List-style answers are lower-cased and split on commas (items are not stripped here).
genre_answer = input("🎭 Preferred genres? (comma-separated, e.g. Drama, Thriller): ")
preferred_genres = genre_answer.lower().split(",")

# int()/float() raise ValueError if the answer is not a number.
year_answer = input("📅 Earliest release year? (e.g. 2010): ")
min_year = int(year_answer)

rating_answer = input("⭐ Minimum rating (0–10)? (e.g. 7.0): ")
min_rating = float(rating_answer)

keyword_answer = input("🔍 Keywords/themes you enjoy? (comma-separated, e.g. revenge, family, heist): ")
keywords = keyword_answer.lower().split(",")


🎭 Preferred genres? (comma-separated, e.g. Drama, Thriller):  History
📅 Earliest release year? (e.g. 2010):  1999
⭐ Minimum rating (0–10)? (e.g. 7.0):  0
🔍 Keywords/themes you enjoy? (comma-separated, e.g. revenge, family, heist):  family


In [None]:
# Build `filtered_df`: the rows of `df` that satisfy the viewer's year, genre
# and rating preferences, with the requested keywords appended to `content`.
# All masks are computed against `df` and combined once, so re-running this
# cell always starts from the unfiltered data.

# Year filter: rows with a missing release year compare False and are dropped.
year_mask = df["release_year"] >= min_year

# Genre filter: case-insensitive substring match against any requested genre.
# Blank entries (e.g. from a trailing comma) are ignored, because "" is a
# substring of every string and would silently disable the filter. A fully
# blank answer means "no genre preference".
genre_terms = [genre.strip() for genre in preferred_genres if genre.strip()]
if genre_terms:
    # Building the mask explicitly with dtype=bool keeps it boolean even when
    # there are zero rows. Series.apply would return an object-dtype Series
    # there, which DataFrame indexing interprets as a list of column labels.
    genres_lower = df["genres"].fillna("").str.lower()
    genre_mask = pd.Series(
        [any(term in g for term in genre_terms) for g in genres_lower],
        index=df.index,
        dtype=bool,
    )
else:
    genre_mask = pd.Series(True, index=df.index)

# Rating filter: keep shows that meet the minimum rating or have no rating.
if "rating_value" in df.columns:
    rating_mask = df["rating_value"].isna() | (df["rating_value"] >= min_rating)
else:
    rating_mask = pd.Series(True, index=df.index)

# Take an explicit copy so the column assignment below writes to an
# independent frame instead of a slice of `df` (avoids SettingWithCopyWarning).
filtered_df = df.loc[year_mask & genre_mask & rating_mask].copy()

# Append the keywords to every row's content in a single vectorised step.
# NOTE(review): the same terms are added to every row, so this may not change
# the relative ranking between shows — confirm it has the intended effect.
keyword_suffix = " ".join(kw.strip() for kw in keywords if kw.strip())
if keyword_suffix:
    filtered_df["content"] = filtered_df["content"] + " " + keyword_suffix


In [None]:
# Build the similarity model used by recommend():
#   tfidf_matrix - TF-IDF vectors of filtered_df["content"] (English stop words removed)
#   cosine_sim   - pairwise cosine similarity between every pair of rows
#   indices      - lower-cased show name -> row position in filtered_df

# Fail early with an actionable message: the vectorizer cannot be fitted on
# zero documents and would otherwise raise an opaque "empty vocabulary" error.
if filtered_df.empty:
    raise ValueError("No shows match the selected filters; relax them and re-run.")

# Renumber rows 0..n-1 first so DataFrame labels equal the row positions of
# the TF-IDF and similarity matrices.
filtered_df = filtered_df.reset_index(drop=True)

# Vectorize the content column
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(filtered_df["content"])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Title lookup. When several shows share a name, keep only the first one:
# a duplicated label would make `indices[title]` return a Series instead of a
# single row position.
title_keys = filtered_df["name"].str.lower()
indices = pd.Series(filtered_df.index, index=title_keys)
indices = indices[~indices.index.duplicated(keep="first")]


In [None]:
def recommend(show_title, num_recommendations=5):
    """Return the shows most similar to ``show_title``.

    Relies on three notebook-level objects: ``indices`` (lower-cased name ->
    row position), ``cosine_sim`` (pairwise similarity matrix) and
    ``filtered_df`` (the filtered catalogue the matrix was built from).

    Parameters
    ----------
    show_title : str
        Title to look up; matching ignores case and surrounding whitespace.
    num_recommendations : int, default 5
        Maximum number of shows to return. Values <= 0 give an empty frame.

    Returns
    -------
    pandas.DataFrame or str
        The ``name``, ``genres``, ``rating_value`` and ``summary`` columns of
        the most similar shows, best match first; or an error message string
        when the title is not present in ``indices``.
    """
    show_title = show_title.strip().lower()

    if show_title not in indices:
        return f"❌ '{show_title}' not found in the filtered dataset."

    idx = indices[show_title]
    # Duplicate titles make the lookup return a Series; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the queried show by position rather than assuming it sorts first:
    # another show with identical similarity (a tie at 1.0) could precede it,
    # in which case slicing off element 0 would drop the wrong row.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    limit = max(num_recommendations, 0)
    show_indices = [position for position, _ in sim_scores[:limit]]

    return filtered_df[["name", "genres", "rating_value", "summary"]].iloc[show_indices]


In [None]:
# Ask for a seed title and print up to five similar shows.
user_title = input("🎬 Enter a TV show you like: ")
results = recommend(user_title, num_recommendations=5)

# recommend() returns a plain string when the title is unknown;
# otherwise it returns a DataFrame with one row per recommended show.
if not isinstance(results, str):
    for row_label, show in results.iterrows():
        title = show["name"]
        genres = show["genres"]
        rating = show["rating_value"]
        print(f"\n🎬 {title}")
        print(f"   📚 Genres: {genres}")
        print(f"   ⭐ Rating: {rating}")
        # Only the first 300 characters of the summary are shown.
        snippet = show["summary"][:300]
        print(f"   📝 Summary: {snippet}...\n")
else:
    print(results)


🎬 Enter a TV show you like:  Nostradamus: End of Days



🎬 Ancient Impossible
   📚 Genres: History
   ⭐ Rating: nan
   📝 Summary: Ancient Impossible, the H2 series, picks up where History's long running "Ancient Discoveries" left off. In this next generation of storytelling, Ancient Impossible reveals how many of today's technological achievements were actually developed centuries ago. Colossal monuments, impossible feats of e...


🎬 Hunting Nazi Treasure
   📚 Genres: History
   ⭐ Rating: nan
   📝 Summary: This exciting series chronicles one of the greatest heists in history and the present-day efforts to locate valuable objects and artwork stolen by the Nazis at the end of World War II and return them to their rightful owners. With hundreds of thousands of items worth billions still missing to this d...


🎬 Curse of the Ancients with Alice Roberts
   📚 Genres: History
   ⭐ Rating: nan
   📝 Summary: In this exciting five-part history series, Curse of the Ancients Professor Alice Roberts reveals how scientists are unearthing the evidence for