In [59]:
# Import
import pandas as pd

df_ratings = pd.read_csv('movies_ratings.csv')
df_titles = pd.read_csv('movies_titles.csv')
df_users = pd.read_csv('movies_users.csv')

In [60]:
df_titles.drop('director', axis=1, inplace=True)
df_titles.dropna(inplace=True)
df_ratings = df_ratings[df_ratings['show_id'].isin(df_titles['show_id'])]

#### Genre vectorization

In [61]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Set up genre matrix
# Reset index so similarity lookup matches
df_titles = df_titles.reset_index(drop=True)

# Now it's safe to use row index for similarity lookup
genre_cols = df_titles.loc[:, "Action":"Thrillers"]
genre_matrix = genre_cols.to_numpy()
similarity_matrix = cosine_similarity(genre_matrix)

# Compute cosine similarity across all titles
similarity_matrix = cosine_similarity(genre_matrix)

# Function: recommend similar movies based on title
def recommend_similar(title, top_n=10):
    target_row = df_titles[df_titles["title"] == title]
    if target_row.empty:
        return []
    idx = target_row.index[0]
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_indices = [i for i, score in sim_scores[1:top_n+1]]
    return df_titles.iloc[top_indices][["title", "show_id"]]

# Example usage
recommend_similar("The Super Mario Bros. Super Show!")

Unnamed: 0,title,show_id
40,Numberblocks,s66
56,Octonauts: Above & Beyond,s99
59,Tayo the Little Bus,s105
65,Sharkdog,s112
72,Pororo - The Little Penguin,s125
95,Kid-E-Cats,s154
155,Go! Go! Cory Carson,s255
157,Mother Goose Club,s264
162,Winx Club,s270
164,Fast & Furious Spy Racers,s273


In [58]:
def extract_genres(df):
    # Columns that are definitely not genres
    non_genre_cols = [
        'show_id', 'type', 'title', 'cast', 'country',
        'release_year', 'rating', 'duration', 'description'
    ]

    # Get all int64 columns that aren't in non-genre list
    genre_cols = [col for col in df.select_dtypes(include='int64').columns if col not in non_genre_cols]

    # Combine active genres into a string
    df['genres'] = df[genre_cols].apply(
        lambda row: ', '.join([col for col in genre_cols if row[col] == 1]), axis=1
    )

    return df

In [None]:
df_titles = extract_genres(df_titles)
df_titles.head()

                           title                 genres
0           Dick Johnson Is Dead          Documentaries
1                  Blood & Water      Dramas, TV Dramas
2                   Kota Factory  Comedies, TV Comedies
3                        Sankofa                 Dramas
4  The Great British Baking Show             Reality TV


In [67]:
recs = []

for idx, row in df_titles.iterrows():
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # skip itself
    for sim_idx, score in sim_scores:
        recs.append({
            "show_id": row["show_id"],
            "recommended_id": df_titles.iloc[sim_idx]["show_id"],
            "genre": df_titles.iloc[sim_idx]["genres"],
            "score": round(score, 4)
        })

df_recs = pd.DataFrame(recs)

In [68]:
import sqlite3

conn = sqlite3.connect("movie_recommendations_titles.db")

# Save titles table (optional if already in DB)
df_titles.to_sql("genres_recs", conn, if_exists="replace", index=False)

# Save the recommendations
df_recs.to_sql("genre_recommendations", conn, if_exists="replace", index=False)

76130

### Genre clustering

In [31]:
genre_cols = df_titles.loc[:, "Action":"Thrillers"]
ratings_encoded = pd.get_dummies(df_titles["rating"], prefix="rating")

features = pd.concat([genre_cols, ratings_encoded], axis=1)
feature_matrix = features.to_numpy()

# Step 3: Compute cosine similarity between all titles
similarity_matrix = cosine_similarity(feature_matrix)

In [32]:
recs = []

for idx, row in df_titles.iterrows():
    sim_scores = list(enumerate(similarity_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:11]  # skip self
    for sim_idx, score in sim_scores:
        recs.append({
            "show_id": row["show_id"],
            "recommended_id": df_titles.iloc[sim_idx]["show_id"],
            "score": round(score, 4)
        })

df_recs = pd.DataFrame(recs)

In [33]:
# Connect to SQLite
conn = sqlite3.connect("movie_recommendations_genres.db")

# Save titles (optional if already saved elsewhere)
df_titles.to_sql("movies_titles", conn, if_exists="replace", index=False)

# Save recommendations
df_recs.to_sql("content_recommendations", conn, if_exists="replace", index=False)

conn.close()

In [34]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import pandas as pd

# Convert your ratings DataFrame to Surprise format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[["user_id", "show_id", "rating"]], reader)

# Train on the whole set (since it's pre-login)
trainset = data.build_full_trainset()
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x129db24e0>

In [35]:
# Get all show_ids
all_movie_ids = df_titles["show_id"].unique()

# Predict ratings for a dummy user_id (e.g. 'guest')
predictions = [(movie_id, model.predict("guest", movie_id).est) for movie_id in all_movie_ids]

# Sort by estimated rating
top_recs = sorted(predictions, key=lambda x: x[1], reverse=True)[:10]

# Join with titles
top_movie_ids = [movie_id for movie_id, _ in top_recs]
top_titles = df_titles[df_titles["show_id"].isin(top_movie_ids)][["title", "show_id"]]

In [36]:
top_titles

Unnamed: 0,title,show_id
36,Naruto Shippuden: The Movie: The Lost Tower,s61
96,Kuroko's Basketball,s155
692,Durarara!!,s1041
1533,Challenger,s1991
2027,Parasyte: The Maxim,s2526
3292,Traitors,s3970
4401,Equestria Girls: Tales of Canterlot High,s5238
4654,Tramps,s5525
4872,DEATH NOTE,s5752
5132,A Night at the Roxbury,s6050


In [37]:
guest_recs = []
for mid in df_titles["show_id"].unique():
    pred = model.predict("guest", mid).est
    guest_recs.append((mid, pred))

# Top N overall
top = sorted(guest_recs, key=lambda x: x[1], reverse=True)[:50]
top_df = df_titles[df_titles["show_id"].isin([mid for mid, _ in top])]
top_df["genre"] = "TopPicks"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_df["genre"] = "TopPicks"


In [39]:
genre_recs = []
for genre in ["Action", "Adventure", "Children", "Comedies", "Documentaries", "Dramas", 
 "Fantasy", "Horror Movies", "Musicals", "Reality TV", "Thrillers"]:
    genre_ids = df_titles[df_titles[genre] == 1]["show_id"]
    genre_top = [r for r in guest_recs if r[0] in genre_ids][:10]
    ids = [mid for mid, _ in genre_top]
    genre_df = df_titles[df_titles["show_id"].isin(ids)].copy()
    genre_df["genre"] = genre
    genre_recs.append(genre_df)

final_df = pd.concat([top_df] + genre_recs)

In [41]:
conn = sqlite3.connect("recommendations_collab_b4Logon.db")
final_df[["genre", "title", "show_id"]].to_sql("Recommendations", conn, if_exists="replace", index=False)

50