<a href="https://colab.research.google.com/github/OskarSko/SztucznaInteligencja_20538/blob/main/Rekomendacja_filmow_projekt_20516_20538.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import pandas as pd

movies_df = pd.read_csv('movies.csv')   # kolumny: movieId, title, genres
ratings_df = pd.read_csv('ratings.csv') # kolumny: userId, movieId, rating, timestamp

print("Liczba filmów:", len(movies_df))
print("Przykładowe filmy:\n", movies_df.head(10))
print("Liczba ocen:", len(ratings_df))
print("Przykładowe oceny:\n", ratings_df.head(10))

Liczba filmów: 9742
Przykładowe filmy:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   
5        6                         Heat (1995)   
6        7                      Sabrina (1995)   
7        8                 Tom and Huck (1995)   
8        9                 Sudden Death (1995)   
9       10                    GoldenEye (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
5                        Action|Crime|Thriller  
6                               Comedy|Romance  
7                

In [27]:
# Przygotwanie bazy uzytkownikow
user_likes = {}
like_threshold = 4.0 # dolny limit oceny

for user_id, group in ratings_df.groupby('userId'):
    liked_movies = set(group[group['rating'] >= like_threshold]['movieId'])
    user_likes[user_id] = liked_movies

# Funkcja podobienstwa pomiedzy dwoma bazami
import math
def cosine_similarity(set1, set2):
    if not set1 or not set2:
        return 0.0
    intersection_size = len(set1.intersection(set2))
    return intersection_size / math.sqrt(len(set1) * len(set2))

# Funkcja generująca rekomendacje na podstawie uztkownikow
def get_collaborative_recommendations(fav_movies_ids, top_neighbors=10):
    # Oblicz podobieństwo nowego użytkownika do każdego istniejącego użytkownika
    fav_set = set(fav_movies_ids)
    sims = []
    for user_id, liked in user_likes.items():
        sim = cosine_similarity(fav_set, liked)
        if sim > 0:  # bierzemy pod uwagę tylko użytkowników mających cokolwiek wspólnego
            sims.append((user_id, sim))
    sims.sort(key=lambda x: x[1], reverse=True)
    top_neighbors = min(top_neighbors, len(sims))
    top_users = [user_id for (user_id, sim) in sims[:top_neighbors]]

    # Zbior filmow nieznanych przez uzytkownika
    reco_scores = {}
    for user_id, sim in sims[:top_neighbors]:
        for movie_id in user_likes[user_id]:
            if movie_id in fav_set:
                continue
            reco_scores[movie_id] = reco_scores.get(movie_id, 0) + sim
    return reco_scores

In [29]:
# Przygotwanie bazy gatunkow
all_genres = set()
for genres_str in movies_df['genres']:
    for genre in genres_str.split('|'):
        all_genres.add(genre)
all_genres = sorted(all_genres)
print("Gatunki filmów w zbiorze:", all_genres)

genre_to_index = {genre: idx for idx, genre in enumerate(all_genres)}

movie_genre_vec = {}
for idx, row in movies_df.iterrows():
    mid = row['movieId']
    genres_str = row['genres']
    genre_vector = [0] * len(all_genres)
    for genre in genres_str.split('|'):
        if genre in genre_to_index:
            genre_vector[genre_to_index[genre]] = 1
    movie_genre_vec[mid] = genre_vector

import numpy as np

def build_user_profile(fav_movies_ids):
    profile_vec = np.zeros(len(all_genres))
    for mid in fav_movies_ids:
        if mid in movie_genre_vec:
            profile_vec += np.array(movie_genre_vec[mid])
    return profile_vec

# Funkcja do obliczenia podobieństwa użytkownika do filmu na podstawie gatunków
def content_similarity(profile_vec, movie_id):
    if movie_id not in movie_genre_vec:
        return 0.0
    movie_vec = np.array(movie_genre_vec[movie_id])
    dot = profile_vec.dot(movie_vec)
    normA = np.linalg.norm(profile_vec)
    normB = np.linalg.norm(movie_vec)
    if normA == 0 or normB == 0:
        return 0.0
    return dot / (normA * normB)

def get_content_recommendations(fav_movies_ids, top_n=10):
    profile_vec = build_user_profile(fav_movies_ids)
    scores = {}
    for mid, vec in movie_genre_vec.items():
        if mid in fav_movies_ids:
            continue
        sim = content_similarity(profile_vec, mid)
        if sim > 0:
            scores[mid] = sim
    if top_n is not None:
        top_n_movies = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
        scores = dict(top_n_movies)
    return scores

Gatunki filmów w zbiorze: ['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [30]:
def hybrid_recommendations(fav_movies_ids, top_n=10, w_cf=0.5, w_cb=0.5, neighbor_count=10):

    # Uzycie funkcji
    collab_scores = get_collaborative_recommendations(fav_movies_ids, top_neighbors=neighbor_count)
    content_scores = get_content_recommendations(fav_movies_ids, top_n=None)

    # Normalizacja wynikow
    if collab_scores:
        max_cf = max(collab_scores.values())
    else:
        max_cf = 0
    if content_scores:
        max_cb = max(content_scores.values())
    else:
        max_cb = 0

    # Tworzenie bazy z dwoma funkcjami
    candidate_movies = set(collab_scores.keys()) | set(content_scores.keys())
    candidate_movies -= set(fav_movies_ids)

    # Wynik w dwoch kategoriach
    hybrid_scores = []
    for mid in candidate_movies:
        cf_norm = collab_scores.get(mid, 0) / max_cf if max_cf > 0 else 0
        cb_norm = content_scores.get(mid, 0) / max_cb if max_cb > 0 else 0
        total_score = w_cf * cf_norm + w_cb * cb_norm
        hybrid_scores.append((mid, total_score))
    hybrid_scores.sort(key=lambda x: x[1], reverse=True)
    return hybrid_scores[:top_n]


In [34]:
# Funkcja pomocnicza do wyszukiwania movieId po tytule
def find_movie_id_by_partial_title(user_input_title):
    cleaned_input = user_input_title.strip().lower()

    for idx, row in movies_df.iterrows():
        full_title = row['title']
        title_only = full_title.split('(')[0].strip().lower()

        if cleaned_input in title_only:
            return int(row['movieId']), full_title

    return None, None




# Przykładowe ulubione filmy
favorite_titles = ["Howl's Moving Castle", "akira", "Ghost in the Shell", "My Neighbor Totoro"]




# Konwersja tytułów na id
favorite_ids = []
matched_titles = []

for title in favorite_titles:
    mid, matched_title = find_movie_id_by_partial_title(title)
    if mid:
        favorite_ids.append(mid)
        matched_titles.append(matched_title)
        print(f"✅ Dopasowano: '{title}' → '{matched_title}' (ID: {mid})")
    else:
        print(f"⚠️ Film '{title}' nie został znaleziony w bazie.")

print("\nUlubione filmy (movieId):", favorite_ids)

# Generowanie rekomendacji hybrydowych
recommendations = hybrid_recommendations(favorite_ids, top_n=10, w_cf=0.5, w_cb=0.5, neighbor_count=10)

print("\n🎬 Top rekomendacje dla użytkownika lubiącego:")
for title in matched_titles:
    print(f"• {title}")

print("\n📽️ Polecane filmy:")
for rank, (mid, score) in enumerate(recommendations, start=1):
    title = movies_df[movies_df['movieId'] == mid]['title'].values[0]
    print(f"{rank}. {title} (score={score:.3f})")

✅ Dopasowano: 'Howl's Moving Castle' → 'Howl's Moving Castle (Hauru no ugoku shiro) (2004)' (ID: 31658)
✅ Dopasowano: 'akira' → 'Akira (1988)' (ID: 1274)
✅ Dopasowano: 'Ghost in the Shell' → 'Ghost in the Shell (Kôkaku kidôtai) (1995)' (ID: 741)
✅ Dopasowano: 'My Neighbor Totoro' → 'My Neighbor Totoro (Tonari no Totoro) (1988)' (ID: 5971)

Ulubione filmy (movieId): [31658, 1274, 741, 5971]

🎬 Top rekomendacje dla użytkownika lubiącego:
• Howl's Moving Castle (Hauru no ugoku shiro) (2004)
• Akira (1988)
• Ghost in the Shell (Kôkaku kidôtai) (1995)
• My Neighbor Totoro (Tonari no Totoro) (1988)

📽️ Polecane filmy:
1. Spirited Away (Sen to Chihiro no kamikakushi) (2001) (score=0.962)
2. Nausicaä of the Valley of the Wind (Kaze no tani no Naushika) (1984) (score=0.802)
3. Princess Mononoke (Mononoke-hime) (1997) (score=0.675)
4. Laputa: Castle in the Sky (Tenkû no shiro Rapyuta) (1986) (score=0.666)
5. Lord of the Rings: The Return of the King, The (2003) (score=0.616)
6. Eternal Sunshine 

In [39]:
def find_movie_id_by_partial_title(user_input_title):
    cleaned_input = user_input_title.strip().lower()
    for idx, row in movies_df.iterrows():
        full_title = row['title']
        title_only = full_title.split('(')[0].strip().lower()
        if cleaned_input in title_only:
            return int(row['movieId']), full_title
    return None, None


def get_favorite_titles_from_user():
    titles = []
    print("🔤 Wpisuj tytuły filmów (wpisz pusty ENTER, aby zakończyć):")
    while True:
        title = input("Tytuł: ").strip()
        if title == "":
            break
        titles.append(title)
    return titles


def get_favorite_titles_from_file(filename):
    titles = []
    try:
        with open(filename, "r", encoding="utf-8") as f:
            titles = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"❌ Nie znaleziono pliku '{filename}'.")
    return titles

print("🎬 Jak chcesz podać ulubione filmy?")
print("1. Wpiszę ręcznie")
print("2. Wczytaj z pliku (np. moje_filmy.txt)")
choice = input("Wybór (1 lub 2): ").strip()

favorite_titles = []
if choice == "1":
    favorite_titles = get_favorite_titles_from_user()
elif choice == "2":
    filename = input("📂 Podaj nazwę pliku (np. ulubione.txt): ").strip()
    favorite_titles = get_favorite_titles_from_file(filename)
else:
    print("❌ Nieprawidłowy wybór.")

try:
    min_rating = float(input("\n🔎 Podaj minimalną ocenę filmu (np. 3.5): "))
except:
    print("⚠️ Błąd wejścia – ustawiono domyślnie 0.")
    min_rating = 0.0

favorite_ids = []
matched_titles = []

for title in favorite_titles:
    mid, matched_title = find_movie_id_by_partial_title(title)
    if mid:
        favorite_ids.append(mid)
        matched_titles.append(matched_title)
        print(f"✅ Dopasowano: '{title}' → '{matched_title}' (ID: {mid})")
    else:
        print(f"⚠️ Nie znaleziono filmu '{title}' w bazie.")

if favorite_ids:
    recommendations = hybrid_recommendations(
        favorite_ids, top_n=50, w_cf=0.5, w_cb=0.5, neighbor_count=10
    )

    avg_ratings = ratings_df.groupby("movieId")["rating"].mean().to_dict()

    filtered_recommendations = []
    for mid, score in recommendations:
        avg = avg_ratings.get(mid, 0)
        if avg >= min_rating:
            filtered_recommendations.append((mid, score, avg))

    print("\n🎯 Top rekomendacje dla użytkownika lubiącego:")
    for title in matched_titles:
        print(f"• {title}")

    print(f"\n📽️ Polecane filmy (średnia ocena ≥ {min_rating}):")
    for rank, (mid, score, avg) in enumerate(filtered_recommendations[:10], start=1):
        title = movies_df[movies_df["movieId"] == mid]["title"].values[0]
        print(f"{rank}. {title} (score={score:.3f}, avg_rating={avg:.2f})")
else:
    print("\n❌ Nie znaleziono żadnych dopasowanych filmów. Nie można wygenerować rekomendacji.")


🎬 Jak chcesz podać ulubione filmy?
1. Wpiszę ręcznie
2. Wczytaj z pliku (np. moje_filmy.txt)
Wybór (1 lub 2): 2
📂 Podaj nazwę pliku (np. ulubione.txt): ulubione_filmy.txt

🔎 Podaj minimalną ocenę filmu (np. 3.5): 3.0
✅ Dopasowano: 'Toy Story' → 'Toy Story (1995)' (ID: 1)
✅ Dopasowano: '10 cent' → '10 Cent Pistol (2015)' (ID: 139717)
✅ Dopasowano: 'Grumpier Old Men' → 'Grumpier Old Men (1995)' (ID: 3)
⚠️ Nie znaleziono filmu 'Spider-Man (2002)' w bazie.

🎯 Top rekomendacje dla użytkownika lubiącego:
• Toy Story (1995)
• 10 Cent Pistol (2015)
• Grumpier Old Men (1995)

📽️ Polecane filmy (średnia ocena ≥ 3.0):
1. Independence Day (a.k.a. ID4) (1996) (score=0.608, avg_rating=3.45)
2. Twelve Monkeys (a.k.a. 12 Monkeys) (1995) (score=0.601, avg_rating=3.98)
3. Willy Wonka & the Chocolate Factory (1971) (score=0.587, avg_rating=3.87)
4. Twister (1996) (score=0.576, avg_rating=3.32)
5. Happy Gilmore (1996) (score=0.542, avg_rating=3.44)
6. Toy Story 2 (1999) (score=0.533, avg_rating=3.86)
7. M