In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from pathlib import Path
# Apply seaborn's default plotting theme to the figures drawn in this notebook.
sns.set()


In [2]:
# Load the three raw CSV tables, in order: ratings, movies, tags.
ratings, movies, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/raw/ratings.csv",
        "../data/raw/movies.csv",
        "../data/raw/tags.csv",
    )
)

In [3]:
# --- Remove movies that have no genre listed ---
# .copy() detaches the filtered result so later column assignments act on an
# independent frame rather than on a slice of the original.
movies = movies.loc[
    movies['genres'].ne('(no genres listed)')
].copy()

In [4]:
# --- Per-movie rating statistics ---
# Mean rating and number of ratings for every movieId present in `ratings`.
rating_stats = ratings.groupby('movieId').agg(
    mean_rating=('rating', 'mean'),
    rating_count=('rating', 'count')
).reset_index()

# Remove statistics columns left over from an earlier run of this cell: merging
# on top of them would create mean_rating_x/_y style suffixed columns and the
# column lookups below would then raise KeyError.
movies = (
    movies
    .drop(columns=['mean_rating', 'rating_count'], errors='ignore')
    .merge(rating_stats, on='movieId', how='left')
)
# The left join keeps movies without any rating: their count becomes 0,
# while their mean rating stays NaN.
movies['rating_count'] = movies['rating_count'].fillna(0).astype(int)
movies['mean_rating'] = movies['mean_rating'].astype(float)

In [5]:
# --- Tag cleaning and grouping ---
def clean_tag(t: str) -> str:
    """Normalize a single tag.

    The value is converted with ``str()``, lowercased, stripped of leading and
    trailing whitespace, and every internal run of whitespace is collapsed to
    one space.

    Args:
        t: The raw tag value (anything accepted by ``str()``).

    Returns:
        The normalized tag; an empty string if nothing but whitespace remains.
    """
    lowered = str(t).lower()
    # split() without a separator ignores leading/trailing whitespace and
    # treats any whitespace run as a single delimiter.
    words = lowered.split()
    return " ".join(words)

if not tags.empty:
    # Drop missing tags first: astype(str) would otherwise turn NaN into the
    # literal string "nan", which passes the empty-string filter below and
    # ends up attached to the movie as if it were a real tag.
    tags = tags.dropna(subset=['tag']).copy()
    tags['tag'] = tags['tag'].astype(str).apply(clean_tag)
    tags = tags[tags['tag'] != '']  # discard tags that are empty after cleaning
    # One row per movie: all of its tags joined into one space-separated string.
    tags_grouped = tags.groupby('movieId')['tag'].agg(" ".join).reset_index()
else:
    # Empty placeholder whose key dtype matches movies['movieId'], so both
    # sides of the merge below use the same key type.
    tags_grouped = pd.DataFrame({
        'movieId': pd.Series(dtype=movies['movieId'].dtype),
        'tag': pd.Series(dtype='object'),
    })

# Remove a 'tag' column left over from an earlier run of this cell: merging on
# top of it would create tag_x/tag_y and the lookup below would raise KeyError.
movies = (
    movies
    .drop(columns=['tag'], errors='ignore')
    .merge(tags_grouped, on='movieId', how='left')
)
# Movies without any tag get an empty string instead of NaN.
movies['tag'] = movies['tag'].fillna('')

In [6]:
# Lowercase genre tokens separated by spaces instead of the raw '|' separator.
movies['genres_tok'] = (
    movies['genres']
    .str.lower()
    .str.replace('|', ' ', regex=False)
)

In [7]:
# --- Build one-hot (dummy) columns for the genres ---
# Every space-separated token in 'genres_tok' becomes its own indicator column;
# the 'genre_' prefix makes it clear that these columns are genres.
genre_dummies = (
    movies['genres_tok']
    .str.get_dummies(sep=' ')
    .add_prefix('genre_')
)

# Attach the dummy columns to the main DataFrame, side by side.
movies = pd.concat(
    [movies, genre_dummies],
    axis=1,
)

print(f"Kreirano {genre_dummies.shape[1]} žanrovskih dummy kolona")

Kreirano 19 žanrovskih dummy kolona


In [8]:
# Drop the columns made redundant by the one-hot genre columns; a column that
# is already absent is skipped instead of raising.
movies = movies.drop(
    ['genres', 'genres_tok'],
    axis=1,
    errors='ignore',
)

In [9]:
# Write the merged movie table to a Parquet file (row index not stored).
output_path = Path("../data/processed/movies_merged.parquet")
# Make sure the target directory exists: the write fails when
# ../data/processed is missing (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)
movies.to_parquet(output_path, index=False)
print("Sačuvano:", output_path)

Sačuvano: ..\data\processed\movies_merged.parquet


In [10]:
# Sanity check: print (rows, columns) of the final table, then show the first rows.
print((len(movies.index), len(movies.columns)))
movies.head(n=5)

(9708, 24)


Unnamed: 0,movieId,title,mean_rating,rating_count,tag,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,...,genre_film-noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_thriller,genre_war,genre_western
0,1,Toy Story (1995),3.92093,215,pixar pixar fun,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),3.431818,110,fantasy magic board game robin williams game,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),3.259615,52,moldy old,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),2.357143,7,,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),3.071429,49,pregnancy remake,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
