In [7]:
import pandas as pd

# Load movie data
movies = pd.read_csv('../data/movies.dat', 
                     sep='::', 
                     engine='python', 
                     names=['movieId', 'title', 'genres'],
                     encoding='latin-1')

# Load ratings data
ratings = pd.read_csv('../data/ratings.dat', 
                      sep='::', 
                      engine='python', 
                      names=['userId', 'movieId', 'rating', 'timestamp'],
                      encoding='latin-1')

# Load user data
users = pd.read_csv('../data/users.dat', 
                    sep='::', 
                    engine='python', 
                    names=['userId', 'gender', 'age', 'occupation', 'zipCode'],
                    encoding='latin-1')

# Display top rows
print("🎬 Movies:")
print(movies.head(), '\n')

print("⭐ Ratings:")
print(ratings.head(), '\n')

print("👤 Users:")
print(users.head())


🎬 Movies:
   movieId                               title                        genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy 

⭐ Ratings:
   userId  movieId  rating  timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291 

👤 Users:
   userId gender  age  occupation zipCode
0       1      F    1          10   48067
1       2      M   56          16   70072
2       3      M   25          15   55117
3       4      M   45           7   02460
4       5      M   25          20   55455


In [2]:
!pip install pandas


Collecting pandas
  Using cached pandas-2.3.1-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.3.2-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.1-cp313-cp313-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/11.0 MB 1.8 MB/s eta 0:00:06
   ---- ----------------------------------- 1.3/11.0 MB 2.4 MB/s eta 0:00:04
   ------ --------------------------------- 1.8/11.0 MB 2.6 MB/s eta 0:00:04
   ---------- ----------------------------- 2.9/11.0 MB 3.1 MB/s eta 0:00:03
   -----------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import numpy as np

# 1. Convert timestamps to readable dates
ratings['datetime'] = pd.to_datetime(ratings['timestamp'], unit='s')

# 2. Extract year from movie title (e.g. "Toy Story (1995)")
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(float)

# 3. Create a list of genres per movie
movies['genre_list'] = movies['genres'].str.split('|')

# 4. Merge ratings with movie info
ratings_with_movies = pd.merge(ratings, movies, on='movieId')

# 5. Optional: Merge user data for future filtering (age, gender, etc.)
full_data = pd.merge(ratings_with_movies, users, on='userId')

# 6. Quick checks
print("✅ Merged Ratings with Movie Info:")
print(ratings_with_movies.head(), '\n')

print("✅ Merged with User Info (full_data):")
print(full_data.head(), '\n')

print("🧹 Missing Values Summary:")
print(full_data.isnull().sum())


✅ Merged Ratings with Movie Info:
   userId  movieId  rating  timestamp            datetime  \
0       1     1193       5  978300760 2000-12-31 22:12:40   
1       1      661       3  978302109 2000-12-31 22:35:09   
2       1      914       3  978301968 2000-12-31 22:32:48   
3       1     3408       4  978300275 2000-12-31 22:04:35   
4       1     2355       5  978824291 2001-01-06 23:38:11   

                                    title                        genres  \
0  One Flew Over the Cuckoo's Nest (1975)                         Drama   
1        James and the Giant Peach (1996)  Animation|Children's|Musical   
2                     My Fair Lady (1964)               Musical|Romance   
3                  Erin Brockovich (2000)                         Drama   
4                    Bug's Life, A (1998)   Animation|Children's|Comedy   

     year                        genre_list  
0  1975.0                           [Drama]  
1  1996.0  [Animation, Children's, Musical]  
2  1964.0 

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Replace '|' with space so TF-IDF can tokenize properly
movies['genres_cleaned'] = movies['genres'].str.replace('|', ' ', regex=False)

# Step 2: Create TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres_cleaned'])

# Step 3: Compute cosine similarity between all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 4: Create reverse mapping of movie title to index
movie_indices = pd.Series(movies.index, index=movies['title'])

# Step 5: Recommendation function
def recommend_movies(title, num_recommendations=10):
    if title not in movie_indices:
        return f"'{title}' not found in database."

    idx = movie_indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices_rec = [i[0] for i in sim_scores]
    return movies[['title', 'genres']].iloc[movie_indices_rec]

# Step 6: Try it out
recommend_movies("Toy Story (1995)")


Unnamed: 0,title,genres
1050,Aladdin and the King of Thieves (1996),Animation|Children's|Comedy
2072,"American Tail, An (1986)",Animation|Children's|Comedy
2073,"American Tail: Fievel Goes West, An (1991)",Animation|Children's|Comedy
2285,"Rugrats Movie, The (1998)",Animation|Children's|Comedy
2286,"Bug's Life, A (1998)",Animation|Children's|Comedy
3045,Toy Story 2 (1999),Animation|Children's|Comedy
3542,Saludos Amigos (1943),Animation|Children's|Comedy
3682,Chicken Run (2000),Animation|Children's|Comedy
3685,"Adventures of Rocky and Bullwinkle, The (2000)",Animation|Children's|Comedy
12,Balto (1995),Animation|Children's


In [10]:
!pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.7.1-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.1-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.7 MB 3.1 MB/s eta 0:00:03
   ------ --------------------------------- 1.3/8.7 MB 3.7 MB/s eta 0:00:02
   ---------- ----------------------------- 2.4/8.7 MB 4.3 MB/s eta 0:00:02
   --------------- ------------------------ 3.4/8.7 MB 4.6 MB/s eta 0:00:02
   --------------------- ------------------ 4.7/8.7 MB 4.9 MB/s eta 0:00:01
   ---------------------------- -----------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import pandas as pd

# Step 1: Load data into Surprise format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Step 2: Train/test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Step 3: Build and train SVD model
model = SVD()
model.fit(trainset)

# Step 4: Function to get top N movie recommendations for a user
def get_collab_recommendations(user_id, num_recs=10):
    # Get all movieIds
    movie_ids = ratings['movieId'].unique()
    # Filter out movies already rated by this user
    rated_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    unseen = [mid for mid in movie_ids if mid not in rated_movies]

    # Predict ratings for unseen movies
    predictions = [model.predict(user_id, movie_id) for movie_id in unseen]
    # Sort by estimated rating
    predictions.sort(key=lambda x: x.est, reverse=True)

    top_movie_ids = [pred.iid for pred in predictions[:num_recs]]
    return movies[movies['movieId'].isin(top_movie_ids)][['title', 'genres']]

# Step 5: Try it
get_collab_recommendations(user_id=1)


ModuleNotFoundError: No module named 'surprise'

In [13]:
!pip install scikit-surprise


Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  Getting requirements to build wheel did not run successfully.
  exit code: 1
  
  [45 lines of output]
  
  Error compiling Cython file:
  ------------------------------------------------------------
  ...
          self.avg_cltr_i = avg_cltr_i
          self.avg_cocltr = avg_cocltr
  
          return self
  
      def compute_averages(self, np.ndarray[np.int_t] cltr_u,
                                               ^
  ------------------------------------------------------------
  
  surprise\prediction_algorithms\co_clustering.pyx:157:45: Invalid type.
  Compiling surprise/similarities.pyx because it changed.
  Compiling surprise/prediction_algorithms/matrix_factorization.pyx because it changed.
  Compiling surprise/prediction_algorithms/optimize_baselines.pyx because it changed.
  Compiling surprise/prediction_algorithms/slope_one.pyx because it changed.
  Compiling surprise/prediction_algorithms/co_clustering.pyx because it changed.
  [1/

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a pivot table: users as rows, movies as columns
user_movie_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')

# Fill missing ratings with 0
user_movie_matrix_filled = user_movie_matrix.fillna(0)

# Compute cosine similarity between users
user_similarity = cosine_similarity(user_movie_matrix_filled)

# Convert similarity to DataFrame
user_sim_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

# Function to recommend movies based on similar users
def user_based_recommendations(user_id, num_recommendations=10):
    if user_id not in user_sim_df.index:
        return "User not found."

    # Find most similar users
    similar_users = user_sim_df[user_id].sort_values(ascending=False)[1:]

    # Get movies watched by similar users, not yet watched by target user
    target_user_movies = set(ratings[ratings['userId'] == user_id]['movieId'])
    similar_users_ids = similar_users.index.tolist()

    movie_scores = {}

    for sim_user in similar_users_ids:
        sim_user_ratings = ratings[ratings['userId'] == sim_user]
        for _, row in sim_user_ratings.iterrows():
            if row['movieId'] not in target_user_movies:
                movie_scores.setdefault(row['movieId'], []).append(row['rating'])

    # Average the scores and sort
    averaged_scores = {movie: sum(ratings)/len(ratings) for movie, ratings in movie_scores.items()}
    recommended_movie_ids = sorted(averaged_scores.items(), key=lambda x: x[1], reverse=True)[:num_recommendations]

    top_movie_ids = [movie_id for movie_id, _ in recommended_movie_ids]
    return movies[movies['movieId'].isin(top_movie_ids)][['title', 'genres']]

# Try it!
user_based_recommendations(user_id=1)


Unnamed: 0,title,genres
777,"Gate of Heavenly Peace, The (1995)",Documentary
977,Schlafes Bruder (Brother of Sleep) (1995),Drama
1762,Follow the Bitch (1998),Comedy
3103,Ulysses (Ulisse) (1954),Adventure
3164,Smashing Time (1967),Comedy
3211,"Baby, The (1973)",Horror
3313,Song of Freedom (1936),Drama
3538,One Little Indian (1973),Comedy|Drama|Western
3587,Lured (1947),Crime
3811,Bittersweet Motel (2000),Documentary


In [16]:
user_based_recommendations(1)


Unnamed: 0,title,genres
777,"Gate of Heavenly Peace, The (1995)",Documentary
977,Schlafes Bruder (Brother of Sleep) (1995),Drama
1762,Follow the Bitch (1998),Comedy
3103,Ulysses (Ulisse) (1954),Adventure
3164,Smashing Time (1967),Comedy
3211,"Baby, The (1973)",Horror
3313,Song of Freedom (1936),Drama
3538,One Little Indian (1973),Comedy|Drama|Western
3587,Lured (1947),Crime
3811,Bittersweet Motel (2000),Documentary
