<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/Factorization_Machine_Recommendation_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import os
import requests
import zipfile
import io

# --- 0. Dataset Download Function ---
def download_and_extract_movielens(url="http://files.grouplens.org/datasets/movielens/ml-100k.zip", target_dir="ml-100k"):
    """
    Downloads the MovieLens 100K dataset and extracts it to a specified directory.
    """
    if os.path.exists(target_dir):
        print(f"Dataset directory '{target_dir}' already exists. Skipping download.")
        return

    print(f"Downloading MovieLens 100K dataset from {url}...")
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status() # Raise an exception for bad status codes

        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            z.extractall(target_dir)
        print(f"Dataset downloaded and extracted to '{target_dir}'.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading dataset: {e}")
        print("Please check your internet connection or the URL.")
        exit()
    except zipfile.BadZipFile as e:
        print(f"Error extracting zip file: {e}")
        print("The downloaded file might be corrupted.")
        exit()
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        exit()

# --- 1. Data Loading and Preprocessing ---


# Automatically download and extract the dataset if it's not present
download_and_extract_movielens()

Dataset directory 'ml-100k' already exists. Skipping download.


In [19]:
# !ls ml-100k/ml-100k
# # os.path.exists

allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


In [20]:
# Define paths to dataset files
ratings_path = 'ml-100k/ml-100k/u.data'
movies_path = 'ml-100k/ml-100k/u.item'
users_path = 'ml-100k/ml-100k/u.user' # Although not explicitly used for features in this FM, useful for context

# Check if dataset files exist after attempted download
if not os.path.exists(ratings_path) or not os.path.exists(movies_path):
    print("Dataset files still not found after attempting download. Exiting.")
    exit()

# Load ratings data: user_id | movie_id | rating | timestamp
ratings_df = pd.read_csv(ratings_path, sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load movie data: movie_id | movie_title | release_date | video_release_date | IMDb_URL | genre_flags (19 binary)
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + \
             ['genre_' + str(i) for i in range(19)]
movies_df = pd.read_csv(movies_path, sep='|', names=movie_cols, encoding='latin-1')


In [21]:
# --- 2. Feature Engineering: Create a Unified Feature Space ---
# Factorization Machines require all features (user, item, genre) to be mapped to a single
# continuous index space. We'll also maintain "field" information for features.

# Get unique values for each feature type
unique_users = ratings_df['user_id'].unique()
unique_movies = ratings_df['movie_id'].unique()
genre_cols = ['genre_' + str(i) for i in range(19)]

# Create mappings from original IDs/names to a contiguous integer space
feature_counter = 0
feature_map = {} # Maps (feature_type, original_id) -> global_feature_index
field_map = {}   # Maps global_feature_index -> field_id (e.g., user, movie, genre)
field_counter = 0

# Add User Features
user_field_id = field_counter
field_counter += 1
for user_id in unique_users:
    feature_map[('user', user_id)] = feature_counter
    field_map[feature_counter] = user_field_id
    feature_counter += 1

# Add Movie Features
movie_field_id = field_counter
field_counter += 1
for movie_id in unique_movies:
    feature_map[('movie', movie_id)] = feature_counter
    field_map[feature_counter] = movie_field_id
    feature_counter += 1

# Add Genre Features
genre_field_id = field_counter
field_counter += 1
for genre_col in genre_cols:
    # We only need to map genres that actually exist (are "1" for some movie)
    # The genre columns in movies_df are already binary (0 or 1)
    feature_map[('genre', genre_col)] = feature_counter
    field_map[feature_counter] = genre_field_id
    feature_counter += 1

total_features = feature_counter
total_fields = field_counter

print(f"Total unique features (user, movie, genre): {total_features}")
print(f"Total fields: {total_fields}")

Total unique features (user, movie, genre): 2644
Total fields: 3


In [None]:
# --- 3. Prepare Training and Test Data for FM ---
# For each rating, we construct a sparse feature vector (indices of non-zero features)
# and its corresponding target rating.

# Split ratings into training and testing sets
train_ratings, test_ratings = train_test_split(ratings_df, test_size=0.2, random_state=42)

def prepare_fm_data(df, movies_df, feature_map, genre_cols):
    """
    Converts a DataFrame of ratings into FM-compatible sparse data.
    Returns:
        X_indices: List of lists, where each inner list contains the indices of active features for a sample.
        y: List of target ratings.
    """
    X_indices = []
    y = []
    for _, row in df.iterrows():
        user_id = row['user_id']
        movie_id = row['movie_id']
        rating = row['rating']

        # Get the global feature index for user and movie
        user_feature_idx = feature_map[('user', user_id)]
        movie_feature_idx = feature_map[('movie', movie_id)]

        # Get genre feature indices for this movie
        movie_genres = movies_df[movies_df['movie_id'] == movie_id][genre_cols].iloc[0]
        genre_feature_indices = [feature_map[('genre', col)] for col, val in movie_genres.items() if val == 1]

        # Combine all active feature indices for this sample
        sample_indices = [user_feature_idx, movie_feature_idx] + genre_feature_indices
        X_indices.append(sample_indices)
        y.append(rating)
    return X_indices, np.array(y)

print("\nPreparing training data...")
X_train_indices, y_train = prepare_fm_data(train_ratings, movies_df, feature_map, genre_cols)
print("Preparing test data...")
X_test_indices, y_test = prepare_fm_data(test_ratings, movies_df, feature_map, genre_cols)

# --- 4. Factorization Machine Model Implementation ---

class FactorizationMachine:
    def __init__(self, num_features, k_factors, learning_rate=0.01, reg_w=0.01, reg_v=0.01):
        """
        Initializes the Factorization Machine model.
        Args:
            num_features (int): Total number of unique features.
            k_factors (int): Dimensionality of the latent factors.
            learning_rate (float): Learning rate for SGD.
            reg_w (float): Regularization strength for linear weights (w).
            reg_v (float): Regularization strength for latent vectors (V).
        """
        self.num_features = num_features
        self.k_factors = k_factors
        self.lr = learning_rate
        self.reg_w = reg_w
        self.reg_v = reg_v

        # Initialize parameters
        self.w0 = 0.0  # Global bias
        self.w = np.zeros(num_features) # Linear weights
        self.V = np.random.normal(0, 0.1, (num_features, k_factors)) # Latent factor matrix

    def predict(self, x_indices):
        """
        Calculates the prediction for a single sample (sparse representation).
        Args:
            x_indices (list): List of indices of active features (where x_i = 1).
        Returns:
            float: Predicted rating.
        """
        # Linear term: sum(wi * xi)
        linear_term = np.sum(self.w[idx] for idx in x_indices)

        # Interaction term: 0.5 * sum_f( (sum_i(V_if * xi))^2 - sum_i(V_if^2 * xi^2) )
        # Since xi is 1 for active features and 0 otherwise, this simplifies:
        interaction_term = 0.0
        for f in range(self.k_factors):
            # Sum_i(V_if * xi) becomes sum_i(V_if) for active features
            sum_Vif_xi = np.sum(self.V[idx, f] for idx in x_indices)
            # Sum_i(V_if^2 * xi^2) becomes sum_i(V_if^2) for active features
            sum_Vif_sq_xi_sq = np.sum(self.V[idx, f]**2 for idx in x_indices)
            interaction_term += (sum_Vif_xi**2 - sum_Vif_sq_xi_sq)
        interaction_term *= 0.5

        return self.w0 + linear_term + interaction_term

    def train(self, X_indices, y, n_epochs=50):
        """
        Trains the FM model using Stochastic Gradient Descent.
        Args:
            X_indices (list): List of lists, where each inner list contains indices of active features.
            y (np.array): Array of target ratings.
            n_epochs (int): Number of training epochs.
        """
        print(f"\nTraining Factorization Machine for {n_epochs} epochs...")
        for epoch in range(n_epochs):
            total_loss = 0.0
            for i, x_sample_indices in enumerate(X_indices):
                actual_rating = y[i]
                predicted_rating = self.predict(x_sample_indices)

                # Calculate error
                error = predicted_rating - actual_rating # We minimize (predicted - actual)^2

                # Update global bias (w0)
                self.w0 -= self.lr * error

                # Update linear weights (w_i)
                for idx in x_sample_indices:
                    self.w[idx] -= self.lr * (error + self.reg_w * self.w[idx]) # Add regularization

                # Update latent vectors (V_if)
                for f in range(self.k_factors):
                    # Cache sum_Vif_xi for this feature dimension 'f'
                    sum_Vif_xi = np.sum(self.V[idx, f] for idx in x_sample_indices)

                    for idx in x_sample_indices:
                        # Gradient for V_if: error * (sum_Vif_xi - V_if)
                        # We multiply by 1.0 here because x_i is 1 for active features
                        grad_Vif = error * (sum_Vif_xi - self.V[idx, f]) + self.reg_v * self.V[idx, f]
                        self.V[idx, f] -= self.lr * grad_Vif

                total_loss += error**2 # Sum of squared errors for MSE

            mse = total_loss / len(X_indices)
            rmse = sqrt(mse)
            print(f"Epoch {epoch+1}/{n_epochs}, Train RMSE: {rmse:.4f}")

# --- 5. Model Training and Evaluation ---

k_factors = 10 # Dimensionality of latent factors
fm_model = FactorizationMachine(total_features, k_factors, learning_rate=0.01, reg_w=0.01, reg_v=0.01)

# Train the model
fm_model.train(X_train_indices, y_train, n_epochs=50)

# Evaluate on the test set
print("\nEvaluating on test set...")
test_predictions = []
for x_sample_indices in X_test_indices:
    predicted_rating = fm_model.predict(x_sample_indices)
    # Clip predictions to be within the rating scale (1 to 5)
    predicted_rating = max(1, min(5, predicted_rating))
    test_predictions.append(predicted_rating)

final_rmse = sqrt(mean_squared_error(y_test, test_predictions))
print(f"Final Test RMSE: {final_rmse:.4f}")

# --- 6. Generate Recommendations for a Specific User ---

def get_top_n_recommendations(user_id_to_recommend, fm_model, movies_df, feature_map, genre_cols, n=10):
    """
    Generates top N movie recommendations for a given user.
    Args:
        user_id_to_recommend (int): The ID of the user for whom to generate recommendations.
        fm_model (FactorizationMachine): The trained FM model.
        movies_df (pd.DataFrame): DataFrame containing movie information.
        feature_map (dict): Mapping from (feature_type, original_id) to global_feature_index.
        genre_cols (list): List of genre column names.
        n (int): Number of top recommendations to return.
    Returns:
        list: A list of dictionaries, each containing movie_id, title, and predicted_rating.
    """
    # Get movies the user has already rated
    rated_movie_ids = ratings_df[ratings_df['user_id'] == user_id_to_recommend]['movie_id'].tolist()

    # Get all movie IDs
    all_movie_ids = movies_df['movie_id'].tolist()

    # Filter out movies the user has already rated
    unrated_movie_ids = [m_id for m_id in all_movie_ids if m_id not in rated_movie_ids]

    predicted_ratings_for_unrated = []

    # Get the user's global feature index once
    user_feature_idx = feature_map.get(('user', user_id_to_recommend))
    if user_feature_idx is None:
        print(f"Warning: User ID {user_id_to_recommend} not found in feature map.")
        return []

    for movie_id in unrated_movie_ids:
        # Construct the sparse feature vector for this (user, movie) pair
        # This involves the user_id, movie_id, and movie's genres

        movie_feature_idx = feature_map.get(('movie', movie_id))
        if movie_feature_idx is None:
            # Should not happen if all_movie_ids come from movies_df
            continue

        movie_genres = movies_df[movies_df['movie_id'] == movie_id][genre_cols].iloc[0]
        genre_feature_indices = [feature_map[('genre', col)] for col, val in movie_genres.items() if val == 1]

        sample_indices = [user_feature_idx, movie_feature_idx] + genre_feature_indices

        # Predict rating
        predicted_rating = fm_model.predict(sample_indices)
        predicted_rating = max(1, min(5, predicted_rating)) # Clip
        predicted_ratings_for_unrated.append((movie_id, predicted_rating))

    # Sort by predicted rating in descending order
    predicted_ratings_for_unrated.sort(key=lambda x: x[1], reverse=True)

    top_n_movies = []
    for movie_id, rating in predicted_ratings_for_unrated[:n]:
        movie_title = movies_df[movies_df['movie_id'] == movie_id]['title'].iloc[0]
        top_n_movies.append({'movie_id': movie_id, 'title': movie_title, 'predicted_rating': rating})

    return top_n_movies

# Example: Get top 5 recommendations for User ID 10
user_id_example = 10
recommendations = get_top_n_recommendations(user_id_example, fm_model, movies_df, feature_map, genre_cols, n=5)

print(f"\nTop 5 movie recommendations for User ID {user_id_example}:")
if recommendations:
    for rec in recommendations:
        print(f"  - {rec['title']} (Predicted Rating: {rec['predicted_rating']:.2f})")
else:
    print("  No recommendations found for this user (perhaps user not in dataset or no unrated movies).")


Preparing training data...
Preparing test data...

Training Factorization Machine for 50 epochs...


  linear_term = np.sum(self.w[idx] for idx in x_indices)
  sum_Vif_xi = np.sum(self.V[idx, f] for idx in x_indices)
  sum_Vif_sq_xi_sq = np.sum(self.V[idx, f]**2 for idx in x_indices)
  sum_Vif_xi = np.sum(self.V[idx, f] for idx in x_sample_indices)


Epoch 1/50, Train RMSE: 1.0149
Epoch 2/50, Train RMSE: 0.9505
Epoch 3/50, Train RMSE: 0.9303
Epoch 4/50, Train RMSE: 0.9164
