In [3]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
# Anime table; the pipeline code further down reads 'anime_id', 'Genres' and the
# 'Genre_*' columns from it.
# NOTE(review): the cell output echoes this line, which is how Python warnings are
# rendered — presumably a pandas DtypeWarning about mixed column types; confirm and
# pass explicit dtypes if so.
df_encoded=pd.read_csv('Final_Anime_Dataset.csv')
# Rating rows with user attributes (the columns are displayed in the next cell).
user_data_temp_final_path = "user_data_temp_final.csv"
OneUser_multiAnime_df = pd.read_csv(user_data_temp_final_path)

  df_encoded=pd.read_csv('Final_Anime_Dataset.csv')


In [4]:
# List the column names of the user-rating frame loaded above.
OneUser_multiAnime_df.columns

Index(['Unnamed: 0', 'Username', 'Gender', 'Mean Score', 'Completed',
       'Birth_Year', 'user_id', 'anime_id', 'Anime Title', 'rating'],
      dtype='object')

In [None]:
# Display the anime frame.
# NOTE(review): this cell has no execution count; consider df_encoded.head() so
# only a preview is rendered.
df_encoded

In [None]:


class DataFrameMergerTransformer(BaseEstimator, TransformerMixin):
    """
    A scikit-learn transformer that merges the input DataFrame with a fixed
    subset of columns (anime id, raw genre string and genre indicator columns)
    taken from a reference DataFrame.

    Parameters:
    -----------
    df_encoded : pandas.DataFrame
        Reference DataFrame. It must contain every column listed in
        ``REFERENCE_COLUMNS`` plus the join key(s) given in ``on``.
    on : str or list of str
        Column(s) to join on
    how : str, default='inner'
        Type of merge to be performed ('inner', 'left', 'right', 'outer')
    suffixes : tuple of str, default=('_x', '_y')
        Suffixes to use for overlapping columns
    reset_index : bool, default=True
        Whether to reset the index of the resulting merged DataFrame
    drop_on_duplicate : bool or list, default=False
        If a list, the named columns are dropped from the merged result
        (names that are not present are ignored).
        If True, the reference-side copy of every non-key column that exists
        in both frames is dropped, so only the input's version remains.
    """

    # Columns pulled from the reference frame before merging.
    REFERENCE_COLUMNS = [
        'anime_id', 'Genres', 'Genre_Action', 'Genre_Adventure', 'Genre_Avant Garde',
        'Genre_Award Winning', 'Genre_Boys Love', 'Genre_Comedy', 'Genre_Drama',
        'Genre_Ecchi', 'Genre_Erotica', 'Genre_Fantasy', 'Genre_Girls Love',
        'Genre_Gourmet', 'Genre_Hentai', 'Genre_Horror', 'Genre_Mystery',
        'Genre_Romance', 'Genre_Sci-Fi', 'Genre_Slice of Life', 'Genre_Sports',
        'Genre_Supernatural', 'Genre_Suspense'
    ]

    def __init__(self, df_encoded, on, how='inner', suffixes=('_x', '_y'),
                 reset_index=True, drop_on_duplicate=False):
        self.df_encoded = df_encoded
        self.on = on
        self.how = how
        self.suffixes = suffixes
        self.reset_index = reset_index
        self.drop_on_duplicate = drop_on_duplicate

    def fit(self, X, y=None):
        """Fit method (no actual fitting needed)"""
        return self

    def transform(self, X):
        """
        Merge the input DataFrame with the reference DataFrame.

        Parameters:
        -----------
        X : pandas.DataFrame
            DataFrame to be merged with the reference DataFrame

        Returns:
        --------
        pandas.DataFrame
            Merged DataFrame

        Raises:
        -------
        KeyError
            If the reference DataFrame lacks one of the required columns.
        """
        join_keys = list(self.on) if isinstance(self.on, (list, tuple)) else [self.on]

        # Always carry the join key(s) along; previously only 'anime_id' was
        # selected, so any other ``on`` value failed inside pd.merge.
        reference_cols = list(self.REFERENCE_COLUMNS)
        reference_cols += [key for key in join_keys if key not in reference_cols]
        df_anime_genre = self.df_encoded[reference_cols]

        # Perform the merge operation
        merged_df = pd.merge(
            X,
            df_anime_genre,
            on=self.on,
            how=self.how,
            suffixes=self.suffixes
        )

        # Reset index if specified
        if self.reset_index:
            merged_df = merged_df.reset_index(drop=True)

        # Handle duplicate columns
        if self.drop_on_duplicate:
            if isinstance(self.drop_on_duplicate, list):
                # Drop specific columns
                merged_df = merged_df.drop(columns=self.drop_on_duplicate, errors='ignore')
            else:
                # Work out exactly which columns pandas suffixed on the reference
                # side. Matching on ``endswith(suffix)`` would also remove unrelated
                # input columns that happen to end in the suffix, and would remove
                # every column when the suffix is an empty string.
                right_suffix = self.suffixes[1] or ''
                overlapping = [
                    col for col in df_anime_genre.columns
                    if col in X.columns and col not in join_keys
                ]
                duplicate_cols = [f"{col}{right_suffix}" for col in overlapping]
                merged_df = merged_df.drop(columns=duplicate_cols, errors='ignore')

        return merged_df

class UserGenreRatingAggregator(BaseEstimator, TransformerMixin):
    """
    A transformer that aggregates user ratings for different genres and creates a user profile
    with average ratings for each genre.

    Parameters:
    -----------
    genre_prefix : str, default='Genre_'
        Prefix to identify genre columns
    rating_col : str, default='rating'
        Column name containing ratings
    user_id_col : str, default='user_id'
        Column name containing user IDs
    user_info_cols : list or None, default=None
        Additional user columns to include in the result. ``None`` means
        ``DEFAULT_USER_INFO_COLS``; the default is resolved when ``transform``
        runs so that ``set_params(user_info_cols=None)`` keeps working.
    output_prefix : str, default='avg_'
        Prefix for the output average columns
    round_decimals : int or None, default=2
        Number of decimals to round to (None for no rounding)
    """

    # Used when ``user_info_cols`` is None.
    DEFAULT_USER_INFO_COLS = ('Username', 'Gender', 'Birth_Year', 'Mean Score', 'Completed')

    def __init__(self, genre_prefix='Genre_', rating_col='rating', user_id_col='user_id',
                 user_info_cols=None, output_prefix='avg_', round_decimals=2):
        self.genre_prefix = genre_prefix
        self.rating_col = rating_col
        self.user_id_col = user_id_col
        # Stored exactly as passed (scikit-learn convention: no logic on
        # constructor parameters).
        self.user_info_cols = user_info_cols
        self.output_prefix = output_prefix
        self.round_decimals = round_decimals

    def fit(self, X, y=None):
        """Fit method (no actual fitting needed)"""
        return self

    def transform(self, X):
        """
        Transform the input DataFrame to create user profiles with average genre ratings.

        For each user and genre the result is
        (sum of ratings on rows flagged with the genre) / (number of such rows
        that have a rating). A genre with no rated rows for a user yields NaN.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame with user ratings and genre information

        Returns:
        --------
        pandas.DataFrame
            DataFrame with one row per user and average ratings for each genre
        """
        # Make a copy to avoid modifying the original
        X_copy = X.copy()

        # Identify genre columns
        genre_cols = [col for col in X_copy.columns if col.startswith(self.genre_prefix)]

        ratings = X_copy[self.rating_col]
        user_ids = X_copy[self.user_id_col]

        # Calculate weighted ratings (genre * rating); rows without a rating
        # become NaN and are skipped by the group sum.
        rating_matrix = X_copy[genre_cols].multiply(ratings, axis=0)

        # Zero the genre flags on rows without a rating so the denominator
        # counts only the rows that actually contribute to the numerator.
        rated_flag = ratings.notna().astype(int)
        counted_genres = X_copy[genre_cols].multiply(rated_flag, axis=0)

        # Group by user_id and calculate sums
        rating_sums = rating_matrix.groupby(user_ids).sum()
        genre_counts = counted_genres.groupby(user_ids).sum()

        # Compute average rating per genre (0 / 0 -> NaN)
        avg_genre_ratings = rating_sums.div(genre_counts)

        # Swap only the leading prefix; str.replace would also rewrite any
        # later occurrence of the prefix inside a column name.
        prefix_len = len(self.genre_prefix)
        avg_genre_ratings.columns = [
            self.output_prefix + col[prefix_len:] for col in avg_genre_ratings.columns
        ]

        # Get user info (first row seen per user)
        info_cols = (
            self.DEFAULT_USER_INFO_COLS if self.user_info_cols is None else self.user_info_cols
        )
        user_cols = [self.user_id_col] + [col for col in info_cols if col in X_copy.columns]
        user_info = X_copy[user_cols].drop_duplicates(self.user_id_col).set_index(self.user_id_col)

        # Combine results
        result_df = pd.concat([user_info, avg_genre_ratings], axis=1).reset_index()

        # Round if specified
        if self.round_decimals is not None:
            avg_cols = avg_genre_ratings.columns
            result_df[avg_cols] = result_df[avg_cols].round(self.round_decimals)

        return result_df

class GenreRatingTypeConverter(BaseEstimator, TransformerMixin):
    """
    Coerce every column whose name begins with a given prefix to a numeric dtype
    via ``pandas.to_numeric``.

    Parameters:
    -----------
    column_prefix : str, default='avg_'
        Columns whose name starts with this prefix are converted
    errors : str, default='coerce'
        Forwarded to ``pandas.to_numeric``:
        - 'ignore': leave invalid values as is
        - 'raise': raise an exception
        - 'coerce': convert invalid values to NaN
    downcast : str or None, default=None
        Forwarded to ``pandas.to_numeric``
        ('integer', 'signed', 'unsigned', 'float')
    """

    def __init__(self, column_prefix='avg_', errors='coerce', downcast=None):
        self.column_prefix = column_prefix
        self.errors = errors
        self.downcast = downcast
        self.columns_converted_ = None

    def fit(self, X, y=None):
        """
        Record which columns of ``X`` carry the configured prefix.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame
        y : array-like, default=None
            Not used, present for API consistency

        Returns:
        --------
        self
        """
        prefix = self.column_prefix
        self.columns_converted_ = [name for name in X.columns if name.startswith(prefix)]
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` in which the columns recorded by ``fit`` have been
        passed through ``pandas.to_numeric``.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame to transform

        Returns:
        --------
        pandas.DataFrame
            Transformed DataFrame with numeric columns
        """
        converted = {
            name: pd.to_numeric(X[name], errors=self.errors, downcast=self.downcast)
            for name in self.columns_converted_
        }
        # assign() works on a copy, so the caller's frame is left untouched.
        return X.assign(**converted)

    def get_feature_names_out(self, input_features=None):
        """
        Get output feature names.

        Parameters:
        -----------
        input_features : array-like of str or None, default=None
            Ignored

        Returns:
        --------
        list
            Names of the columns recorded by ``fit`` (only the converted ones)
        """
        return self.columns_converted_

class GenderOneHotEncoder(BaseEstimator, TransformerMixin):
    """
    A transformer that one-hot encodes the 'Gender' column in a DataFrame.

    Parameters:
    -----------
    column_name : str, default='Gender'
        Name of the column to one-hot encode
    drop : str or None, default=None
        Drop strategy for encoder ('first', 'if_binary', or None)
    sparse_output : bool, default=False
        Whether the encoder produces a sparse matrix. When True the encoded
        columns are added to the result as pandas sparse columns.
    """

    def __init__(self, column_name='Gender', drop=None, sparse_output=False):
        self.column_name = column_name
        self.drop = drop
        self.sparse_output = sparse_output
        self.encoder = None
        self.feature_names_out_ = None

    def fit(self, X, y=None):
        """
        Fit the one-hot encoder on the specified column.

        If the column is absent from ``X`` the encoder is left unfitted and the
        list of output names is empty.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame
        y : array-like, default=None
            Not used, present for API consistency

        Returns:
        --------
        self
        """
        # Initialize the encoder
        self.encoder = OneHotEncoder(sparse_output=self.sparse_output, drop=self.drop, dtype=int)

        # Fit the encoder if the column exists
        if self.column_name in X.columns:
            self.encoder.fit(X[[self.column_name]])
            self.feature_names_out_ = self.encoder.get_feature_names_out([self.column_name])
        else:
            self.feature_names_out_ = []

        return self

    def transform(self, X):
        """
        Transform the DataFrame by one-hot encoding the specified column.

        The encoded columns are appended on the right and the original column is
        removed. If the column is absent, a plain copy of ``X`` is returned.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame

        Returns:
        --------
        pandas.DataFrame
            Transformed DataFrame with one-hot encoded column
        """
        X_result = X.copy()

        # Only transform if the column exists
        if self.column_name not in X_result.columns:
            return X_result

        # Encode the column
        encoded = self.encoder.transform(X_result[[self.column_name]])

        if hasattr(encoded, 'toarray'):
            # The encoder returned a SciPy sparse matrix (sparse_output=True).
            # The plain DataFrame constructor cannot turn that into labelled
            # columns, so build sparse-backed columns instead.
            encoded_df = pd.DataFrame.sparse.from_spmatrix(
                encoded,
                index=X_result.index,
                columns=self.feature_names_out_
            )
        else:
            encoded_df = pd.DataFrame(
                encoded,
                columns=self.feature_names_out_,
                index=X_result.index
            )

        # Concatenate with the original DataFrame, then drop the source column
        X_result = pd.concat([X_result, encoded_df], axis=1)
        return X_result.drop(columns=[self.column_name])

    def get_feature_names_out(self, input_features=None):
        """
        Get output feature names.

        Parameters:
        -----------
        input_features : array-like of str or None, default=None
            Ignored

        Returns:
        --------
        array-like of str
            Names of the one-hot columns produced by the fitted encoder
            (empty when the column was absent during ``fit``)
        """
        return self.feature_names_out_

class UserAgeProcessor(BaseEstimator, TransformerMixin):
    """
    Derive age features from a birth-year column:
    1. 'Age' = current_year - birth year
    2. Optional 0/1 indicator columns for five age bands

    Parameters:
    -----------
    birth_year_col : str, default='Birth_Year'
        Column containing birth year
    current_year : int, default=2025
        Year the age is measured against
    add_age_groups : bool, default=True
        Whether to add the age band indicator columns
    """

    def __init__(self, birth_year_col='Birth_Year', current_year=2025, add_age_groups=True):
        self.birth_year_col = birth_year_col
        self.current_year = current_year
        self.add_age_groups = add_age_groups
        self.age_col = 'Age'
        self.age_group_cols = [
            'Age_Group__Gen_Alpha', 'Age_Group__Zoomers',
            'Age_Group__Millennials', 'Age_Group__Gen_X',
            'Age_Group__Boomers_Plus'
        ]

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned."""
        return self

    def transform(self, X):
        """
        Return a copy of ``X`` with the age column (and, optionally, the age band
        indicators) added and the birth-year column removed. When the birth-year
        column is missing, an unchanged copy is returned.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame with Birth_Year column

        Returns:
        --------
        pandas.DataFrame
            Transformed DataFrame with additional age columns
        """
        frame = X.copy()

        # Nothing to derive without a birth year.
        if self.birth_year_col not in frame.columns:
            return frame

        frame[self.age_col] = self.current_year - frame[self.birth_year_col]

        if self.add_age_groups:
            # (column, inclusive lower bound, exclusive upper bound);
            # None marks an open end.
            bands = (
                ('Age_Group__Gen_Alpha', None, 13),
                ('Age_Group__Zoomers', 13, 28),
                ('Age_Group__Millennials', 28, 43),
                ('Age_Group__Gen_X', 43, 59),
                ('Age_Group__Boomers_Plus', 59, None),
            )
            for band_col, lower, upper in bands:
                if lower is None:
                    in_band = frame[self.age_col] < upper
                elif upper is None:
                    in_band = frame[self.age_col] >= lower
                else:
                    in_band = (frame[self.age_col] >= lower) & (frame[self.age_col] < upper)
                frame[band_col] = in_band.astype(int)

        # The birth year has been consumed; keep only the derived columns.
        return frame.drop(columns=[self.birth_year_col])

    def get_feature_names_out(self, input_features=None):
        """Names of the columns this transformer adds."""
        names = [self.age_col]
        if self.add_age_groups:
            names += list(self.age_group_cols)
        return names

class UserFeatureSelector(BaseEstimator, TransformerMixin):
    """
    A transformer that selects and reorders user features for the X_U matrix.

    The set of output columns is fixed by ``fit`` (requested features that are
    present in the fitting data), so ``transform`` returns the same columns in
    the same order for every later input.

    Parameters:
    -----------
    features : list or None, default=None
        List of feature columns to select. If None, ``DEFAULT_FEATURES`` is
        used; the default is resolved lazily so that
        ``set_params(features=None)`` keeps working.
    """

    # Used when ``features`` is None.
    DEFAULT_FEATURES = (
        'Mean Score', 'Completed',
        'avg_Action', 'avg_Adventure', 'avg_Avant Garde', 'avg_Award Winning',
        'avg_Boys Love', 'avg_Comedy', 'avg_Drama', 'avg_Ecchi', 'avg_Erotica',
        'avg_Fantasy', 'avg_Girls Love', 'avg_Gourmet', 'avg_Hentai',
        'avg_Horror', 'avg_Mystery', 'avg_Romance', 'avg_Sci-Fi',
        'avg_Slice of Life', 'avg_Sports', 'avg_Supernatural', 'avg_Suspense',
        'Gender_Female', 'Gender_Male', 'Gender_Non-Binary',
        'Age', 'Age_Group__Gen_Alpha', 'Age_Group__Zoomers',
        'Age_Group__Millennials', 'Age_Group__Gen_X', 'Age_Group__Boomers_Plus'
    )

    def __init__(self, features=None):
        # Stored exactly as passed (scikit-learn convention: no logic on
        # constructor parameters).
        self.features = features

    def _requested_features(self):
        """Return the requested feature names with the default applied."""
        source = self.DEFAULT_FEATURES if self.features is None else self.features
        return list(source)

    def fit(self, X, y=None):
        """
        Record which requested features exist in ``X``.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame
        y : array-like, default=None
            Not used, present for API consistency

        Returns:
        --------
        self
        """
        self.selected_features_ = [
            col for col in self._requested_features() if col in X.columns
        ]
        return self

    def transform(self, X):
        """
        Select and reorder features.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame

        Returns:
        --------
        pandas.DataFrame
            DataFrame with selected features. After ``fit`` the columns are
            exactly those recorded during fitting; a recorded column that is
            missing from ``X`` is returned filled with NaN. Without a prior
            ``fit`` the requested features present in ``X`` are returned.
        """
        selected = getattr(self, 'selected_features_', None)

        if selected is None:
            # Not fitted: stateless selection of whatever is available.
            available_features = [col for col in self._requested_features() if col in X.columns]
            return X[available_features]

        if all(col in X.columns for col in selected):
            return X[selected]

        # Keep the fitted column layout even when the new data lacks a column,
        # so downstream fitted steps see the width they were trained on.
        return X.reindex(columns=selected)

    def get_feature_names_out(self, input_features=None):
        """
        Get output feature names.

        Returns the columns recorded by ``fit`` when available, otherwise the
        requested feature list.
        """
        selected = getattr(self, 'selected_features_', None)
        if selected is not None:
            return list(selected)
        return self._requested_features()

class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    A transformer that imputes missing values in the user feature matrix and
    returns a DataFrame with the same columns and index as its input.

    Parameters:
    -----------
    strategy : str, default='mean'
        Imputation strategy ('mean', 'median', 'most_frequent', 'constant')
    fill_value : any, default=None
        Used when strategy is 'constant'
    """

    def __init__(self, strategy='mean', fill_value=None):
        self.strategy = strategy
        self.fill_value = fill_value
        self.imputer = self._build_imputer()
        self.feature_names_in_ = None

    def _build_imputer(self):
        """Create a SimpleImputer from the current parameter values."""
        # keep_empty_features=True: a column that is entirely missing at fit time
        # would otherwise be removed from the output, and rebuilding the
        # DataFrame with the original column names would then fail.
        return SimpleImputer(
            strategy=self.strategy,
            fill_value=self.fill_value,
            keep_empty_features=True
        )

    def fit(self, X, y=None):
        """
        Fit the imputer to the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame
        y : array-like, default=None
            Not used, present for API consistency

        Returns:
        --------
        self
        """
        # Rebuild from the current parameters so values changed through
        # set_params() (e.g. by a grid search) are actually used.
        self.imputer = self._build_imputer()
        self.feature_names_in_ = X.columns
        self.imputer.fit(X)
        return self

    def transform(self, X):
        """
        Impute missing values in the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame

        Returns:
        --------
        pandas.DataFrame
            DataFrame with imputed values
        """
        imputed_data = self.imputer.transform(X)
        return pd.DataFrame(imputed_data, columns=self.feature_names_in_, index=X.index)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names (the column names seen during fit)."""
        return list(self.feature_names_in_)

class FeaturesStandardizer(BaseEstimator, TransformerMixin):
    """
    A transformer that standardizes features using StandardScaler and returns a
    DataFrame with the same columns and index as its input.

    Parameters:
    -----------
    with_mean : bool, default=True
        Whether to center the data before scaling
    with_std : bool, default=True
        Whether to scale the data to unit variance
    """

    def __init__(self, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.scaler = self._build_scaler()
        self.feature_names_in_ = None

    def _build_scaler(self):
        """Create a StandardScaler from the current parameter values."""
        return StandardScaler(with_mean=self.with_mean, with_std=self.with_std)

    def fit(self, X, y=None):
        """
        Fit the scaler to the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame
        y : array-like, default=None
            Not used, present for API consistency

        Returns:
        --------
        self
        """
        # Rebuild from the current parameters so values changed through
        # set_params() (e.g. by a grid search) are actually used.
        self.scaler = self._build_scaler()
        self.feature_names_in_ = X.columns
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """
        Standardize the data.

        Parameters:
        -----------
        X : pandas.DataFrame
            Input DataFrame

        Returns:
        --------
        pandas.DataFrame
            DataFrame with standardized values
        """
        scaled_data = self.scaler.transform(X)
        return pd.DataFrame(scaled_data, columns=self.feature_names_in_, index=X.index)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names (the column names seen during fit)."""
        return list(self.feature_names_in_)

def _user_feature_steps(current_year):
    """Build fresh instances of the steps shared by both pipeline variants."""
    return [
        ('convert_genre_types', GenreRatingTypeConverter(
            column_prefix='avg_',
            errors='coerce'
        )),
        ('encode_gender', GenderOneHotEncoder(
            column_name='Gender',
            sparse_output=False
        )),
        ('process_age', UserAgeProcessor(
            birth_year_col='Birth_Year',
            current_year=current_year,
            add_age_groups=True
        )),
        ('select_features', UserFeatureSelector()),
        ('impute_missing', MissingValueImputer(
            strategy='mean'
        )),
        ('standardize', FeaturesStandardizer(
            with_mean=True,
            with_std=True
        ))
    ]


def create_user_preprocessing_pipeline(df_encoded, user_ratings_df=None, current_year=2025):
    """
    Creates a complete preprocessing pipeline for user features (X_U)

    Parameters:
    -----------
    df_encoded : pandas.DataFrame
        DataFrame containing anime information with genre encoding
    user_ratings_df : pandas.DataFrame or None, default=None
        Only checked against None to choose the pipeline variant; the data to
        process is supplied later through ``fit``/``transform``.
        If not None, the pipeline starts with the genre merge and the per-user
        rating aggregation. If None, those two steps are omitted and the
        pipeline expects already aggregated user data.
    current_year : int, default=2025
        Year forwarded to ``UserAgeProcessor`` for the age calculation

    Returns:
    --------
    sklearn.pipeline.Pipeline
        Complete preprocessing pipeline for user features
    """
    steps = []

    if user_ratings_df is not None:
        # Extra leading steps for raw per-rating rows
        steps.extend([
            ('merge_genres', DataFrameMergerTransformer(
                df_encoded=df_encoded,
                on='anime_id',
                how='inner',
                drop_on_duplicate=True
            )),
            ('aggregate_ratings', UserGenreRatingAggregator(
                genre_prefix='Genre_',
                rating_col='rating',
                user_id_col='user_id',
                user_info_cols=['Username', 'Gender', 'Birth_Year', 'Mean Score', 'Completed'],
                output_prefix='avg_',
                round_decimals=2
            )),
        ])

    # Both variants finish with the same feature engineering steps; building
    # them in one place keeps the two variants from drifting apart.
    steps.extend(_user_feature_steps(current_year))

    return Pipeline(steps)

# Example usage
# NOTE(review): the recorded output below shows this guarded body ran in the
# notebook, so every name assigned here is a notebook-level global; later cells
# display X_U.
if __name__ == "__main__":
    # Load data
    # The first cell already read the same anime file into df_encoded; this reads it again.
    df_encoded = pd.read_csv('Final_Anime_Dataset.csv')
    user_data_temp_final = pd.read_csv('user_data_temp_final.csv')
    
    # Create and apply pipeline for raw data
    # The second argument only selects the raw-ratings variant (the factory checks
    # it against None); the frame itself is consumed by fit_transform.
    full_pipeline = create_user_preprocessing_pipeline(df_encoded, user_data_temp_final)
    processed_user_features = full_pipeline.fit_transform(user_data_temp_final)
    print(f"Processed user features shape: {processed_user_features.shape}")
    
    # Alternatively, if user data is already processed
    # presumably this file already holds one row per user with avg_* columns — TODO confirm
    user_data = pd.read_csv('Final_User_Dataset.csv')
    # Without a ratings frame the factory omits the merge and aggregation steps.
    partial_pipeline = create_user_preprocessing_pipeline(df_encoded)
    final_user_features = partial_pipeline.fit_transform(user_data)
    print(f"Final user features shape: {final_user_features.shape}")
    
    # Convert to numpy array for model input if needed
    # The final pipeline step returns a DataFrame, hence .values.
    X_U = final_user_features.values
    print(f"X_U matrix shape: {X_U.shape}")

  df_encoded=pd.read_csv('Final_Anime_Dataset.csv')
  df_encoded = pd.read_csv('Final_Anime_Dataset.csv')


Processed user features shape: (71278, 32)
Final user features shape: (71278, 32)
X_U matrix shape: (71278, 32)


In [2]:
# Display the standardized user feature matrix built by the pipeline cell above.
# NOTE(review): this cell's execution count (2) is lower than the import cell's (3),
# so it was last run out of order; re-run top to bottom to confirm it still works.
X_U

array([[-0.7537492 ,  0.28942198, -0.40323402, ..., -1.44209169,
         1.4991455 , -0.10613655],
       [-0.00807431, -0.25010196, -0.04174073, ..., -1.44209169,
         1.4991455 , -0.10613655],
       [-0.58083908, -0.04050273, -0.60524498, ...,  0.69343719,
        -0.66704666, -0.10613655],
       ...,
       [ 0.14322205,  1.53537294,  0.213431  , ...,  0.69343719,
        -0.66704666, -0.10613655],
       [-0.48357714, -0.3160869 , -1.15811707, ...,  0.69343719,
        -0.66704666, -0.10613655],
       [-1.15360385,  0.08758568, -1.46644958, ...,  0.69343719,
        -0.66704666, -0.10613655]])

In [5]:
# Dimensions of the user feature matrix as a (rows, columns) tuple.
X_U.shape

(71278, 32)