In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
ratings = pd.read_csv('ratings.csv').drop(['Unnamed: 0'], axis=1)
providers = pd.read_csv('providers.csv').drop(['Unnamed: 0'], axis=1)
reg = ratings['Registration number'].tolist()

In [9]:
def ttsplit(examples, labels, test_size=0.1, verbose=0):
    """
    Split examples and labels into train and test sets.

    Args:
        examples (numpy array): Input examples.
        labels (numpy array): Corresponding labels.
        test_size (float, optional): The proportion of the data to include in the test split. Defaults to 0.1.
        verbose (int, optional): Verbosity mode. If 1, prints information about the split. Defaults to 0.

    Returns:
        tuple: Tuple containing the train and test sets for examples and labels.
    """
    from sklearn.model_selection import train_test_split

    if verbose:
        print("Train/Test split")
        print(f"{100 - test_size * 100}% of training data")
        print(f"{test_size * 100}% of testing data")

    # Split data into train and test sets
    train_examples, test_examples, train_labels, test_labels = train_test_split(
        examples, labels, test_size=test_size, random_state=0, shuffle=True
    )

    # Transform train and test examples to their corresponding one-hot representations
    train_users = train_examples[:, 0]
    test_users = test_examples[:, 0]

    train_items = train_examples[:, 1]
    test_items = test_examples[:, 1]

    # Final training and test set
    x_train = np.array(list(zip(train_users, train_items)))
    x_test = np.array(list(zip(test_users, test_items)))

    y_train = train_labels
    y_test = test_labels

    if verbose:
        print()
        print('Number of training examples:', x_train.shape)
        print('Number of training labels:', y_train.shape)
        print('Number of test examples:', x_test.shape)
        print('Number of test labels:', y_test.shape)

    return (x_train, x_test), (y_train, y_test)



def calculate_mean_ratings(dataframe):
    """
    Calculate the mean ratings grouped by User_id.
    
    Args:
        dataframe (DataFrame): Input data.
        
    Returns:
        DataFrame: A DataFrame containing the mean ratings for each User_id.
    """
    mean_ratings = dataframe.groupby(by="User_id", as_index=False)["rating"].mean()
    return mean_ratings


def normalize_ratings(dataframe, norm_column="norm_rating"):
    """
    Normalize the user ratings relative to the overall mean.
    
    Args:
        dataframe (DataFrame): Input data.
        norm_column (str): Name of the column for normalized ratings.
        
    Returns:
        DataFrame: A DataFrame with the normalized ratings.
    """
    mean_ratings = calculate_mean_ratings(dataframe=dataframe)
    normalized_data = pd.merge(dataframe, mean_ratings, suffixes=("", "_mean"), on="User_id")
    normalized_data[f"{norm_column}"] = normalized_data["rating"] - normalized_data["rating_mean"]

    return normalized_data


def create_rating_matrix(dataframe, column):
    """
    Generate a rating matrix from the dataframe.
    
    Args:
        dataframe (DataFrame): Input data.
        column (str): Name of the column to use for the matrix.
        
    Returns:
        csr_matrix: A sparse rating matrix.
    """
    crosstab = pd.crosstab(
        dataframe["User_id"], dataframe["Registration number"], dataframe[f"{column}"], aggfunc=sum
    ).fillna(0).values()
    rating_matrix = csr_matrix(crosstab)
    return rating_matrix


def scale_ratings(dataframe, scaled_column="scaled_rating"):
    """
    Scale the ratings between 0 and 1.
    
    Args:
        dataframe (DataFrame): Input data.
        scaled_column (str): Name of the column for scaled ratings.
        
    Returns:
        DataFrame: A DataFrame with the scaled ratings.
    """
    dataframe[f"{scaled_column}"] = dataframe.rating / 5.0
    return dataframe


def get_examples_and_labels(dataframe, labels_column="rating"):
    """
    Get the input examples and corresponding labels from the dataframe.
    
    Args:
        dataframe (DataFrame): Input data.
        labels_column (str): Name of the column containing the labels.
        
    Returns:
        tuple: A tuple containing the examples and labels.
    """
    examples = dataframe[["User_id", "Registration number"]].values
    labels = dataframe[f"{labels_column}"].values
    return examples, labels


In [10]:
def ids_encoder(ratings):
    """
    Encode User_id and Registration number for easier processing.
    
    Args:
        ratings (DataFrame): Input ratings data.
        
    Returns:
        tuple: A tuple containing the encoded ratings, user encoder, and item encoder.
    """
    # Get unique users and items
    users = sorted(ratings['User_id'].unique())
    items = sorted(ratings['Registration number'].unique())

    # Encoder for users and items
    user_encoder = LabelEncoder()
    item_encoder = LabelEncoder()

    # Fit encoders to users and items
    user_encoder.fit(users)
    item_encoder.fit(items)

    # Rewrite User_id and Registration number with encoded values
    ratings['User_id'] = user_encoder.transform(ratings['User_id'].tolist())
    ratings['Registration number'] = item_encoder.transform(ratings['Registration number'].tolist())

    return ratings, user_encoder, item_encoder


## Non-negative Matrix Factorization

In [22]:
class NMF:
    def __init__(self, ratings, m, n, uencoder, iencoder, K=10, lambda_P=0.01, lambda_Q=0.01):
        """
        Initialize the NMF model with the given parameters.
        
        Args:
            ratings (DataFrame): Input ratings data.
            m (int): Number of users.
            n (int): Number of items.
            uencoder (LabelEncoder): User encoder.
            iencoder (LabelEncoder): Item encoder.
            K (int): Number of latent factors.
            lambda_P (float): Regularization parameter for P matrix.
            lambda_Q (float): Regularization parameter for Q matrix.
        """
        np.random.seed(32)
        
        # Initialize P and Q matrices with random values
        self.ratings = ratings
        self.np_ratings = ratings.to_numpy()
        self.K = K
        self.P = np.random.rand(m, K)
        self.Q = np.random.rand(n, K)
        
        # Set hyperparameters
        self.lambda_P = lambda_P
        self.lambda_Q = lambda_Q

        # Store user and item encoders
        self.uencoder = uencoder
        self.iencoder = iencoder
        
        # Dictionary to store training history
        self.history = {
            "epochs": [],
            "loss": [],
            "val_loss": [],
        }

    def print_training_parameters(self):
        """
        Print the training parameters.
        """
        print("Training NMF ...")
        print(f"k={self.K}")
        
    def mae(self, x_train, y_train):
        """
        Calculate the mean absolute error (MAE) on the training set.
        
        Args:
            x_train (array-like): Training examples.
            y_train (array-like): Training labels.
            
        Returns:
            float: The MAE value.
        """
        M = x_train.shape[0]
        error = 0
        for pair, r in zip(x_train, y_train):
            u, i = pair
            error += abs(r - np.dot(self.P[u], self.Q[i]))
        return error / M
    
    def update_rule(self, u, i, error):
        """
        Update the P and Q matrices based on the NMF update rule.
        
        Args:
            u (int): User index.
            i (int): Item index.
            error (float): Prediction error.
        """
        # Retrieve relevant indices and values from ratings data
        I = self.np_ratings[self.np_ratings[:, 0] == u][:, [1, 2]]
        U = self.np_ratings[self.np_ratings[:, 1] == i][:, [0, 2]]    
        
        # Calculate the numerator and denominator for updating P
        l = I[:, 0].astype(int)
        num = self.P[u] * np.dot(self.Q[l].T, I[:, 1])
        dem = np.dot(self.Q[l].T, np.dot(self.P[u], self.Q[l].T)) + self.lambda_P * len(I) * self.P[u]
        self.P[u] = num / dem
        
        # Calculate the numerator and denominator for updating Q
        m = U[:, 0].astype(int)
        num = self.Q[i] * np.dot(self.P[m].T, U[:, 1])
        dem = np.dot(self.P[m].T, np.dot(self.P[m], self.Q[i].T)) + self.lambda_Q * len(U) * self.Q[i]
        self.Q[i] = num / dem
    
    @staticmethod
    def print_training_progress(epoch, epochs, error, val_error, steps=5):
        """
        Print the training progress at certain steps.
        
        Args:
            epoch (int): Current epoch number.
            epochs (int): Total number of epochs.
            error (float): Training error.
            val_error (float): Validation error.
            steps (int): Number of steps to print progress.
        """
        if epoch == 1 or epoch % steps == 0:
            print(f"epoch {epoch}/{epochs} - loss: {round(error, 3)} - val_loss: {round(val_error, 3)}")
                   
    def fit(self, x_train, y_train, validation_data, epochs=10):
        """
        Train the NMF model using the given training data.
        
        Args:
            x_train (array-like): Training examples.
            y_train (array-like): Training labels.
            validation_data (tuple): Tuple containing validation examples and labels.
            epochs (int): Number of training epochs.
            
        Returns:
            dict: Dictionary containing training history.
        """
        self.print_training_parameters()
        x_test, y_test = validation_data
        for epoch in range(1, epochs + 1):
            for pair, r in zip(x_train, y_train):
                u, i = pair
                r_hat = np.dot(self.P[u], self.Q[i])
                e = abs(r - r_hat)
                self.update_rule(u, i, e)                
            # Training and validation evaluation
            error = self.mae(x_train, y_train)
            val_error = self.mae(x_test, y_test)
            self.update_history(epoch, error, val_error)
            self.print_training_progress(epoch, epochs, error, val_error, steps=1)
        
        return self.history
    
    def update_history(self, epoch, error, val_error):
        """
        Update the training history with the current epoch and error values.
        
        Args:
            epoch (int): Current epoch number.
            error (float): Training error.
            val_error (float): Validation error.
        """
        self.history['epochs'].append(epoch)
        self.history['loss'].append(error)
        self.history['val_loss'].append(val_error)
    
    def evaluate(self, x_test, y_test):        
        """
        Evaluate the NMF model on the given test data.
        
        Args:
            x_test (array-like): Test examples.
            y_test (array-like): Test labels.
            
        Returns:
            float: The mean absolute error (MAE) on the test data.
        """
        error = self.mae(x_test, y_test)
        print(f"validation error: {round(error, 3)}")
        print('MAE:', error)        
        return error
      
    def predict(self, userid, itemid):
        """
        Predict the rating for a given user and item.
        
        Args:
            userid (int): User ID.
            itemid (int): Item ID.
            
        Returns:
            float: The predicted rating.
        """
        u = self.uencoder.transform([userid])[0]
        i = self.iencoder.transform([itemid])[0]
        
        # Calculate the rating prediction
        r = np.dot(self.P[u], self.Q[i])
        return r


## Fit

In [23]:
## initial rating
m = ratings['User_id'].nunique()   # всего пользователей
n = ratings['Registration number'].nunique()   # всего элементов

ratings, uencoder, iencoder = ids_encoder(ratings)

# получение данных в подготовленном виде
raw_examples, raw_labels = get_examples_and_labels(ratings)

# train test split
(x_train, x_test), (y_train, y_test) = ttsplit(examples=raw_examples, labels=raw_labels)

m = ratings['User_id'].nunique()  
n = ratings['Registration number'].nunique() 

# обучаем
nmf = NMF(ratings, m, n, uencoder, iencoder, K=10, lambda_P=0.6, lambda_Q=0.6)
history = nmf.fit(x_train, y_train, epochs=15, validation_data=(x_test, y_test))

Training NMF ...
k=10
epoch 1/15 - loss: 0.635 - val_loss: 0.854
epoch 2/15 - loss: nan - val_loss: 0.802
epoch 3/15 - loss: nan - val_loss: 0.79
epoch 4/15 - loss: nan - val_loss: 0.782
epoch 5/15 - loss: nan - val_loss: 0.775
epoch 6/15 - loss: nan - val_loss: 0.768
epoch 7/15 - loss: nan - val_loss: 0.763
epoch 8/15 - loss: nan - val_loss: 0.759
epoch 9/15 - loss: nan - val_loss: 0.755
epoch 10/15 - loss: nan - val_loss: 0.752
epoch 11/15 - loss: nan - val_loss: 0.749
epoch 12/15 - loss: nan - val_loss: 0.746
epoch 13/15 - loss: nan - val_loss: 0.744
epoch 14/15 - loss: nan - val_loss: 0.741
epoch 15/15 - loss: nan - val_loss: 0.739


In [24]:
nmf.evaluate(x_test, y_test)

validation error: 0.739
MAE: 0.7391805920159994


0.7391805920159994

## Predict

In [25]:
def user2userPredictions(userid, pred_path, reg):
    """
    Generate predictions for each user and save them to the file predictionNMF.csv
    
    Args:
        userid (int): User ID.
        pred_path (str): Path to save the predictions.
        reg (dict): Dictionary mapping registration numbers to some values.
    """
    # Find registered numbers
    reg_num = set(ratings['Registration number'].tolist())
    user = set(ratings[ratings['User_id'] == userid]['Registration number'].tolist())
    diff = list(reg_num - user)
    
    try:
        # Iterate over selected users for prediction
        for itemid in diff:
            # Make prediction for the user on items
            r_hat = nmf.predict(userid, itemid)
            # Save the prediction
            with open(pred_path, 'a+') as file:
                line = '{},{},{}\n'.format(userid, reg[itemid], r_hat)
                file.write(line)
    except IndexError:
        pass

import sys
import os

def user2userNMF(reg):
    """
    Perform predictions for all users, including users with only 1 rating.
    
    Args:
        reg (dict): Dictionary mapping registration numbers to some values.
    """
    # List of all users
    users = ratings['User_id'].unique()
    
    def _progress(count):
        sys.stdout.write('\rRating predictions. Progress status: %.1f%%' % (float(count/len(users))*100.0))
        sys.stdout.flush()
    
    saved_predictions = 'predictionsNMF.csv'    
    if os.path.exists(saved_predictions):
        os.remove(saved_predictions)
    
    for count, userid in enumerate(users):
        # Make prediction
        user2userPredictions(userid, saved_predictions, reg)
        _progress(count)

def user2userRecommendation(userid, N):
    """
    Generate predictions for a specific user.
    
    Args:
        userid (int): User ID.
        N (int): Number of recommendations to return.
    
    Returns:
        pandas DataFrame: Top N recommendations for the user.
    """
    
    saved_predictions = 'predictionsNMF.csv'
    
    predictions = pd.read_csv(saved_predictions, sep=',', names=['User_id', 'Registration number', 'predicted_rating'])
    predictions = predictions[predictions['User_id'] == userid]
    top_n_list = predictions.sort_values(by=['predicted_rating'], ascending=False)[:N]
    
    top_n_list = pd.merge(top_n_list, providers, on='Registration number', how='inner')
    
    return top_n_list


In [26]:
ratings.userid = uencoder.inverse_transform(ratings['User_id'].to_list())
ratings.itemid = iencoder.inverse_transform(ratings['Registration number'].to_list())
reg_num = set(ratings['Registration number'].tolist())
user = set(ratings[ratings['User_id'] == 0]['Registration number'].tolist())
diff = list(reg_num - user)

In [27]:
NMF.predict(nmf, userid=0, itemid=45)

0.4912798633002122

In [28]:
user2userNMF(reg)

Rating predictions. Progress status: 98.0%

In [33]:
k = user2userRecommendation(0, 5).drop(['Registration number', 'Предмет поставки', 'Важная информация'], axis=1)
# k[k['Сводный индикатор'] == 'Низкий риск']['Наименование'].tolist()
k['Наименование'].tolist()
# k

['СИБИРСКИЙ ДИСТРИБЬЮТОР, ООО',
 'ЭКО-ПИК, ООО',
 'АМУР-ХЭ, ООО',
 'ПИРОГОВСКОЕ, ООО МК',
 'АПЭК, ООО']