<a href="https://colab.research.google.com/github/Rezarsa82/online_retai-_matrix_monitoring/blob/main/matrix_monitoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
import pandas as pd
import zipfile
import requests
from io import BytesIO

url = "https://archive.ics.uci.edu/static/public/352/online+retail.zip"

r = requests.get(url)
zip_file = zipfile.ZipFile(BytesIO(r.content))

print(zip_file.namelist())  # ['Online Retail.xlsx']

excel_file = zip_file.open("Online Retail.xlsx")
df = pd.read_excel(excel_file)

print(df.columns)
print(df.head())


['Online Retail.xlsx']
Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

          InvoiceDate  UnitPrice  CustomerID         Country  
0 2010-12-01 08:26:00       2.55     17850.0  United Kingdom  
1 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
2 2010-12-01 08:26:00       2.75     17850.0  United Kingdom  
3 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  
4 2010-12-01 08:26:00       3.39     17850.0  United Kingdom  


In [41]:
df = df.dropna()
print(df.isna().sum())

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64
InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


In [55]:
df = df[df["Quantity"] > 0]
print((df["Quantity"] <= 0).sum())
num_cancelled = df[df["InvoiceNo"].astype(str).str.startswith("C")].shape[0]
print("تعداد فاکتورهای بازگشتی:", num_cancelled)
df_sorted = df.sort_values('InvoiceDate')

0
تعداد فاکتورهای بازگشتی: 0


In [56]:
item_transaction_matrix = df_sorted.groupby(['InvoiceNo', 'StockCode'])['Quantity'].sum().unstack().fillna(0)

item_transaction_matrix[item_transaction_matrix > 0] = 1

print(item_transaction_matrix.head())

StockCode  10002  10080  10120  10125  10133  10135  11001  15030  15034  \
InvoiceNo                                                                  
536365       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536366       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536367       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536368       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536369       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

StockCode  15036  ...  90214V  90214W  90214Y  90214Z  BANK CHARGES   C2  DOT  \
InvoiceNo         ...                                                           
536365       0.0  ...     0.0     0.0     0.0     0.0           0.0  0.0  0.0   
536366       0.0  ...     0.0     0.0     0.0     0.0           0.0  0.0  0.0   
536367       0.0  ...     0.0     0.0     0.0     0.0           0.0  0.0  0.0   
536368       0.0  ...     0.0     0.0     0.0     0.0         

In [57]:
import random

random_invoices = random.sample(list(item_transaction_matrix.index), 5)

for invoice in random_invoices:
    purchased_items = item_transaction_matrix.loc[invoice][item_transaction_matrix.loc[invoice] == 1]
    print(f"فاکتور {invoice}: {len(purchased_items)} کالا خریداری شده")
    if len(purchased_items) > 0:
        print(f"   نمونه کالاها: {list(purchased_items.index[:3])}")

فاکتور 568919: 32 کالا خریداری شده
   نمونه کالاها: [20725, 20726, 20728]
فاکتور 561800: 8 کالا خریداری شده
   نمونه کالاها: [21928, 21929, 22411]
فاکتور 578015: 10 کالا خریداری شده
   نمونه کالاها: [22086, 22573, 22574]
فاکتور 547516: 2 کالا خریداری شده
   نمونه کالاها: [22423, 22457]
فاکتور 544096: 7 کالا خریداری شده
   نمونه کالاها: [21586, 21621, 21878]


In [60]:
batch_size = 1000
batches = []

for i in range(0, len(item_transaction_matrix), batch_size):
    batch_matrix = item_transaction_matrix.iloc[i:i+batch_size]
    batches.append(batch_matrix)

print(f"\nTotal Batches: {len(batches)}")
print(f"First Batch Shape: {batches[0]}")


Total Batches: 19
First Batch Shape: StockCode  10002  10080  10120  10125  10133  10135  11001  15030  15034  \
InvoiceNo                                                                  
536365       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536366       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536367       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536368       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
536369       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
...          ...    ...    ...    ...    ...    ...    ...    ...    ...   
538830       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
538831       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
538832       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
538836       0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
538839       0.0    0.0    0.0    0.0    0.0    0.

In [61]:
first_batch = batches[0]
first_customer = first_batch.iloc[0]
items_bought = first_customer[first_customer == 1].index.tolist()

print("دسته اول، نفر اول خریداری کرده کالاها:")
print(items_bought)

دسته اول، نفر اول خریداری کرده کالاها:
[21730, 22752, 71053, '84029E', '84029G', '84406B', '85123A']


In [64]:
import numpy as np
import pandas as pd
from scipy import linalg

class RandomGaussianProjection:
    def __init__(self, n_components='auto', eps=0.1, random_state=None):
        self.n_components = n_components
        self.eps = eps
        self.random_state = random_state
        self.components_ = None
        self.n_components_ = None
        self.n_features_ = None

    def _calculate_n_components(self, n_features):
        if self.n_components == 'auto':
            if n_features <= 1:
                return 1
            n_components = int(4 * np.log(n_features) / (self.eps**2 / 2 - self.eps**3 / 3))
            self.n_components_ = max(1, min(n_components, n_features))
        else:
            self.n_components_ = min(self.n_components, n_features)
        return self.n_components_

    def _generate_random_matrix(self, n_features):
        rng = np.random.RandomState(self.random_state)

        n_components = self._calculate_n_components(n_features)
        self.components_ = rng.normal(
            loc=0,
            scale=1/np.sqrt(n_components),
            size=(n_components, n_features)
        )
        return self.components_

    def fit(self, X):
        if X is None or len(X) == 0:
            raise ValueError("X cannot be None or empty")

        if isinstance(X, pd.DataFrame):
            X = X.values
        elif not isinstance(X, np.ndarray):
            X = np.array(X)

        n_samples, self.n_features_ = X.shape

        if self.n_features_ <= 1:
            raise ValueError("X must have at least 2 features")

        self._generate_random_matrix(self.n_features_)
        return self

    def transform(self, X):
        if self.components_ is None:
            raise ValueError("Model must be fitted before transformation")

        if isinstance(X, pd.DataFrame):
            X = X.values
        elif not isinstance(X, np.ndarray):
            X = np.array(X)

        if X.shape[1] != self.components_.shape[1]:
            raise ValueError(f"Input has {X.shape[1]} features, but model was fitted with {self.components_.shape[1]} features")

        return X @ self.components_.T

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

    def inverse_transform(self, X_transformed):
        if self.components_ is None:
            raise ValueError("Model must be fitted before inverse transformation")

        components_pinv = linalg.pinv(self.components_)
        return X_transformed @ components_pinv.T

    def reconstruction_error(self, X):
        X_transformed = self.transform(X)
        X_reconstructed = self.inverse_transform(X_transformed)
        return np.mean((X - X_reconstructed) ** 2)

    def __repr__(self):
        return f"RandomGaussianProjection(n_components={self.n_components_})"

In [66]:
rgp = RandomGaussianProjection(n_components=3, random_state=42)
transformed = rgp.fit_transform(first_batch)
print(f"Success! Reduced from {first_batch.shape} to {transformed.shape}")
print(f"Components shape: {rgp.components_.shape}")
reconstructed = rgp.inverse_transform(transformed)
print(f"Reconstruction works: {reconstructed.shape == first_batch.shape}")

Success! Reduced from (1000, 3665) to (1000, 3)
Components shape: (3, 3665)
Reconstruction works: True


In [67]:
import numpy as np
from scipy import linalg
import warnings

class IncrementalPCA:
    def __init__(self, n_components=None, batch_size=None, copy=True, whiten=False):
        self.n_components = n_components
        self.batch_size = batch_size
        self.copy = copy
        self.whiten = whiten

        self.components_ = None
        self.explained_variance_ = None
        self.explained_variance_ratio_ = None
        self.singular_values_ = None
        self.mean_ = None
        self.var_ = None
        self.n_samples_seen_ = 0
        self.n_features_ = None

    def _validate_parameters(self, n_features):
        if self.n_components is None:
            self.n_components_ = n_features
        else:
            self.n_components_ = min(self.n_components, n_features)

        if self.batch_size is None:
            self.batch_size_ = 5 * self.n_components_
        else:
            self.batch_size_ = self.batch_size

        if self.n_components_ < 1:
            raise ValueError("n_components must be >= 1")

        if self.batch_size_ < 1:
            raise ValueError("batch_size must be >= 1")

    def _init_stats(self, n_features):
        self.n_features_ = n_features
        self.mean_ = np.zeros(n_features)
        self.var_ = np.zeros(n_features)
        self.n_samples_seen_ = 0

    def _update_stats(self, X):
        n_samples, n_features = X.shape

        if n_features != self.n_features_:
            raise ValueError(f"Expected {self.n_features_} features, got {n_features}")

        col_mean = np.mean(X, axis=0)
        col_var = np.var(X, axis=0)

        if self.n_samples_seen_ == 0:
            self.mean_ = col_mean
            self.var_ = col_var
        else:
            total_samples = self.n_samples_seen_ + n_samples

            delta = col_mean - self.mean_
            self.mean_ = (self.n_samples_seen_ * self.mean_ + n_samples * col_mean) / total_samples

            self.var_ = (
                (self.n_samples_seen_ * self.var_ +
                 n_samples * col_var +
                 self.n_samples_seen_ * n_samples * delta**2 / total_samples)
                / total_samples
            )

        self.n_samples_seen_ += n_samples

    def _initialize_components(self, X):
        n_samples, n_features = X.shape

        if n_samples < self.n_components_:
            raise ValueError(f"Need at least {self.n_components_} samples for initialization")

        X_centered = X - self.mean_
        U, S, Vt = linalg.svd(X_centered, full_matrices=False)

        self.components_ = Vt[:self.n_components_]
        self.singular_values_ = S[:self.n_components_]

        self.explained_variance_ = (S[:self.n_components_] ** 2) / (n_samples - 1)
        total_variance = np.sum(self.var_)
        self.explained_variance_ratio_ = self.explained_variance_ / total_variance

    def _incremental_update(self, X):
        n_samples, n_features = X.shape

        X_centered = X - self.mean_

        if self.components_ is None:
            self._initialize_components(X)
            return

        X_projected = X_centered @ self.components_.T

        X_residual = X_centered - X_projected @ self.components_

        if X_residual.shape[0] > 0:
            U_res, S_res, V_res_t = linalg.svd(X_residual, full_matrices=False)

            stacked = np.vstack([
                self.components_ * self.singular_values_[:, np.newaxis],
                V_res_t * S_res[:, np.newaxis]
            ])

            U_final, S_final, V_final_t = linalg.svd(stacked, full_matrices=False)

            self.components_ = V_final_t[:self.n_components_]
            self.singular_values_ = S_final[:self.n_components_]

            self.explained_variance_ = (self.singular_values_ ** 2) / (self.n_samples_seen_ - 1)
            total_variance = np.sum(self.var_)
            self.explained_variance_ratio_ = self.explained_variance_ / total_variance

    def partial_fit(self, X):
        X = self._validate_data(X)

        if self.n_features_ is None:
            self._init_stats(X.shape[1])

        self._validate_parameters(X.shape[1])
        self._update_stats(X)
        self._incremental_update(X)

        return self

    def fit(self, X):
        X = self._validate_data(X)

        if self.n_features_ is None:
            self._init_stats(X.shape[1])

        self._validate_parameters(X.shape[1])

        if len(X) <= self.batch_size_:
            self.partial_fit(X)
        else:
            for i in range(0, len(X), self.batch_size_):
                batch = X[i:i + self.batch_size_]
                self.partial_fit(batch)

        return self

    def transform(self, X):
        if self.components_ is None:
            raise ValueError("Model must be fitted before transformation")

        X = self._validate_data(X)
        X_centered = X - self.mean_
        X_transformed = X_centered @ self.components_.T

        if self.whiten:
            X_transformed /= np.sqrt(self.explained_variance_)

        return X_transformed

    def inverse_transform(self, X):
        if self.components_ is None:
            raise ValueError("Model must be fitted before inverse transformation")

        if self.whiten:
            X = X * np.sqrt(self.explained_variance_)

        X_original = X @ self.components_ + self.mean_
        return X_original

    def _validate_data(self, X):
        if isinstance(X, np.ndarray):
            if self.copy:
                X = X.copy()
        else:
            X = np.array(X)

        if len(X.shape) != 2:
            raise ValueError("X must be a 2D array")

        return X

    def reconstruction_error(self, X):
        X_transformed = self.transform(X)
        X_reconstructed = self.inverse_transform(X_transformed)
        return np.mean((X - X_reconstructed) ** 2)

    def get_covariance(self):
        if self.components_ is None:
            raise ValueError("Model must be fitted first")

        components = self.components_
        explained_variance = self.explained_variance_

        if self.whiten:
            components = components * np.sqrt(explained_variance[:, np.newaxis])

        return components.T @ np.diag(explained_variance) @ components

    def get_precision(self):
        covariance = self.get_covariance()
        return linalg.pinv(covariance)

    def __repr__(self):
        return (f"IncrementalPCA(n_components={self.n_components}, "
                f"batch_size={self.batch_size_}, "
                f"n_samples_seen={self.n_samples_seen_})")

In [70]:
import numpy as np
from scipy import linalg

class FrequentDirections:
    def __init__(self, n_components, verbose=False):
        self.n_components = n_components
        self.verbose = verbose
        self.sketch_ = None
        self.n_features_ = None
        self.n_samples_seen_ = 0

    def _initialize_sketch(self, n_features):
        self.sketch_ = np.zeros((self.n_components, n_features))
        self.n_features_ = n_features
        if self.verbose:
            print(f"Initialized sketch: {self.sketch_.shape}")

    def _svd_shrink(self, sketch):
        U, s, Vt = linalg.svd(sketch, full_matrices=False)

        if self.verbose:
            print(f"Singular values before shrink: {s}")

        if len(s) >= self.n_components:
            delta_squared = s[self.n_components-1] ** 2
            s_shrunk = np.sqrt(np.maximum(s**2 - delta_squared, 0))
        else:
            s_shrunk = s.copy()

        if self.verbose:
            print(f"Singular values after shrink: {s_shrunk}")

        sketch_shrunk = U @ np.diag(s_shrunk) @ Vt
        return sketch_shrunk

    def partial_fit(self, X):
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        n_batch, n_features = X.shape

        if self.sketch_ is None:
            self._initialize_sketch(n_features)
        elif n_features != self.n_features_:
            raise ValueError(f"Expected {self.n_features_} features, got {n_features}")

        self.n_samples_seen_ += n_batch

        if self.verbose:
            print(f"\nProcessing batch with {n_batch} samples")
            print(f"Total samples seen: {self.n_samples_seen_}")

        for i in range(n_batch):
            row = X[i:i+1]

            zero_row_indices = np.where(np.all(self.sketch_ == 0, axis=1))[0]

            if len(zero_row_indices) > 0:
                insert_idx = zero_row_indices[0]
                self.sketch_[insert_idx] = row
                if self.verbose:
                    print(f"Inserted row at position {insert_idx}")
            else:
                if self.verbose:
                    print("Sketch full - performing SVD shrink")

                self.sketch_ = self._svd_shrink(self.sketch_)
                self.sketch_[-1] = row
                if self.verbose:
                    print("Replaced last row with new data")

        return self

    def fit(self, X, batch_size=1000):
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        n_samples, n_features = X.shape

        if self.sketch_ is None:
            self._initialize_sketch(n_features)

        for i in range(0, n_samples, batch_size):
            batch = X[i:i+batch_size]
            self.partial_fit(batch)

        return self

    def get_covariance(self):
        if self.sketch_ is None:
            raise ValueError("Model must be fitted first")
        return self.sketch_.T @ self.sketch_

    def row_norms(self):
        if self.sketch_ is None:
            raise ValueError("Model must be fitted first")
        return np.sum(self.sketch_**2, axis=1)

    def transform(self, X):
        if self.sketch_ is None:
            raise ValueError("Model must be fitted first")

        if not isinstance(X, np.ndarray):
            X = np.array(X)

        sketch_pinv = linalg.pinv(self.sketch_)
        return X @ sketch_pinv.T

    def approximate_matrix(self):
        if self.sketch_ is None:
            raise ValueError("Model must be fitted first")

        U, s, Vt = linalg.svd(self.sketch_, full_matrices=False)
        rank = np.sum(s > 1e-10)
        return U[:, :rank] @ np.diag(s[:rank]) @ Vt[:rank, :]

    def reconstruction_error(self, X):
        X_approx = self.approximate_matrix()
        if X.shape[0] * X.shape[1] > 1e6:
            sample_size = min(1000, X.shape[0])
            indices = np.random.choice(X.shape[0], sample_size, replace=False)
            error = np.mean(np.sum((X[indices] - X_approx[indices])**2, axis=1))
        else:
            error = np.mean(np.sum((X - X_approx)**2, axis=1))
        return error

    def __repr__(self):
        return f"FrequentDirections(n_components={self.n_components}, samples_seen={self.n_samples_seen_})"