### Matrix Multiplication

In [1]:
import numpy as np

In [2]:
m1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
m2 = np.array([[2, 4, 6], [1, 3, 5], [1, 2, 3]])
m3 = np.zeros((3, 3))

In [3]:
m1.dot(m2)

array([[  7,  16,  25],
       [ 19,  43,  67],
       [ 31,  70, 109]])

In [4]:
np.matmul(m1, m2)

array([[  7,  16,  25],
       [ 19,  43,  67],
       [ 31,  70, 109]])

In [5]:
def matmul(m1, m2):
    """Multiply two 2-D arrays with the textbook triple loop.

    Parameters
    ----------
    m1 : numpy.ndarray
        Left operand with shape ``(rows, inner)``.
    m2 : numpy.ndarray
        Right operand with shape ``(inner, cols)``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(rows, cols)`` holding the product ``m1 @ m2``.

    Raises
    ------
    ValueError
        If the number of columns of ``m1`` differs from the number of rows
        of ``m2``.
    """
    row_m1, col_m1 = m1.shape
    row_m2, col_m2 = m2.shape
    # The inner dimensions must agree; otherwise the loop below would either
    # index out of bounds or silently ignore part of m1.
    if col_m1 != row_m2:
        raise ValueError(
            "shapes %s and %s are not aligned: %d (m1 columns) != %d (m2 rows)"
            % (m1.shape, m2.shape, col_m1, row_m2)
        )

    m3 = np.zeros((row_m1, col_m2))
    # iterate through rows of M1
    for i in range(row_m1):
        # iterate through columns of M2
        for j in range(col_m2):
            # iterate through cols of M1 and rows of M2
            for k in range(col_m1):
                m3[i, j] += m1[i, k] * m2[k, j]
    return m3

In [6]:
matmul(m1, m2)

array([[  7.,  16.,  25.],
       [ 19.,  43.,  67.],
       [ 31.,  70., 109.]])

### numpy vectorize

In [7]:
def myfunc(a, b):
    """Return the difference ``a - b`` when ``a`` exceeds ``b``, else the sum ``a + b``."""
    return a - b if a > b else a + b

In [8]:
vectorize_myfunc = np.vectorize(myfunc)
vectorize_myfunc([1, 2, 3, 4], 2)

array([3, 4, 1, 2])

### numpy min, argmin

In [9]:
a = np.array([[10 , 4,  2],
              [1,  6,  3]])

In [10]:
# min()
# Built-in reduction over the whole array.
print(a.min())
# Manual equivalent: start from the first element and keep the smallest
# value seen while scanning every row and every item in it.
stored = a[0, 0]
for row in a:
    for item in row:
        if item < stored:
            stored = item
print(stored)

1
1


In [11]:
# argmin()
np.argmin(a, axis=1)

array([2, 0], dtype=int64)

In [12]:
# argmin()
# Manual per-row argmin: for each row, record the position of its smallest item.
min_list = []
for row in a:
    # smallest value seen so far in this row, and where it was seen
    stored = row[0]
    min_index = index = 0
    for j in row:
        # strict '<' keeps the first position when the minimum is repeated
        if j < stored:
            stored = j
            min_index = index
        index += 1
    min_list.append(min_index)
min_list

[2, 0]

### Check Version

In [1]:
import numpy as np
import sys
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler

print(sys.version)

3.11.3 | packaged by Anaconda, Inc. | (main, Apr 19 2023, 23:46:34) [MSC v.1916 64 bit (AMD64)]


In [2]:
!python --version

Python 3.11.3


In [3]:
print('numpy: ', np.__version__)

numpy:  1.24.3


### Sklearn RobustScaler for data with outliers

In [20]:
X_train = np.array([[1, 2, 3], [4, 5, 6], [3, 6, 8], [7, 999, 999]])

In [23]:
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled.round(2)

array([[-1.11, -0.01, -0.02],
       [ 0.22, -0.  , -0.  ],
       [-0.22,  0.  ,  0.  ],
       [ 1.56,  3.97,  3.96]])

In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled.round(2)

array([[-1.27, -0.58, -0.58],
       [ 0.12, -0.58, -0.58],
       [-0.35, -0.57, -0.57],
       [ 1.5 ,  1.73,  1.73]])

In [28]:
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled.round(2)

array([[0.14, 0.  , 0.  ],
       [0.57, 0.01, 0.01],
       [0.43, 0.01, 0.01],
       [1.  , 1.  , 1.  ]])

In [None]:
Intuition:

Some neural network classifier's probabilistic output values for three labels:
        
                Label1    Label2    Label3
Output_Unit_1     0.98       0.02      0
Output_Unit_2     0.33       0.33      0.33
Output_Unit_3     0.25       0.50      0.25
Output_Unit_4     0.05       0.45      0.499

How would you extract the most probable predicted label from the 'softmax' output? - Use argmax.

In [2]:
arr = np.array([
    [0.98, 0.02, 0],
    [0.33, 0.33, 0.33],
    [0.25, 0.50, 0.25],
    [0.05, 0.45, 0.499]
])

In [3]:
np.argmax(arr, axis=1)

array([0, 0, 1, 2], dtype=int64)

In [4]:
class_labels = {0: 'Label1', 1: 'Label2', 2: 'Label3'}

In [8]:
list(map(lambda x: class_labels[x], np.argmax(arr, axis=1)))

['Label1', 'Label1', 'Label2', 'Label3']

### numpy.linalg.solve

In [9]:
A = np.array([[1,2],[3,4]]) 
B = np.array([[10],[20]])

In [10]:
A

array([[1, 2],
       [3, 4]])

In [11]:
B

array([[10],
       [20]])

In [13]:
res = np.linalg.solve(A, B)
res

array([[0.],
       [5.]])

In [14]:
np.matmul(A, res)

array([[10.],
       [20.]])

### Item-item collaborative filtering
Explanation on how it works from:
https://www.geeksforgeeks.org/item-to-item-based-collaborative-filtering/  => no code in here

Formula for Cosine Similarity:

$$Similarity(\vec A, \vec B) = \frac{\vec A \cdot \vec B}{||\vec A||*||\vec B||}$$

Prediction Computation: The second stage is the recommendation itself. To generate a rating for a missing item, it uses the items already rated by the user that are most similar to that item. In other words, the prediction is based on the user's ratings of similar products: the rating for a particular item is the weighted sum of the ratings of the other similar products, with the item-item similarities as weights.

$$rating(U, I_i) = \frac {\sum_{j}rating(U, I_j)*s_{ij}}{\sum_{j}s_{ij}}$$ 

In [28]:
import numpy as np
import pandas as pd

In [29]:
df = pd.DataFrame({'Item_1': [2, 5, 3, np.nan],
                   'Item_2': [np.nan, 2, 3, 2],
                   'Item_3': [3, np.nan, 1, 2]}, index=['User_1', 'User_2', 'User_3', 'User_4'])
df

Unnamed: 0,Item_1,Item_2,Item_3
User_1,2.0,,3.0
User_2,5.0,2.0,
User_3,3.0,3.0,1.0
User_4,,2.0,2.0


Step 1: Finding similarities of all the item pairs.

Form the item pairs. In this example, the item pairs are:
* (Item_1, Item_2)
* (Item_1, Item_3)
* (Item_2, Item_3)

Take the item pairs one at a time. For each pair, find all the users who have rated both items in the pair. Form a vector for each item from those users' ratings and calculate the similarity between the two items using the cosine formula stated above.

* Sim(Item1, Item2)
In the table, we can see that only User_2 and User_3 have rated both Item_1 and Item_2.
Thus, let I1 be vector for Item_1 and I2 be for Item_2.

$$Similarity(Item1, Item2) = \frac{(5 * 2) + (3 * 3)} {\sqrt{5^{2} + 3^{2}}{\sqrt{2^{2} + 3^{2}}}} = 0.90$$

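* Sim(Item1, Item3)
Only User_1 and User_3 have rated both Item_1 and Item_3.
$$Similarity(Item1, Item3) = \frac{(2 * 3) + (3 * 1)} {\sqrt{2^{2} + 3^{2}}{\sqrt{3^{2} + 1^{2}}}} = 0.789$$

* Sim(Item2, Item3)
Only User_3 and User_4 have rated both Item_2 and Item_3.
$$Similarity(Item2, Item3) = \frac{(3 * 1) + (2 * 2)} {\sqrt{3^{2} + 2^{2}}{\sqrt{1^{2} + 2^{2}}}} = 0.868$$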

Step 2: Generating the missing ratings in the table

Now, in this step we calculate the ratings that are missing in the table.

Rating of Item_2 for User_1:

$$R(U1, I2) = \frac{(2 * 0.90) + (3 * 0.868)}{(0.90 + 0.868)} = 2.49$$

In [30]:
def cosine_similarity(A, B):
    """Return the cosine of the angle between vectors ``A`` and ``B``.

    This is the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    dot_product = np.dot(A, B)
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    return dot_product / norm_product

In [31]:
def item_overlap(col1, col2, data=None):
    """Return the ratings of two items restricted to users who rated both.

    Parameters
    ----------
    col1, col2 : str
        Column names of the two items to compare.
    data : pandas.DataFrame, optional
        Ratings table (rows are users, columns are items, NaN means "not
        rated"). Defaults to the notebook-level ``df`` so existing calls
        keep working.

    Returns
    -------
    tuple of numpy.ndarray
        Two equally long arrays: the ``col1`` ratings and the ``col2``
        ratings of the rows where neither value is NaN, in row order.
    """
    frame = df if data is None else data
    first = frame.loc[:, col1]
    second = frame.loc[:, col2]
    # keep only the rows where both items have a rating
    rated_both = first.notna() & second.notna()
    return first[rated_both].to_numpy(), second[rated_both].to_numpy()

In [32]:
from itertools import combinations

In [25]:
df.columns

Index(['Item_1', 'Item_2', 'Item_3'], dtype='object')

In [29]:
list(combinations(df.columns, 2))

[('Item_1', 'Item_2'), ('Item_1', 'Item_3'), ('Item_2', 'Item_3')]

In [33]:
# all pairwise combinations of items (df.columns)
pairwise_combinations = list(combinations(df.columns, 2))

# Map each item pair, e.g. ('Item_1', 'Item_2'), to the cosine similarity of
# the two items, computed only over the users who rated both of them.
cos_sims = {}
for tuple_ in pairwise_combinations:
    # NOTE: this rebinds the notebook-level names `a` and `b`.
    a, b = item_overlap(tuple_[0], tuple_[1])
    cos_sims[tuple_] = cosine_similarity(a, b)

In [34]:
cos_sims

{('Item_1', 'Item_2'): 0.9037378388935388,
 ('Item_1', 'Item_3'): 0.7893522173763263,
 ('Item_2', 'Item_3'): 0.8682431421244593}

In [35]:
# Predict every missing rating (NaN) in df as the similarity-weighted average
# of the ratings the same user gave to the other items:
#     rating(U, I_i) = sum_j rating(U, I_j) * s_ij / sum_j s_ij

# Take the item names from the frame itself rather than hard-coding them.
col_names = np.array(df.columns)
col_labels = dict(enumerate(col_names))

# Read ratings from a snapshot so a value imputed earlier in the loop is never
# fed back in as if the user had actually given it.
observed = df.copy()

# get row index and col index of nan values
idx_row, idx_col = np.where(pd.isnull(observed))

for i, j in zip(idx_row, idx_col):
    col_to_predict = col_labels[j]

    rating_numerator = 0
    rating_denominator = 0
    for col_idx, col in enumerate(col_names):
        if col_idx == j:
            continue

        # only items the user has actually rated contribute to the prediction
        known_rating = observed.iloc[i, col_idx]
        if pd.isnull(known_rating):
            continue

        # cos_sims keys are the pairwise combinations of df.columns, so each
        # key lists the earlier column first, e.g. ('Item_1', 'Item_2').
        # Ordering by column position (not by a string suffix) keeps the
        # lookup correct for any item names.
        pair = (col, col_to_predict) if col_idx < j else (col_to_predict, col)
        sim = cos_sims[pair]

        rating_numerator += sim * known_rating
        rating_denominator += sim

    # leave the cell as NaN when there is nothing to base a prediction on
    if rating_denominator:
        df.iloc[i, j] = rating_numerator / rating_denominator

In [12]:
# df   # before running the above cell code

Unnamed: 0,Item_1,Item_2,Item_3
User_1,2.0,,3.0
User_2,5.0,2.0,
User_3,3.0,3.0,1.0
User_4,,2.0,2.0


In [37]:
df.round(2)   # after running the above cell code

Unnamed: 0,Item_1,Item_2,Item_3
User_1,2.0,2.49,3.0
User_2,5.0,2.0,3.43
User_3,3.0,3.0,1.0
User_4,2.0,2.0,2.0


### Sparse Matrix Operations
There exist various formats in which sparse matrices can be represented. Formats can be divided into two groups:

* Those that support efficient modification, such as DOK (Dictionary of keys), LIL (List of lists), or COO (Coordinate list). These are typically used to construct the matrices.
* Those that support efficient access and matrix operations, such as CSR (Compressed Sparse Row) or CSC (Compressed Sparse Column). The two types of CS (Compressed Sparse) Matrices:
    1. csr_matrix considers row first
    2. csc_matrix considers column first

Coordinate list (COO)

COO stores a list of (row, column, value) tuples. Ideally, the entries are sorted (by row index, then column index) to improve random access times. This is another format which is good for incremental matrix construction.

'coo_matrix' is optimized for constructing a sparse matrix. Internally it differs from csr_matrix, but the two look the same when printed.

Compressed sparse Row (CSR)

The compressed sparse row (CSR) or compressed row storage (CRS) format represents a matrix M by three (one-dimensional) arrays, that respectively contain nonzero values, the extents of rows, and column indices. This format allows fast row access and matrix-vector multiplications.

The CSR format stores a sparse m × n matrix M in row form using three (one-dimensional) arrays (A, IA, JA). Let NNZ denote the number of nonzero entries in M. (Note that zero-based indices shall be used here.)

In [2]:
mat = [[1, 0, 0],
       [5, 0, 2],
       [0, -1, 0],
       [0, 0, 3]]

In [None]:
csr_matrix lists the positions of the non-zero elements of the first row, then of the second row, then of the third, and so on.
For instance, csr_matrix(mat) returns:

(0, 0)  1.0 -- first row
(1, 0)  5.0 -- second row
(1, 2)  2.0 -- second row
(2, 1) -1.0 -- third row
(3, 2)  3.0 -- fourth row

Similarly csc_matrix gives the position of non-zero elements in the first column, then second column, and so on.

(0, 0)  1.0 -- first column
(1, 0)  5.0 -- first column
(2, 1) -1.0 -- second column
(1, 2)  2.0 -- third column
(3, 2)  3.0 -- third column

Both csr_matrix and csc_matrix are optimized for efficient access, matrix operations such as row/column slicing 
and vector operations.

In [3]:
from scipy.sparse import csr_matrix, csc_matrix

In [4]:
csr_matrix(mat)

<4x3 sparse matrix of type '<class 'numpy.intc'>'
	with 5 stored elements in Compressed Sparse Row format>

In [9]:
print(csr_matrix(mat))

  (0, 0)	1
  (1, 0)	5
  (1, 2)	2
  (2, 1)	-1
  (3, 2)	3


### Sparse data structures in Python
https://rushter.com/blog/scipy-sparse-matrices/

In [3]:
import numpy as np
from scipy.sparse import random
from scipy.sparse import csc_matrix

In [2]:
csc_matrix((3, 4), dtype=np.int8).toarray()

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]], dtype=int8)

In [4]:
# get size of a sparse matrix
def get_sparse_size(matrix):
    """Return the storage used by a compressed sparse matrix, in whole KiB.

    Adds up the byte sizes of the matrix's ``data``, ``indptr`` and
    ``indices`` arrays, divides by 1024 and truncates to an ``int``.
    """
    storage_arrays = (matrix.data, matrix.indptr, matrix.indices)
    total_bytes = sum(part.nbytes for part in storage_arrays)
    return int(total_bytes / 1024.)

In [5]:
# create a sparse matrix of size 1000 x 100000
sparse_mat = random(10**3, 10**5, format='csr')

In [15]:
print(sparse_mat[:, 0])

  (116, 0)	0.07412506619622572
  (199, 0)	0.4292374404234104
  (326, 0)	0.6865147992157453
  (594, 0)	0.9992985987289552
  (604, 0)	0.8453082937057
  (633, 0)	0.2976305163871862
  (721, 0)	0.30951536700189275
  (765, 0)	0.6681337189168695
  (913, 0)	0.6540181790970657
  (915, 0)	0.19888838688622723
  (993, 0)	0.24604527550686672


In [21]:
sparse_size = get_sparse_size(sparse_mat)
print('The size of sparse_mat is %d KiB' % sparse_size)

The size of sparse_mat is 11722 KiB


In [20]:
# convert sparse_mat to a regular matrix and get its size
size = sparse_mat.toarray().nbytes / 1024.
print('The size of regular matrix is %d KiB' % size)

The size of regular matrix is 781250 KiB


In [27]:
# size / sparse_size is a factor (dense KiB per sparse KiB), not a percentage,
# so report it as "Nx" rather than with a "%" suffix.
print('Data compression ratio is %.2fx' % (size / sparse_size))

Data compression ratio is 66.65 %


In [57]:
np.random.seed(10)

# Generate a random sparse matrix
matrix = random(5, 5, format='csr', density=0.25)
matrix.toarray()

array([[0.        , 0.72175532, 0.        , 0.        , 0.        ],
       [0.29187607, 0.        , 0.        , 0.        , 0.        ],
       [0.71457578, 0.        , 0.54254437, 0.        , 0.        ],
       [0.        , 0.        , 0.91777412, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.14217005, 0.        ]])

In [58]:
matrix.data[:]

array([0.72175532, 0.29187607, 0.71457578, 0.54254437, 0.91777412,
       0.14217005])

In [59]:
matrix.data.shape

(6,)

In [60]:
# replace all non zero values with index number
matrix.data[:] = np.arange(1, matrix.data.shape[0]+1)

In [61]:
matrix.toarray()

array([[0., 1., 0., 0., 0.],
       [2., 0., 0., 0., 0.],
       [3., 0., 4., 0., 0.],
       [0., 0., 5., 0., 0.],
       [0., 0., 0., 6., 0.]])

In [62]:
matrix.data

array([1., 2., 3., 4., 5., 6.])

In [63]:
matrix.indices

array([1, 0, 0, 2, 2, 3])

In [64]:
matrix.indptr

array([0, 1, 2, 4, 5, 6])

In [67]:
# a simplified algorithm of item indexing
def get_item(row_index, column_index, matrix):
    """Look up a single element of a CSR matrix from its raw arrays.

    Parameters
    ----------
    row_index : int
        Row of the requested element.
    column_index : int
        Column of the requested element.
    matrix : scipy.sparse.csr_matrix
        Matrix exposing the CSR arrays ``data``, ``indices`` and ``indptr``.

    Returns
    -------
    The stored value at ``(row_index, column_index)``, or ``0`` when no value
    is stored there.
    """
    # indptr[r]:indptr[r + 1] delimits row r in both `data` and `indices`
    row_start = matrix.indptr[row_index]
    row_end = matrix.indptr[row_index + 1]

    # stored values of this row and the columns they belong to
    row_values = matrix.data[row_start:row_end]
    row_indices = matrix.indices[row_start:row_end]

    # Find the position of the requested column among the occupied cells.
    # A plain scan is used because list.index() raises ValueError when the
    # column is missing, which made the "return 0" case unreachable.
    for value_index, occupied_column in enumerate(row_indices):
        if occupied_column == column_index:
            return row_values[value_index]

    # non-zero value is not found
    return 0

In [69]:
get_item(0, 1, matrix)

1.0

In [70]:
get_item(3, 2, matrix)

5.0