In [None]:
## Load Libraries
import pandas as pd
import numpy as np
import sys
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib.cm as cm
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so prefer the new name and fall back to the
# legacy one on older installations.
_WHITEGRID_STYLE = ('seaborn-v0_8-whitegrid'
                    if 'seaborn-v0_8-whitegrid' in plt.style.available
                    else 'seaborn-whitegrid')
plt.style.use(_WHITEGRID_STYLE)
%matplotlib inline

from PIL import Image

In [None]:
## Locate the data folder; on Google Colab this first mounts Google Drive
if 'google.colab' not in sys.modules:
    # Not running on Colab: read the data from a relative 'Data/' folder
    DATA_DIR = 'Data/'
else:
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    # Change the path below, starting from
    # /content/drive/MyDrive/Colab Notebooks/, depending on how data is
    # organized inside your Colab Notebooks folder in Google Drive
    DIR = '/content/drive/MyDrive/Colab Notebooks/MAHE/MSIS Coursework/OddSem2023MAHE'
    DATA_DIR = f'{DIR}/Data/'

---

**The following user-defined function will be used to plot the components of a vector against their index**

---

In [None]:
def plotveccomp(x, name = ' ', color = 'black', marker = '*', axis = None):
  """Plot the components of a vector against their index.

  Besides the component values, three dashed horizontal reference lines are
  drawn: the mean of the components (blue) and the mean minus/plus one
  standard deviation (red).

  Parameters
  ----------
  x : sequence or 1-D array of numbers
      Vector whose components are plotted.
  name : str
      Text appended to 'Component plot of ' to form the axes title.
  color : str
      Colour of the component line.
  marker : str
      Marker drawn at every component.
  axis : matplotlib Axes, optional
      Axes to draw on. When omitted, the current axes (plt.gca()) are used.
  """
  # The default axis=None previously crashed on the first ax.plot call; fall
  # back to the current axes so the function works without an explicit axis.
  ax = plt.gca() if axis is None else axis
  n = len(x)
  component_index = range(n)
  # Compute the statistics once instead of re-evaluating them for every line
  mean, std = np.mean(x), np.std(x)
  ax.plot(component_index, x, color = color, marker = marker)
  for level, line_color in ((mean, 'blue'), (mean - std, 'red'), (mean + std, 'red')):
    ax.plot(component_index, [level]*n, linewidth = 1, linestyle = 'dashed', color = line_color)
  ax.set_xlabel('Index')
  ax.set_ylabel('Value')
  ax.set_title('Component plot of '+name)

---

**Read hourly temperature data**

---

In [None]:
## Load the hourly temperature table (one column per city) indexed by timestamp
FILE = DATA_DIR + 'temperature.csv'
df_temp = (
    pd.read_csv(FILE, sep = ",", header = 0, skiprows = [1])  # file line 1 (right after the header) is skipped
    .assign(datetime = lambda frame: pd.to_datetime(frame['datetime'], format='%Y-%m-%d %H:%M:%S'))
    .set_index('datetime')
)
df_temp.head()

---

**Extract daily temperature vector for San Francisco**

---

In [None]:
# Positional locators: the San Francisco column and the rows of each day
sf_col = df_temp.columns.get_loc('San Francisco')
rows_oct = df_temp.index.get_loc('2012-10-02')
rows_dec = df_temp.index.get_loc('2012-12-02')

# Temperature vector for San Francisco for 2012-10-02
t1 = df_temp.iloc[rows_oct, sf_col].values
print(t1)

# Temperature vector for San Francisco for 2012-12-02
t2 = df_temp.iloc[rows_dec, sf_col].values
print(t2)

# Component plots of both days, side by side
fig, axes = plt.subplots(1, 2, figsize = (10,6))
ax1, ax2 = axes
fig.tight_layout(pad = 4.0)
for vec, day, panel in ((t1, '2012-10-02', ax1), (t2, '2012-12-02', ax2)):
    plotveccomp(vec, 'San Francisco Temperature ' + day, 'black', '*', panel)

---

**Wikipedia articles as vectors**

---

In [None]:
## Install the wikipedia package
!pip install wikipedia

# Load the Wikipedia package
import wikipedia as wiki

---

**Extract Wikipedia content for 3 topics: Jungle Book, Harry Potter, Tarzan**

---

In [None]:
# Fetch the Wikipedia pages for the 3 topics (auto-suggestion turned off)
a = wiki.page('jungle book', auto_suggest = False)
b = wiki.page('Harry Potter', auto_suggest = False)
c = wiki.page('Tarzan', auto_suggest = False)
# Word count of each article, in the order a, b, c
for article in (a, b, c):
    print(len(article.content.split()))

In [None]:
# Print the content of the article stored in `a`
print(a.content)

---

**Import text-to-vector vectorizers**

---

In [None]:
## Import text-to-vector vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

---

**Vectorize the articles**

---

In [None]:
# Vectorizer objects; only the TF-IDF vectorizer is fitted in this cell
cv = CountVectorizer()
tfidfv = TfidfVectorizer()
# One document per article, in the order a, b, c
documents = [a.content, b.content, c.content]
tfidf_sparse = tfidfv.fit_transform(documents)
# Dense array with one row per article
A = np.asarray(tfidf_sparse.todense())
# (to inspect the vocabulary, call tfidfv.get_feature_names_out())
print(A.shape)
print(A[0])

---
The patient data matrix:

![Patient dataset](https://bl3302files.storage.live.com/y4mTMCQdiTnIFj1IALg09CRz7pPWl0g4HpigAPbwyMmF0QNliGAgK3aEsBESo0BNFCy-0-kR6pllskO1DPVt2-76bYsQaACRWhkOebqJ545BbtWcGr1CJG72BZJPrYbQDWNAC0h1EHhpewBlORT_xtahEu-bite73OVi-4CzGeQf6GDw11H6kn72VocdC2bLAsJ?width=256&height=167&cropmode=none)

---

**Addition and subtraction of vectors, scalar multiplication (apply operation componentwise)**

![Vector addition](https://bl3302files.storage.live.com/y4mMlnDRWzIoNKWynOZFhzhFNDlReoFxf7XwSeFwNWW8f1lu5ssj_SvgMAEN9BWiQ2F-meER7rD2an2n2tfDoWffBHE8aD_WBsL0LAbHxnIpZtZu6hNJAvZ88m746S_ktA9-h-oo108AQjkXQHkYrgJ5AUCpvKB2dipeNG1VfIK_38Q8fsq6OKD43adplgy0H1k?width=200&height=80&cropmode=none)

![Vector subtraction](https://bl3302files.storage.live.com/y4mnQkNUONVVKJJ6dCEqV9lEuP360lE0yRumSIgl9LaQH_qBqjgI9wvUd64xJ-UNIjR7wJXZyaXZ_kf1_gAB9sXjMWaMxWhSnX6zcyvVtTrCDeO1MNWzj3A1YqI5YLALK-CGCSMurNV938QLH3C2u1-BE8_addFYSeO7DmCKz5TdWGf7qtC8M9rRN26RMqpk8iu?width=200&height=80&cropmode=none)

![Scalar-vector multiplication](https://bl3302files.storage.live.com/y4mYNwLMmuKRl3sNDSo0yyXYs0KFw1LBnQCU6nAgSawanlGNgLq7Bd93DQ0ojamRpGLx_PZvnsSG-6K-3TsdDctw5sm-QxnWUHSTJGalDR4JmUp27_Hf3ESAQukZ1Jk5G16ykO7H3AKmLSQxE4vVIAtMFbCnyxtsQEfpyb_SK5jIjVtjl7yoFcBDzsRDGzo5cZM?width=200&height=80&cropmode=none)

---

In [None]:
# Patient data matrix: one row per patient; columns are HR, BP and Temp
patients = pd.DataFrame({
    'HR' : [76, 74, 72, 78],
    'BP' : [126, 120, 118, 136],
    'Temp': [38.0, 38.0, 37.5, 37.0],
})
X = np.array(patients)

print(X)

# Vector addition: componentwise sum of rows 1 and 2
print(X[1] + X[2])

# Vector subtraction: componentwise difference of rows 1 and 2
print(X[1] - X[2])

# Scalar-vector multiplication: scale the Temp column by 9/5, then add 32
print((9/5)*X[:, 2] + 32)

# Average patient: sum of all four rows scaled by 1/4
print((1/4)*(X[0] + X[1] + X[2] + X[3]))

In [None]:
# How are the elements of the patient data matrix structured?
print(X[0])
print(X[1])
print(X[2])
print(X[3])

---

**Load a color image as 3D tensor**

<img src = 'https://drive.google.com/uc?id=1-kENYuqShwKAVI3oqryeMA_aXemoXmIW'>

---

In [None]:
## Load the image and turn it into numeric arrays
FILENAME = DATA_DIR + 'lion.jpg'
img = Image.open(FILENAME)
# Gray-scale version of the image (mode 'L'); not used further in this cell
imggray = img.convert('L')
# Array form of the original image; the shape printed below shows its dimensions
X = np.array(img)
# All entries of X laid out as a single 1-D vector (a copy)
x = X.flatten()
print(X.shape)
print(x.shape)

---

A tensor of dimension 3 corresponding to 4 time stamps, 3 samples, 2 features (HR and BP)

---

In [None]:
# Create a tensor of dimension 3:
# 4 time stamps x 3 samples x 2 features
T = np.array([
    [[74, 128], [79, 116], [71, 116]],
    [[78, 118], [82, 124], [72, 128]],
    [[84, 138], [84, 130], [74, 120]],
    [[82, 126], [76, 156], [82, 132]],
])
print(T.shape)
print(T)
print(T[0])     # first time stamp: a 3 x 2 matrix
print(T[0, 0])  # first sample at the first time stamp: a vector of 2 features

---

Permute the axes of the tensor so that it represents (patients, timestamps, features). That is, the first two axes are swapped while the features remain the last index.

---

In [None]:
# Swap the first two axes of T; the last axis stays in place
T_reshaped = np.transpose(T, (1, 0, 2))
print(T_reshaped)

---

$l_2$ norm or the geometric length of a vector denoted as $\lVert a\rVert$ tells us how long a vector is. In 2-dimensions, $\lVert a\rVert_2 = \sqrt{a_1^2+a_2^2}$ and in $n$-dimensions, $\lVert a\rVert_2 = \sqrt{a_1^2+a_2^2+\cdots+a_n^2}.$

---

In [None]:
# l2 norm of the vector (1, 2, 3), i.e. sqrt(1 + 4 + 9)
a = np.array([1, 2, 3])
np.linalg.norm(a, ord = 2)

---

The dot product between two vectors is simply a pairwise multiplication followed by a summation: $a\cdot b = a_1\times b_1+a_2\times b_2+\cdots+a_n\times b_n.$

---

In [None]:
# Dot product of (1, 2, 3) and (4, 5, 6): 1*4 + 2*5 + 3*6
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
np.dot(a, b)

---

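By the Cauchy-Schwarz inequality, for nonzero vectors $a$ and $b$, $-1\leq\frac{a\cdot b}{\lVert a\rVert\lVert b\rVert}\leq1.$ This ratio is the cosine of the angle between $a$ and $b$.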

---

In [None]:
# Dot product of a and b divided by the product of their l2 norms
np.dot(a, b) / (np.linalg.norm(a)*np.linalg.norm(b))

---

**Calculate Euclidean and cosine dissimilarity between the Wikipedia articles**

---

In [None]:
# Functions for calculating distances between two vectors
def d_E(a, b):
    """Euclidean distance: the l2 norm of the difference a - b."""
    return np.linalg.norm(a-b)

def d_cos(a, b):
    """Cosine dissimilarity: 1 minus the dot product of a and b divided by the product of their norms."""
    norm_product = np.linalg.norm(a)*np.linalg.norm(b)
    return 1-(np.dot(a,b)/norm_product)

# Compare Jungle Book (row 0 of A) with Harry Potter (row 1) and Tarzan (row 2):
# first the two Euclidean dissimilarities, then the two cosine dissimilarities
jungle_book = A[0]
for dissimilarity in (d_E, d_cos):
    for other_article in (A[1], A[2]):
        print(dissimilarity(jungle_book, other_article))

---

Matrix-vector product is simply a sequence of dot products of the rows of matrix (seen as vectors) with the vector

---

In [None]:
# 3 x 4 matrix and a vector with 4 components
A = np.array([[ 1, 2, -1, -1],
              [ 2, 4, -2,  3],
              [-1, 1, -2,  4]])
x = np.array([-1, 1, 1, 0])
print(A)
print(A.shape)
# Each row of A is itself a vector
for row in A:
    print(row)
print(A[0].shape)
print(x)
# Matrix-vector product: one dot product per row of A
print(np.dot(A, x))

---

Linear combination of columns of the matrix using the components of the vector is equivalent to the dot product of the rows of the matrix with the vector

----

In [None]:
print(A)
print(x)
# Dot product of rows of A with x
print(np.dot(A, x))
# Linear combination of columns of A using components of x
print(x[0]*A[:, 0] + x[1]*A[:, 1] + x[2]*A[:, 2] + x[3]*A[:, 3])
# Same linear combination via broadcasting: x is multiplied along every row of
# A and each row is then summed. This replaces x[range(4)] * A[:, range(4)],
# which hard-coded the number of columns and made needless fancy-index copies.
np.sum(x * A, axis = 1)

---

Matrix-matrix product using the patient data matrix:

![Patient dataset](https://bl3302files.storage.live.com/y4mTMCQdiTnIFj1IALg09CRz7pPWl0g4HpigAPbwyMmF0QNliGAgK3aEsBESo0BNFCy-0-kR6pllskO1DPVt2-76bYsQaACRWhkOebqJ545BbtWcGr1CJG72BZJPrYbQDWNAC0h1EHhpewBlORT_xtahEu-bite73OVi-4CzGeQf6GDw11H6kn72VocdC2bLAsJ?width=256&height=167&cropmode=none)

---

In [None]:
# Weight matrix: its 3 rows line up with the 3 columns (HR, BP, Temp) of X
W = np.array([[0.8, 0.1],
              [0.1, 0.4],
              [0.1, 0.4]])
# Patient data matrix: one row per patient
patients = pd.DataFrame({
    'HR' : [76, 74, 72, 78],
    'BP' : [126, 120, 118, 136],
    'Temp': [38.0, 38.0, 37.5, 37.0],
})
X = np.array(patients)
print(X)
print(W)
# Matrix-matrix product: every row of X combined with every column of W
print(np.dot(X, W))

---

Tensor-vector product is simply a sequence of matrix-vector products which in turn are a sequence of dot products of vectors

---

In [None]:
# Vector with one component per entry along the last axis of T
x = np.array([1, 0])
print(T)
print(x)
print(T.shape)
print(x.shape)
# Tensor-vector product: the last axis of T is summed against x
tensor_vector_product = np.dot(T, x)
print(tensor_vector_product)