# Imports

In [1]:
import pandas as pd
import numpy as np

import scipy.linalg as la

from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Expectation

\begin{equation*}
EV = \Sigma P(X_i) \times X_i
\end{equation*}

Note: If we assume that the values an element of a feature can take are all equally likely, then the expected value is just the mean value.

In [2]:
def expectation(a):
    # prb: the probability of the outcome occurring
    prb = 1 / len(a)
     
    # calculating expectation overall
    sum = 0
    for i in range(0, len(a)):
        sum += (a[i] * prb)
         
    # returning expectation as sum
    return float(sum)

In [3]:
X = [1, 2, 3, 4, 5, 6]
print(expectation(X))

3.5


# Variance
\begin{equation*}
\sigma ^2 = \frac{\Sigma (x_i - \bar{x})^2}{n - 1}
\end{equation*}

In [4]:
def variance(data):
    # Number of observations
    n = len(data)
    # Mean of the data
    mean = sum(data) / n
    # Square deviations
    deviations = [(x - mean) ** 2 for x in data]
    # Variance of sample
    variance = sum(deviations) / (n - 1)
    # Variance of population (uncomment it if necessary)
    # variance = sum(deviations) / n
    return variance

In [5]:
X = [800,720,655,655,625,600,590,529,513,502,502,502]
print(variance(X))

9326.628787878786


# Covariance
\begin{equation*}
\text{Covariance(X, Y)} = \frac{\Sigma (X - \bar{X}) (Y - \bar{Y})}{n - 1}
\end{equation*}

In [6]:
X = [1692, 1978, 1884, 2151, 2519]
Y = [68, 102, 110, 112, 154]
cov = np.cov(X, Y)[0][1]
print(cov)

9107.3


# Pearson's Correlation
\begin{equation*}
\text{SD} = \sqrt{\frac{\Sigma (X - \bar{X})^2}{n - 1}}
\end{equation*}
\begin{equation*}
\text{Pearson's Correletion Coefficient} = \frac{Cov(x,y)}{SD(X) \times SD(Y)}
\end{equation*}

In [7]:
def pearson(a, b):
    cov_a_b = np.cov(a, b)[0][1]
    std_a = np.sqrt(variance(a))
    std_b = np.sqrt(variance(b))
    prs = cov_a_b / (std_a * std_b)
    return prs

In [8]:
X = [73, 52, 68, 47, 60, 71, 67, 80, 86, 91, 67, 73, 71, 57, 86, 76, 91, 69, 87, 77]
Y = [90, 74, 91, 62, 63, 78, 60, 89, 82, 105, 76, 82, 93, 73, 82, 88, 97, 80, 87, 95]
prs = pearson(X, Y)
print(prs)

0.7333715932635338


# Distances

In [9]:
pq = [[1, 4], [2, 5]]

### 1. Euclidean

<img src="euclidean.png" width=200>

\begin{equation*}
d(p,q) = \sqrt{\sum\limits_{i=1}^{n} (q_i - p_i)^2}
\end{equation*}

In [10]:
euclid = euclidean_distances(pq, pq)[0][1]
rounded_euclid = round(euclidean_distances(pq, pq)[0][1], 3)
print(rounded_euclid)

1.414


### 2. Manhattan

<img src="manhattan.png" width=200>

Definition: the sum of the distance between rows and the distance between columns

In [11]:
manh = manhattan_distances(pq, pq)[0][1]
rounded_manh = round(manhattan_distances(pq, pq)[0][1], 3)
print(rounded_manh)

2.0


# Eignevectors and Eigenvalues

\begin{equation*}
Av = \lambda v
\end{equation*}
<center> Where A is matrix, v is Eigenvector, and lambda is Eigenvalue </center>

#### Create matrix of shape 4 x 4

In [12]:
n = 4
P = np.random.randint(low=0, high=10, size=(n, n))
print(P)

[[9 6 0 7]
 [1 1 5 1]
 [4 7 9 2]
 [0 0 7 3]]


##### Create a symmetric matrix by multplying P with its transpose
\begin{align} \text{S (Symmetric Matrix)} = PP^T \end{align}

In [13]:
# @ here is used for matrix multiplication, and .T for transposing the matrix
S = P @ P.T
print(S)

[[166  22  92  21]
 [ 22  28  58  38]
 [ 92  58 150  69]
 [ 21  38  69  58]]


In [14]:
evals, evecs = la.eig(S)
print(evals)

[286.1235508 +0.j 100.08059251+0.j  15.39573255+0.j   0.40012414+0.j]


In [15]:
evals = evals.real
print(f"eigenvalues: \n {evals}")
print(f"eigenvectors: \n {evecs}")

eigenvalues: 
 [286.1235508  100.08059251  15.39573255   0.40012414]
eigenvectors: 
 [[-0.61924561 -0.74353251  0.2486993  -0.04292953]
 [-0.25018937  0.30745753  0.13961254 -0.90740481]
 [-0.67929117  0.34967506 -0.60937652  0.21201679]
 [-0.30414415  0.47995481  0.73980893  0.36030883]]


# Similarity Measures

In [16]:
x = "Julie loves me more than Linda loves me"
y = "Jane likes me more than Julie loves me"

### 1. Dice
<img src="dice.png" width="200">

In [17]:
def get_dice_sim(str1, str2):
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    return float(2 * len(c)) / (len(a) + len(b))

In [18]:
print(f"The Dice Similarity Measure is: {round(get_dice_sim(x,y), 4)}")

The Dice Similarity Measure is: 0.7692


### 2. Jaccard

<img src="jaccard.png" width="200">

In [19]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.split())
    b = set(str2.split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c)) 

In [20]:
print(f"The Jaccard Similarity Measure is: {round(get_jaccard_sim(x,y), 4)}")

The Jaccard Similarity Measure is: 0.625


### 3. Cosine

<img src="cosine.png" width="200">

Note: if you want to use Tf-IDF vectorizer, when you invoke the method, type "tfidf" in the third argument

In [21]:
def get_cosine_sim(str1, str2, count_type="count"):
    documents = [str1, str2]
    if count_type == "count":
        count_vectorizer = CountVectorizer()
        sparse_matrix = count_vectorizer.fit_transform(documents)
        doc_term_matrix = sparse_matrix.todense()
        df_count_vectorizer = pd.DataFrame(doc_term_matrix, 
                      columns=count_vectorizer.get_feature_names(), 
                      index=['doc_1', 'doc_2'])
        return cosine_similarity(df_count_vectorizer, df_count_vectorizer)
    elif count_type == "tfidf":
        tfidf_vectorizer = TfidfVectorizer()
        sparse_matrix = tfidf_vectorizer.fit_transform(documents)
        doc_term_matrix = sparse_matrix.todense()
        df_count_vectorizer = pd.DataFrame(doc_term_matrix, 
                      columns=tfidf_vectorizer.get_feature_names(), 
                      index=['doc_1', 'doc_2'])
    return cosine_similarity(tfidf_vectorizer, tfidf_vectorizer)

In [22]:
print(f"The Cosine Similarity Measure is: {round(get_cosine_sim(x, y, 'count')[0][1], 4)}")

The Cosine Similarity Measure is: 0.8216


# Gini Impurity Score

In [23]:
def gini_score(headers, classes):
    count = 0
    header_name = headers[0]
    final_calculation_print = f"Gini ({header_name}) = "
    final_calculation = 0
    total_instance_count = 0
    for x in range(len(classes)):
        total_instance_count += classes[count][3]
        count += 1

    count = 0
    for x in range(len(classes)):

        class_name = classes[count][0]
        yes_count = classes[count][1]
        no_count = classes[count][2]
        total_per_class_count = classes[count][3]
        yes_div_total_square = round((yes_count/total_per_class_count)**2, 4)
        no_div_total_square = round((no_count/total_per_class_count)**2, 4)
        calculation_result_per_class = 1 - yes_div_total_square - no_div_total_square

        print(f"Gini ({headers[0]} = {classes[count][0]})")
        print(f"1 - ({yes_count}/{total_per_class_count})^2 - ({no_count}/{total_per_class_count})^2 = 1 - {yes_div_total_square} - {no_div_total_square} = {round(calculation_result_per_class, 4)}")

        final_calculation_print += f"({total_per_class_count}/{total_instance_count}) * {round(calculation_result_per_class, 4)} + "
        final_calculation += (total_per_class_count/total_instance_count) * calculation_result_per_class
        count += 1
    print("\n")
    print(f"{final_calculation_print[0:-3]} = {round(final_calculation, 4)}")

In [24]:
headers = ["Outlook", "Yes", "No", "Number of instances"]
classes = [["Sunny", 1, 1, 2], ["Overcast", 3, 0, 3], ["Rain", 1, 1, 2]]
gini_score(headers, classes)

Gini (Outlook = Sunny)
1 - (1/2)^2 - (1/2)^2 = 1 - 0.25 - 0.25 = 0.5
Gini (Outlook = Overcast)
1 - (3/3)^2 - (0/3)^2 = 1 - 1.0 - 0.0 = 0.0
Gini (Outlook = Rain)
1 - (1/2)^2 - (1/2)^2 = 1 - 0.25 - 0.25 = 0.5


Gini (Outlook) = (2/7) * 0.5 + (3/7) * 0.0 + (2/7) * 0.5 = 0.2857


In [25]:
headers = ["Temperature", "Yes", "No", "Number of instances"]
classes = [["Hot", 1, 2, 3], ["Cool", 1, 0, 1], ["Mild", 2, 0, 2], ["Warm", 1, 0, 1]]
gini_score(headers, classes)

Gini (Temperature = Hot)
1 - (1/3)^2 - (2/3)^2 = 1 - 0.1111 - 0.4444 = 0.4445
Gini (Temperature = Cool)
1 - (1/1)^2 - (0/1)^2 = 1 - 1.0 - 0.0 = 0.0
Gini (Temperature = Mild)
1 - (2/2)^2 - (0/2)^2 = 1 - 1.0 - 0.0 = 0.0
Gini (Temperature = Warm)
1 - (1/1)^2 - (0/1)^2 = 1 - 1.0 - 0.0 = 0.0


Gini (Temperature) = (3/7) * 0.4445 + (1/7) * 0.0 + (2/7) * 0.0 + (1/7) * 0.0 = 0.1905


In [26]:
headers = ["Humidity", "Yes", "No", "Number of instances"]
classes = [["Humid", 1, 1, 2], ["Not Humid", 4, 1, 5]]
gini_score(headers, classes)

Gini (Humidity = Humid)
1 - (1/2)^2 - (1/2)^2 = 1 - 0.25 - 0.25 = 0.5
Gini (Humidity = Not Humid)
1 - (4/5)^2 - (1/5)^2 = 1 - 0.64 - 0.04 = 0.32


Gini (Humidity) = (2/7) * 0.5 + (5/7) * 0.32 = 0.3714


In [27]:
headers = ["Windy", "Yes", "No", "Number of instances"]
classes = [["Yes", 0, 1, 1], ["No", 5, 1, 6]]
gini_score(headers, classes)

Gini (Windy = Yes)
1 - (0/1)^2 - (1/1)^2 = 1 - 0.0 - 1.0 = 0.0
Gini (Windy = No)
1 - (5/6)^2 - (1/6)^2 = 1 - 0.6944 - 0.0278 = 0.2778


Gini (Windy) = (1/7) * 0.0 + (6/7) * 0.2778 = 0.2381
