# In [10]:
import pandas as pd
import numpy as np
import re

# Load ./ASRS_data.csv (path relative to the current working directory),
# parsed with "|" as the field separator.
data = pd.read_csv("./ASRS_data.csv", sep="|")

def naif_regex_tokenize(text):
    """
    Naive tokenizer: lower-case the text and extract every maximal run
    of the letters a-z, i.e. every match of the regular expression
    "[a-z]+".

    Digits, punctuation, whitespace and any other character are never
    part of a token; they only separate tokens.

    Returns a list with all the tokens, in order of appearance.
    """
    lowered = text.lower()
    return re.findall("[a-z]+", lowered)

def compute_tf(d):
    """
    Compute the augmented term frequency of every term in document d.

    The formula used is

        tf(t, d) = 0.5 + 0.5 * (count(t, d)/max(count(t',d) for t' in d))

    where count(t, d) is the number of times the term t appears in the
    document and max(count(t', d)) is the count of the term that
    appears most often in the document. Dividing by that maximum
    prevents bias in longer documents.

    Keyword arguments:
    d -- the document, as text accepted by naif_regex_tokenize

    returns
        pandas Series indexed by term holding tf(t, d) for each term of
        the document. A document without any token yields an empty
        float Series instead of raising.
    """
    tokens = naif_regex_tokenize(d)
    if not tokens:
        # No token means no maximum count to divide by: max() of an
        # empty sequence raises ValueError, so return "no terms" here.
        return pd.Series([], index=pd.Index([], dtype=object), dtype=float)
    term_counts = pd.Series(tokens).value_counts()
    return 0.5 + 0.5 * (term_counts / term_counts.max())

def compute_idf(D):
    """
    Compute the inverse document frequency of every term of a corpus.

    The input D is a list of pandas.Series, one per document, each
    indexed by the terms of that document (as produced by the
    function compute_tf).

    The number of documents is divided by the number of documents in
    which each term appears, and the natural logarithm of that ratio
    is returned as a pandas.Series indexed by term.
    """
    n_documents = len(D)
    # Only the index labels matter here: a label occurring m times in
    # the concatenated index is counted as appearing in m documents.
    stacked_terms = pd.concat(D).index
    document_frequency = stacked_terms.value_counts()
    return np.log(n_documents / document_frequency)

def compute_tf_idf_document(tf_document, idf):
    """Compute the tf-idf for each term in a document of the corpus

    Keyword arguments:
    tf_document -- pandas Series indexed by term with the frequency of
                   each term inside the document
    idf -- the idf value for each term in the corpus, looked up by term

    returns
        tf_document with every value multiplied by the idf of its term
    """
    # Gather the idf weights in the same order as the document's terms
    # so the product below is a plain element-wise multiplication.
    weights = np.array([idf[term] for term in tf_document.index])
    return tf_document * weights

def compute_tf_idf_corpus(D):
    """Compute the tf-idf for each term in a corpus

    Keyword arguments:
    D -- pandas Series containing a collection of documents in text format

    returns
        list of pandas Series containing the tf-idf(t, d, D) for each term
        inside each document of the corpus D
    """
    # Term frequencies are computed once and reused for both the idf
    # and the final per-document weighting.
    tf_per_document = list(map(compute_tf, D))
    idf = compute_idf(tf_per_document)
    return [compute_tf_idf_document(tf, idf) for tf in tf_per_document]

# Print the first summary as a quick look at the data.
# NOTE(review): the error text recorded at the end of this file is
# KeyError: 'summary' -- presumably the loaded frame has no column with
# exactly that name; confirm the CSV header and separator.
s = data['summary'][0]
print(s)

# First ten summaries; not used again in the visible part of this file.
D1 = data['summary'][:10]

# tf-idf weights of every summary: a list with one pandas Series per document.
tf_idf = compute_tf_idf_corpus(data.loc[:, "summary"])

# Bare expression: has no effect as a script statement (notebook-style
# inspection of the second document's weights).
tf_idf[1]

print(data["summary"][1])

# NOTE(review): each Series in tf_idf is indexed by its own document's
# terms, so the entries presumably differ in length; confirm that
# np.array on such a list gives the matrix layout that is intended.
tf_idf = np.array(tf_idf)



def distance(v1, v2):
    """
    Euclidean distance between v1 and v2: the square root of the sum
    of the squared element-wise differences.
    v1 and v2 are numpy arrays
    """
    delta = v1 - v2
    squared_sum = np.sum(delta ** 2)
    return np.sqrt(squared_sum)

def assign(vectors, centers):
    """
    Assign each vector to its closest center.

    vectors is a numpy matrix; every row is assigned separately.
    centers is a numpy matrix; every row is one center.

    Returns a numpy array with one value per vector: the row index of
    the closest center (as measured by distance). The array is created
    with np.zeros, so the indices are stored as floats.
    """
    labels = np.zeros(vectors.shape[0])
    for row in range(vectors.shape[0]):
        # Distance from this vector to every center, one entry per center.
        to_each_center = np.apply_along_axis(distance, 1, centers, vectors[row])
        labels[row] = np.argmin(to_each_center)
    return labels

def compute_centers(vectors, groups):
    """
    Compute the center (column-wise mean) of each group of vectors.

    vectors is a numpy matrix with one vector per row.
    groups holds the group assigned to each row of vectors; the number
    of groups is taken to be the largest assignment plus one.

    Returns a numpy matrix with one center per row, row i being the
    mean of the vectors assigned to group i.
    """
    n_groups = int(max(groups)) + 1
    new_centers = np.zeros([n_groups, vectors.shape[1]])
    for label in range(n_groups):
        member_rows = np.where(groups == label)[0]
        new_centers[label] = vectors[member_rows, :].mean(0)
    return new_centers

def choose_first_centers(vectors, k):
    """
    Pick the starting centers for the k-means algorithm: the rows of
    vectors found at the first k positions of a random shuffle of the
    row indices, so no row is picked twice.
    """
    order = np.arange(0, vectors.shape[0])
    np.random.shuffle(order)  # shuffles in place
    chosen_rows = order[:k]
    return vectors[chosen_rows, :]

def kmeans(vectors, k, max_iterations=500):
    """
    Naive implementation of k-means algorithm

    Starts from k rows of vectors picked by choose_first_centers, then
    alternates assign and compute_centers until the centers stop moving
    or max_iterations refinement rounds have been run.

    Keyword arguments:
    vectors -- numpy matrix with one data point per row
    k -- number of centers to start from
    max_iterations -- upper bound on the number of refinement rounds run
                      after the initial assign/compute round (default 500)

    returns
        (new_centers, centers_list): the last computed centers and the
        list of every center matrix produced, starting with the initial
        selection
    """
    centers = choose_first_centers(vectors, k)
    centers_list = [centers]
    groups = assign(vectors, centers)
    new_centers = compute_centers(vectors, groups)
    centers_list.append(new_centers)
    nb_iter = 0
    # Continue only while the iteration budget is not exhausted AND the
    # centers still move. The budget must be a stop condition joined with
    # "and": joined with "or" it never limits anything and, once the
    # counter passes the limit, keeps the loop alive forever.
    while nb_iter < max_iterations and np.sum(np.abs(centers - new_centers)) > 0:
        centers = new_centers
        groups = assign(vectors, centers)
        new_centers = compute_centers(vectors, groups)
        centers_list.append(new_centers)
        nb_iter += 1
    return new_centers, centers_list

# Synthetic demo data: four clouds of 100 random 2-D points, shifted
# left, right, down and up, each row then scaled to unit length.
v1 = np.random.random([100, 2])-[2, 0]
v2 = np.random.random([100, 2])+[2, 0]
v3 = np.random.random([100, 2])-[0, 2]
v4 = np.random.random([100, 2])+[0, 2]
norm_v1 = np.linalg.norm(v1, axis=1)
v1 = v1 / norm_v1[:, None]
norm_v2 = np.linalg.norm(v2, axis=1)
v2 = v2 / norm_v2[:, None]
norm_v3 = np.linalg.norm(v3, axis=1)
v3 = v3 / norm_v3[:, None]
norm_v4 = np.linalg.norm(v4, axis=1)
v4 = v4 / norm_v4[:, None]
vectors = np.concatenate([v1, v2, v3, v4])

# Cluster only once `vectors` exists; c holds the final centers and
# c_list the centers produced at every step of the run.
c, c_list = kmeans(vectors, 5)

# Build one frame of data points and one frame of centers per step, then
# concatenate once: this avoids re-copying a growing frame in the loop
# and avoids concatenating onto an empty DataFrame.
frames = []
for i, centers in enumerate(c_list):
    frames.append(pd.DataFrame({
        "x": vectors[:, 0],
        "y": vectors[:, 1],
        "p_type": ["data_point"] * vectors.shape[0],
        "iteration": [i] * vectors.shape[0],
    }))
    # Size the label columns from this step's own centers so the column
    # lengths always agree with the x/y columns.
    frames.append(pd.DataFrame({
        "x": centers[:, 0],
        "y": centers[:, 1],
        "p_type": ["center"] * centers.shape[0],
        "iteration": [i] * centers.shape[0],
    }))
df = pd.concat(frames)

# NOTE(review): `px` is not imported in the visible part of this file --
# presumably `import plotly.express as px` from an earlier cell; confirm.
px.scatter(df, x="x", y="y", animation_frame="iteration", color="p_type")


# Recorded notebook error output: KeyError: 'summary'