In [504]:
from functools import reduce
from random import seed
from typing import List, Optional, Tuple
def _read_text(path: str) -> str:
    """Return the full contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read, instead of lingering until garbage collection.
    """
    # NOTE(review): the platform default encoding is used, as before; consider
    # passing an explicit encoding once the corpus encoding is confirmed.
    with open(path) as handle:
        return handle.read()


# Raw corpus: one string per document, kept in file order.
TEXTS = [
    _read_text("data/french/1.txt"),
    _read_text("data/french/2.txt"),
    _read_text("data/french/3.txt"),
    _read_text("data/french/4.txt"),
]

# Individual names are kept for any cell that refers to a single document.
TEXT1, TEXT2, TEXT3, TEXT4 = TEXTS

In [505]:
# Character count of every document in the corpus.
list(map(len, TEXTS))

[10202, 10965, 8067, 10609]

In [506]:
import re
# Character class of symbols that are deleted outright by `process`.
exclude = r"[0123456789/%²°+()]"
# Character class of symbols that `process` turns into a single space each.
replace = r"[;:?.!,«»\n“”]"


def process(text: str) -> str:
    """Clean up raw text before tokenisation.

    Every character matched by ``exclude`` is removed, every character
    matched by ``replace`` becomes one space, and the result is lower-cased.

    Args:
        text: the raw document.

    Returns:
        The cleaned, lower-cased document.
    """
    without_excluded = re.sub(exclude, "", text)
    spaced = re.sub(replace, " ", without_excluded)
    return spaced.lower()

# Replace every raw document with its cleaned version.
TEXTS = list(map(process, TEXTS))

In [507]:
import itertools
# Every distinct character that still occurs in the cleaned corpus.
symbols = {char for text in TEXTS for char in text}
symbols

{' ',
 "'",
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'à',
 'â',
 'ç',
 'è',
 'é',
 'ê',
 'î',
 'ï',
 'ô',
 'ù',
 'û',
 'œ',
 '—',
 '’'}

In [508]:
from typing import List

def tokenize(text: str) -> List[str]:
    """Split ``text`` into tokens.

    A token is a maximal run of word characters and apostrophes (both the
    straight and the typographic one); any other character is a separator.

    Args:
        text: the document to split.

    Returns:
        The tokens in order of appearance.
    """
    return re.findall(r"[\w’']+", text)

# One token list per document, in the same order as TEXTS.
tokens = list(map(tokenize, TEXTS))
tokens

[['l’ètonnante',
  'rèmunèration',
  'que',
  'versait',
  'france',
  'inter',
  'à',
  'bernard',
  'guetta',
  'son',
  'chroniqueur',
  'vedette',
  'de',
  'la',
  'chute',
  'de',
  'l’urss',
  'à',
  'la',
  'poignèe',
  'de',
  'mains',
  'historique',
  'entre',
  'donald',
  'trump',
  'et',
  'kim',
  'jong',
  'un',
  'le',
  'journaliste',
  'bernard',
  'guetta',
  'a',
  'inlassablement',
  'dècryptè',
  'la',
  'marche',
  'du',
  'monde',
  'durant',
  'ses',
  'annèes',
  'aux',
  'commandes',
  'de',
  'la',
  'chronique',
  'gèopolitique',
  'de',
  'france',
  'inter',
  'une',
  'longèvitè',
  'exceptionnelle',
  'qui',
  'a',
  'valu',
  'à',
  'l’eurodèputè',
  'macroniste',
  'un',
  'traitement',
  'de',
  'faveur',
  'au',
  'sein',
  'de',
  'la',
  'station',
  'avant',
  'qu’il',
  'passe',
  'la',
  'main',
  'à',
  'pierre',
  'haski',
  'à',
  'la',
  'rentrèe',
  'france',
  'inter',
  'lui',
  'versait',
  'ainsi',
  'un',
  'salaire',
  'net',
  'ann

In [509]:
# Number of distinct tokens in each document.
[len(vocabulary) for vocabulary in map(set, tokens)]

[708, 790, 536, 712]

In [510]:
# Number of distinct tokens across the whole corpus.
len(set().union(*tokens))

2123

In [511]:
import numpy as np

def build_table(tokens: List[List[str]]) -> Tuple[List[str], np.ndarray]:
    """Build the term-by-document count matrix for a tokenised corpus.

    Args:
        tokens: one list of tokens per document.

    Returns:
        A ``(lexicon, matrix)`` pair. ``lexicon`` is the sorted list of
        distinct tokens; ``matrix`` is a float array of shape
        ``(len(lexicon), len(tokens))`` in which ``matrix[r, c]`` is the
        number of times ``lexicon[r]`` occurs in document ``c``.
    """
    # Sort the vocabulary: iteration order of a set of strings changes
    # between interpreter runs, which would shuffle the matrix rows on every
    # kernel restart and make the notebook outputs irreproducible.
    lexicon = sorted(set(itertools.chain.from_iterable(tokens)))
    row_of = {word: row for row, word in enumerate(lexicon)}

    matrix = np.zeros((len(lexicon), len(tokens)))
    for col, doc_tokens in enumerate(tokens):
        for word in doc_tokens:
            matrix[row_of[word], col] += 1
    return lexicon, matrix

# Raw term counts: one row per lexicon entry, one column per document.
lexicon, matrix = build_table(tokens)
matrix

array([[0., 0., 1., 0.],
       [0., 0., 2., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.]])

In [512]:
# Dampen the raw counts with an element-wise square root.
# NOTE(review): this overwrites `matrix`, so re-running the cell applies the
# square root again; re-run the build_table cell first if it must be repeated.
matrix = np.sqrt(matrix)
matrix

array([[0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.        , 1.41421356, 1.        ],
       [0.        , 0.        , 0.        , 1.        ],
       ...,
       [0.        , 1.        , 0.        , 0.        ],
       [0.        , 1.        , 0.        , 0.        ],
       [0.        , 0.        , 1.        , 0.        ]])

In [513]:
def normalize(m: np.ndarray) -> np.ndarray:
    """Scale every row of a term-by-document matrix by its IDF weight.

    For a row that is strictly positive in ``ni`` of the ``k`` columns, the
    weight is ``log(k / ni)``. A row with no positive entry gets weight 0
    (it previously raised ``ZeroDivisionError``).

    Args:
        m: 2-D array with one row per term and one column per document.

    Returns:
        A new float array of the same shape; ``m`` itself is not modified.
    """
    m = np.asarray(m, dtype=float)
    n, k = m.shape

    # Number of documents in which each term has a positive weight.
    doc_freq = np.count_nonzero(m > 0, axis=1)

    # Only evaluate the logarithm where it is defined; the rest stays 0.
    idf = np.zeros(n)
    present = doc_freq > 0
    idf[present] = np.log(k / doc_freq[present])

    return m * idf[:, np.newaxis]
# Apply the per-term weighting computed by `normalize` to the damped counts.
matrix = normalize(matrix)
matrix

array([[0.        , 0.        , 1.38629436, 0.        ],
       [0.        , 0.        , 0.98025814, 0.69314718],
       [0.        , 0.        , 0.        , 1.38629436],
       ...,
       [0.        , 1.38629436, 0.        , 0.        ],
       [0.        , 1.38629436, 0.        , 0.        ],
       [0.        , 0.        , 1.38629436, 0.        ]])

In [514]:
def remove_common(a: np.ndarray) -> np.ndarray:
    """Drop the rows that have a non-zero value in every column.

    A row is kept when at least one of its entries is exactly zero. Rows
    that are entirely zero are therefore kept as well.

    Args:
        a: 2-D array with one row per term and one column per document.

    Returns:
        The sub-array of ``a`` made of the kept rows, in their original order.
    """
    # Test for a zero entry directly. Multiplying the row and comparing the
    # product with 0 can underflow to 0.0 for rows of small non-zero values,
    # which wrongly kept rows that have no zero entry at all.
    has_zero = np.any(a == 0, axis=1)
    return a[has_zero]

# Keep only the rows selected by `remove_common`.
matrix = remove_common(matrix)
matrix

array([[0.        , 0.        , 1.38629436, 0.        ],
       [0.        , 0.        , 0.98025814, 0.69314718],
       [0.        , 0.        , 0.        , 1.38629436],
       ...,
       [0.        , 1.38629436, 0.        , 0.        ],
       [0.        , 1.38629436, 0.        , 0.        ],
       [0.        , 0.        , 1.38629436, 0.        ]])

In [515]:
def calc_distances(matrix: np.ndarray) -> np.ndarray:
    """Compute the pairwise cosine similarity between the columns of ``matrix``.

    Despite the function name, the result is a similarity: entry ``[i, j]``
    is ``dot(col_i, col_j) / (|col_i| * |col_j|)``, so identical directions
    score 1 and orthogonal columns score 0.

    Args:
        matrix: 2-D array of shape ``(n, k)``; each column is one vector.

    Returns:
        A symmetric ``(k, k)`` float array. Entries that involve an all-zero
        column are ``nan`` (0/0), as the similarity is undefined there.
    """
    matrix = np.asarray(matrix, dtype=float)

    # Euclidean norm of every column, computed once instead of once per pair.
    norms = np.sqrt(np.sum(np.power(matrix, 2), axis=0))

    # All pairwise dot products in one matrix product. This also avoids
    # storing a (1, 1) array into a scalar cell, which NumPy has deprecated.
    dot_products = np.dot(matrix.T, matrix)

    return dot_products / (norms[:, np.newaxis] * norms[np.newaxis, :])

# Pairwise document comparison on the weighted term matrix.
distances = calc_distances(matrix)
distances

array([[1.        , 0.03982039, 0.02002469, 0.03020083],
       [0.03982039, 1.        , 0.0206269 , 0.02750137],
       [0.02002469, 0.0206269 , 1.        , 0.07581763],
       [0.03020083, 0.02750137, 0.07581763, 1.        ]])

In [516]:
def calc_jaccard(token_lists: Optional[List[List[str]]] = None) -> np.ndarray:
    """Compute the pairwise Jaccard index between document vocabularies.

    The Jaccard index of two documents is the size of the intersection of
    their distinct-token sets divided by the size of their union.

    Args:
        token_lists: one list of tokens per document. When omitted, the
            notebook-level ``tokens`` variable is used, which keeps the
            original zero-argument call working.

    Returns:
        A symmetric ``(k, k)`` float array with ones on the diagonal. Two
        documents that are both empty are treated as identical (index 1)
        instead of raising ``ZeroDivisionError``.
    """
    if token_lists is None:
        token_lists = tokens  # fall back to the notebook-level variable

    vocabularies = [set(doc) for doc in token_lists]
    k = len(vocabularies)

    # Start from all ones so the diagonal is already correct.
    m = np.ones((k, k))
    for i in range(k):
        for j in range(i + 1, k):
            union_size = len(vocabularies[i] | vocabularies[j])
            if union_size:
                score = len(vocabularies[i] & vocabularies[j]) / union_size
            else:
                score = 1.0  # both vocabularies are empty
            # The index is symmetric, so fill both halves at once.
            m[i, j] = m[j, i] = score
    return m
# Pairwise vocabulary overlap between the documents held in `tokens`.
jaccard = calc_jaccard()
jaccard

array([[1.        , 0.11541325, 0.10088496, 0.11899133],
       [0.11541325, 1.        , 0.09405941, 0.10766962],
       [0.10088496, 0.09405941, 1.        , 0.18181818],
       [0.11899133, 0.10766962, 0.18181818, 1.        ]])