In [31]:
import csv
import pathlib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Input/output locations and the names of the CSV columns the notebook reads.
# The paths are raw strings (r"...") so the Windows backslashes stay literal:
# in an ordinary string "\p", "\d", "\q", "\l" and "\T" are invalid escape
# sequences (Python warns about them), and a valid one such as "\t" or "\n"
# would silently corrupt the path.
CSV_PATH = pathlib.Path(r"f:\phd\draft thesis\quantitative methods\list of provisions.csv")
ID_COLUMN_NAME = "SECTION_NO"
TEXT_COLUMN_NAME = "SECTION_TEXT"

OUT_PATH = pathlib.Path(r"f:\phd\draft thesis\quantitative methods\TEXT SIMILARITIES v2.csv")

## Read text from CSV    

In [32]:
# Map each row's ID column to its text column. Dict insertion order follows
# the row order of the CSV.
data = {}

# newline="" is what the csv module asks for when it is handed a file object:
# without it, line breaks embedded inside quoted fields are not parsed
# correctly. No encoding is passed, so the platform default applies.
with CSV_PATH.open(newline="") as _f:
    reader = csv.DictReader(_f)
    for row in reader:
        # A repeated ID overwrites the text stored for the earlier row.
        data[row[ID_COLUMN_NAME]] = row[TEXT_COLUMN_NAME]

In [33]:
# Display the IDs that were read from the CSV, in insertion order.
data.keys()

dict_keys(['8', '12', '16', '22', '25', '31', '35', '37', '41', '50', '51', '74', '75', '76', '77', '79', '80', '81', '82', '83', '84', '85', '86', '87', '91', '92', '93', '94', '95', '98', '99', '101', '102', '114', '124', '129', '130', '131', '133', '138', '141', '142', '143', '145', '147', '148', '149', '150', '151', '153', '161', '162', '163', '164', '165', '170', '176', '180', '194', '195', '196', '197', '199', '201', '203', '206', '207', '209', '210'])

In [34]:
# Map every ID to its position in `data`. get_feature_dict, sim and most_sim
# use this position to address rows of X and rows/columns of S.
index = {_key: _pos for _pos, _key in enumerate(data)}
index

{'8': 0,
 '12': 1,
 '16': 2,
 '22': 3,
 '25': 4,
 '31': 5,
 '35': 6,
 '37': 7,
 '41': 8,
 '50': 9,
 '51': 10,
 '74': 11,
 '75': 12,
 '76': 13,
 '77': 14,
 '79': 15,
 '80': 16,
 '81': 17,
 '82': 18,
 '83': 19,
 '84': 20,
 '85': 21,
 '86': 22,
 '87': 23,
 '91': 24,
 '92': 25,
 '93': 26,
 '94': 27,
 '95': 28,
 '98': 29,
 '99': 30,
 '101': 31,
 '102': 32,
 '114': 33,
 '124': 34,
 '129': 35,
 '130': 36,
 '131': 37,
 '133': 38,
 '138': 39,
 '141': 40,
 '142': 41,
 '143': 42,
 '145': 43,
 '147': 44,
 '148': 45,
 '149': 46,
 '150': 47,
 '151': 48,
 '153': 49,
 '161': 50,
 '162': 51,
 '163': 52,
 '164': 53,
 '165': 54,
 '170': 55,
 '176': 56,
 '180': 57,
 '194': 58,
 '195': 59,
 '196': 60,
 '197': 61,
 '199': 62,
 '201': 63,
 '203': 64,
 '206': 65,
 '207': 66,
 '209': 67,
 '210': 68}

## Vectorise text

Use a [tf-idf vectoriser](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) to tokenise each text and turn it into a vector of tf-idf weights, one weight per vocabulary word.

More info on the [sklearn feature extraction page](https://scikit-learn.org/stable/modules/feature_extraction.html).

Also [sklearn page on working with text data](https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html).

In [35]:
# Vocabulary pruning:
#   min_df=2   -> ignore terms that appear in fewer than 2 texts
#   max_df=0.5 -> ignore terms that appear in more than half of the texts
vectorizer = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
)

# The texts are passed in the iteration order of `data`, so row i of X is the
# text whose ID has position i in that order.
_documents = data.values()
X = vectorizer.fit_transform(_documents)

In [36]:
# vocabulary_ maps each kept term to its column in X, so its length is the
# vocabulary size. It is used instead of get_feature_names(), which was
# removed in scikit-learn 1.2; vocabulary_ exists on old and new versions.
print(f"Num words in vocabulary: {len(vectorizer.vocabulary_)}")

Num words in vocabulary: 574


In [37]:
# Shape of the tf-idf matrix produced by fit_transform above.
print(f"(Num texts, Num vocab words): {X.shape}")

(Num texts, Num vocab words): (69, 574)


## View tfidf vector for a text



In [38]:
def get_feature_dict(key):
    """Return the stored tf-idf weights of one text as ``{token: weight}``.

    Parameters
    ----------
    key
        ID of the text; must be a key of the module-level ``index``.

    Returns
    -------
    dict
        Maps each vocabulary token that has a stored entry in that text's row
        of ``X`` to its tf-idf weight.

    Raises
    ------
    KeyError
        If ``key`` is not present in ``index``.
    """
    # Build the column -> token lookup once per call rather than once per
    # token. Sorting the terms by their column number reproduces the list
    # get_feature_names() used to return; that method was removed in
    # scikit-learn 1.2, whereas vocabulary_ is available on every version.
    vocabulary = vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)

    # X[...] is a one-row sparse matrix; in LIL form rows[0] holds its column
    # indices and data[0] the matching weights.
    row = X[index[key]].tolil()
    return {
        feature_names[col]: weight
        for col, weight in zip(row.rows[0], row.data[0])
    }

In [39]:
# Example: the tf-idf weights stored for the text with ID "8".
get_feature_dict("8")

{'addition': 0.09720939359727047,
 'after': 0.05644416391793208,
 'amount': 0.27701722550560454,
 'association': 0.1630159374539253,
 'capital': 0.09233907516853486,
 'declaration': 0.11085497392823201,
 'divided': 0.10317018161010706,
 'each': 0.07869349483757332,
 'england': 0.07614749351271792,
 'fixed': 0.11085497392823201,
 'following': 0.15229498702543584,
 'formed': 0.08822127961014528,
 'having': 0.07614749351271792,
 'he': 0.07869349483757332,
 'herein': 0.07614749351271792,
 'his': 0.06299206996726549,
 'into': 0.06970538085044811,
 'ireland': 0.08150796872696266,
 'its': 0.05763159475302075,
 'kingdom': 0.11085497392823201,
 'last': 0.09233907516853486,
 'less': 0.09720939359727047,
 'liability': 0.15229498702543584,
 'limited': 0.3386171314016396,
 'members': 0.13227676818142547,
 'memorandum': 0.18467815033706972,
 'name': 0.23608048451271996,
 'no': 0.0678623883960011,
 'number': 0.08150796872696266,
 'office': 0.07869349483757332,
 'on': 0.09524497066203115,
 'one': 0.06

## Calculate similarities

Calculate cosine similarity between each pair of texts.

In [40]:
# Pairwise cosine similarity between the rows of X: S[i, j] compares the text
# at position i with the text at position j (positions as stored in `index`).
S = cosine_similarity(X)

In [41]:
def sim(key1, key2, kernel=S):
    """Return the similarity between two texts identified by their IDs.

    Parameters
    ----------
    key1, key2
        IDs of the two texts; both must be keys of the module-level ``index``.
    kernel
        Square similarity matrix whose rows and columns follow the positions
        in ``index``. Defaults to ``S`` as it was when this function was
        defined, so re-run this cell after recomputing ``S``.

    Returns
    -------
    The entry ``kernel[index[key1], index[key2]]``.

    Raises
    ------
    KeyError
        If either key is not present in ``index``.
    """
    idx1 = index[key1]
    idx2 = index[key2]
    # Read from the matrix that was passed in; previously the argument was
    # ignored and the global S was always used.
    return kernel[idx1, idx2]

In [42]:
# Example: similarity between the texts with IDs '8' and '12'.
sim('8', '12')

0.38277761096501695

## Most similar for given key

In [43]:
def most_sim(key, kernel=S):
    """Rank every text by its similarity to the text with ID ``key``.

    Parameters
    ----------
    key
        ID of the reference text; must be a key of the module-level ``index``.
    kernel
        Square similarity matrix whose rows and columns follow the positions
        in ``index``. Defaults to ``S`` as it was when this function was
        defined, so re-run this cell after recomputing ``S``.

    Returns
    -------
    list of (similarity, id) tuples
        One tuple per ID in ``index`` (the reference text included), sorted
        in descending order. Tuples are compared whole, so equal similarities
        are ordered by ID string, also descending.

    Raises
    ------
    KeyError
        If ``key`` is not present in ``index``.
    """
    idx = index[key]
    # Use the matrix that was passed in; previously the argument was ignored
    # and the global S was always used.
    sims = list(zip(kernel[idx], index.keys()))
    return sorted(sims, reverse=True)

In [44]:
# Example: all texts ranked by similarity to the text with ID '8'.
most_sim('8')

[(1.0000000000000002, '8'),
 (0.38277761096501695, '12'),
 (0.27915556867069735, '25'),
 (0.25632827894340726, '176'),
 (0.2547251432647411, '41'),
 (0.23300020880534209, '22'),
 (0.2179430040787526, '196'),
 (0.19811229863832847, '180'),
 (0.19344491392414703, '161'),
 (0.18148579740988863, '95'),
 (0.17999732225439707, '16'),
 (0.17857648981528715, '199'),
 (0.15279082734770075, '94'),
 (0.1483729341202468, '81'),
 (0.14001592368633314, '50'),
 (0.1389851294491498, '31'),
 (0.13627054400681618, '210'),
 (0.1311989225041349, '101'),
 (0.12540225861245993, '79'),
 (0.12408845133037036, '203'),
 (0.11148761532485377, '75'),
 (0.1112343221123979, '129'),
 (0.10812861670983617, '51'),
 (0.10809616329565404, '153'),
 (0.10455586042390519, '35'),
 (0.09365319467237374, '142'),
 (0.09314122679793117, '131'),
 (0.09270782526315954, '148'),
 (0.09186266802258304, '165'),
 (0.0899020134730844, '133'),
 (0.08869868653845643, '209'),
 (0.08736379127355254, '194'),
 (0.07892360205032169, '138'),
 

## Write similarity CSV

In [45]:
# Write one row per unordered pair of texts: key1, key2, similarity.
# newline='' is required by the csv module for files it writes to; without it
# every row ends in "\r\r\n" on Windows, which shows up as a blank line after
# each record.
with open(OUT_PATH, 'w', newline='') as _f:
    writer = csv.writer(_f)
    writer.writerow(["key1", "key2", "similarity"])
    keys = list(index.keys())
    N = len(keys)
    # Only j > i is visited, so each pair is written once and self-pairs are
    # skipped.
    for i in range(N):
        key1 = keys[i]
        for j in range(i + 1, N):
            key2 = keys[j]
            similarity = S[i, j]
            writer.writerow([key1, key2, similarity])