### WORKING WITH PREPROCESSINGS IN NLP

### Importing libraries

In [1]:
import pandas as pd
import os
import csv
import re
import nltk
from nltk.stem import WordNetLemmatizer
import torch
from sentence_transformers import SentenceTransformer
import spacy
import faiss
import numpy as np
import warnings

In [2]:
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
# defining the path to parent folder
parent_folder = 'C:/Users/personal/Documents/Dictionary in csv'

In [4]:
# looping through each file in the folder
new_list = []
for file_name in os.listdir(parent_folder):
    if file_name.endswith('.csv'):
        with open(os.path.join(parent_folder, file_name), 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            for row in csv_reader:
                for item in row:
                     if '(pl. )' not in item:
                        new_list.append(item)

### Text preprocessing

#### view unprocessed list

In [5]:
# new_list

#### using regular expression to further clean

In [6]:
# using regex to match
pattern =  re.compile(r'\([^\)]*\)')
for string in new_list:
    match_re = re.match(pattern, string)

In [7]:
# splitting with regular expression
Pattern_2 = re.compile(r'\([^\)]*\)')
list_2 = []
for string in new_list:
    if len(re.split(pattern, string, maxsplit=0)) > 1:
        list_2.append(re.split(pattern, string, maxsplit=0))
    else:
        list_2.append(string)

#### view splitted list

In [8]:
# list_2

In [9]:
# separating the list of words and meanings into key and value pairs of a dictionary
result = {}
for sentence in list_2:
    key = sentence[0] # the first word is the key
    values = sentence[1:] # rest of the words are values
    result[key] = values # add key-value pairs to dictionary

#### viewing the new dictionary

In [10]:
result

{'A ': [' An expletive, void of sense, to fill up the meter'],
 'A- ': [' A, as a prefix to English words, is derived from various sources. ',
  ' It frequently signifies on or in ',
  ', denoting a state, as in afoot, on foot, abed, amiss, asleep, aground, aloft, away ',
  ', and analogically, ablaze, atremble, etc. ',
  ' AS. of off, from, as in adown ',
  '. ',
  ' AS. a- ',
  ', usually giving an intensive force, and sometimes the sense of away, on, back, as in arise, abide, ago. ',
  ' Old English y- or i- ',
  ', which, as a prefix, made no essential addition to the meaning, as in aware. ',
  ' French a ',
  ', as in abase, achieve. ',
  ' L. a, ab, abs, from, as in avert. ',
  ' Greek insep. prefix / without, or privative, not, as in abyss, atheist; akin to E. un-.'],
 'A 1 ': [' A registry mark given by underwriters ',
  ' to ships in first-class condition. Inferior grades are indicated by A 2 and A 3.'],
 'Aam ': [' A Dutch and German measure of liquids, varying in different c

In [11]:
# the above still has some issues so we will use regex to correct the KeyError
pattern_3 = re.compile(r'\)')
problem_keys = [key for key in result.keys() if pattern_3.search(key)]
# removing the problem key
for key in problem_keys:
    del result[key]

In [13]:
spacy_lem_model = spacy.load('en_core_web_sm')

In [14]:
lemmatized_dict = {}
for key in result:
    doc = spacy_lem_model(key)
    lemmatized_key = doc[0].lemma_
    lemmatized_dict[lemmatized_key] = result[key]

#### viewing the new dict with lemmatized key

In [18]:
lemmatized_dict

{'a': [' On tiptoe; eagerly expecting.'],
 'a-': [' A, as a prefix to English words, is derived from various sources. ',
  ' It frequently signifies on or in ',
  ', denoting a state, as in afoot, on foot, abed, amiss, asleep, aground, aloft, away ',
  ', and analogically, ablaze, atremble, etc. ',
  ' AS. of off, from, as in adown ',
  '. ',
  ' AS. a- ',
  ', usually giving an intensive force, and sometimes the sense of away, on, back, as in arise, abide, ago. ',
  ' Old English y- or i- ',
  ', which, as a prefix, made no essential addition to the meaning, as in aware. ',
  ' French a ',
  ', as in abase, achieve. ',
  ' L. a, ab, abs, from, as in avert. ',
  ' Greek insep. prefix / without, or privative, not, as in abyss, atheist; akin to E. un-.'],
 'aam': [' A Dutch and German measure of liquids, varying in different cities, being at Amsterdam about 41 wine gallons, at Antwerp 36 1/2, at Hamburg 38 1/4.'],
 'Aard': [' An edentate mammal, of the genus Orycteropus, somewhat resembl

In [19]:
# looping through the lemmatized dictionary and append values of repeating keys
lemma_appended_dict = {}
for key, value in lemmatized_dict.items():
    if key in lemma_appended_dict:
        lemma_appended_dict[key].extend(values)
    else:
        lemma_appended_dict[key] = value

#### view appended value pair dictionary

In [20]:
#lemma_appended_dict

# Creating a function to choose batches of dictionary(Letter G) 

In [21]:
# First create a function receiving the first letter of the key as input
def filtered_dict(lemma_appended_dict, letter:'str'):
    # regex pattern checking if first letter of key is given
    pattern = re.compile(r'^{}\w*'.format(letter), re.IGNORECASE)
    # filter keys
    filtered_keys = list(filter(pattern.match, lemma_appended_dict.keys()))
    return {key:lemma_appended_dict[key] for key in filtered_keys}

In [22]:
Letter_G_dict = filtered_dict(lemma_appended_dict, 'G')

In [23]:
sentences_G = [value for key, value in Letter_G_dict.items()]

### Looking at the list of sentences from a given Alphabet G

In [24]:
sentences_G

[[' of Gum'],
 [" A roll recting the several stages arranged for a royal progress. Many of them are extant in the herald's office."],
 [' To talk idly; to prate; to chatter.'],
 [' A kind of coarse cloth for packing goods.'],
 [' Alt. of Gaberdine'],
 [' See Gabardine.'],
 [' One addicted to idle talk.'],
 [' Inarticulate sounds rapidly uttered; as of fowls.'],
 [' One who gabbles; a prater.'],
 [' A name originally given by the Italians to a kind of serpentine, later to the rock called euphotide, and now generally used for a coarsely crystalline, igneous rock consisting of lamellar pyroxene ',
  ' and labradorite, with sometimes chrysolite ',
  '.'],
 [' A rent, service, tribute, custom, tax, impost, or duty; an excise.'],
 [' A collector of gabels or taxes.'],
 [' A tax, especially on salt.'],
 [' A gabeler.'],
 [' A beggar with a wallet; a licensed beggar.'],
 [' A lighter, or vessel for inland navigation.'],
 [' An openwork frame, as of poles, filled with stones and sunk, to assist

In [25]:
# checking the lenght of sentences in a list(max and min values) of each list...
max_words = float('-inf')
min_words = float('inf')
for _list in sentences_G:
    word_lenght = len(_list)
    if word_lenght > max_words:
        max_words = word_lenght
    if word_lenght < min_words:
        min_words = word_lenght

#### stopwords removal

In [26]:
stop_words = spacy.load('en_core_web_sm')

In [27]:
#removing stopword from a single sentence
def rem_stopwords(sentence):
    item = stop_words(sentence)
    return ''.join([token.text for token in item if not token.is_stop and not token.is_punct])

In [28]:
# applying function to sentences in many lists
sentence_stopped_G = [[sentence for sentence in lst if rem_stopwords(sentence).strip()]for lst in sentences_G]

In [29]:
sentence_stopped_G

[[' of Gum'],
 [" A roll recting the several stages arranged for a royal progress. Many of them are extant in the herald's office."],
 [' To talk idly; to prate; to chatter.'],
 [' A kind of coarse cloth for packing goods.'],
 [' Alt. of Gaberdine'],
 [' See Gabardine.'],
 [' One addicted to idle talk.'],
 [' Inarticulate sounds rapidly uttered; as of fowls.'],
 [' One who gabbles; a prater.'],
 [' A name originally given by the Italians to a kind of serpentine, later to the rock called euphotide, and now generally used for a coarsely crystalline, igneous rock consisting of lamellar pyroxene ',
  ' and labradorite, with sometimes chrysolite '],
 [' A rent, service, tribute, custom, tax, impost, or duty; an excise.'],
 [' A collector of gabels or taxes.'],
 [' A tax, especially on salt.'],
 [' A gabeler.'],
 [' A beggar with a wallet; a licensed beggar.'],
 [' A lighter, or vessel for inland navigation.'],
 [' An openwork frame, as of poles, filled with stones and sunk, to assist in for

In [30]:
# looping through to find empty strings/list and drop them
for list_empty in sentence_stopped_G:
    for null in list_empty:
        if not null.strip():
            list_empty.remove(null)

### Embedding the value pairs

In [31]:
class MySentenceTransformer:
    def __init__(self, model_name:'str'):
        self.model = SentenceTransformer(model_name)
        
    def encode(self, sentences):
        embeddings = self.model.encode(sentences)
        return embeddings

In [32]:
# Instantiate a model 
model = MySentenceTransformer('all-MiniLM-L6-v2')

#### Creating Dictionaries for hash ID, index file, lemma and putting all together with embeddings for letter G

In [33]:
encoded_list = []
for sentence in sentence_stopped_G:
    embedding = model.encode(sentence)
    encoded_list.extend(embedding)

In [34]:
# convert encoded embedding to numpy array
array_embedding = np.array(encoded_list)

In [35]:
# checking for the dimension of the embedding
def get_embedding_dimension(embeddings):
    if len(embeddings) == 0:
        raise ValueError('empty list embeddings')
        
    first_embedding_dim = len(embeddings[0][0])
    
    for emb in embeddings:
        for vector in emb:
            if len(vector) != first_embedding_dim:
                raise ValueError('dimensions are not consistent')
    return first_embedding_dim

#### hash ID and lemma

In [36]:
# creating a new dictionary to store hash IDs and keys
hash_dict = {}
# iterate over keys in letter A dictionary
for key in Letter_G_dict.keys():
    #get hash IDs using SHA_256
    key_hash = abs(hash(key)) % (10 ** 8)
    
    # add hash ID and key to new dictionary
    hash_dict[key_hash] = key

In [37]:
# viewing the new hash and lemma dictionary
hash_dict

{71248863: 'gum',
 15221651: 'g',
 18416437: 'Gab',
 29488653: 'gabarage',
 65413202: 'gabardine',
 72767082: 'gaberdine',
 2879559: 'Gabber',
 27896062: 'gabble',
 69338340: 'Gabbier',
 23153238: 'gabbro',
 11233324: 'Gabel',
 41386087: 'Gabeler',
 63387628: 'Gabelle',
 14999018: 'Gabelleman',
 17648346: 'Gaber',
 27382330: 'Gabert',
 4660769: 'gabion',
 4433967: 'Gabionade',
 48532799: 'gabionage',
 9024407: 'gabione',
 50218592: 'gabionnade',
 53869344: 'gable',
 62802798: 'Gablet',
 35489277: 'gablock',
 69391805: 'Gaby',
 57726216: 'Gad',
 22644767: 'gad',
 3589387: 'gadabout',
 16356154: 'Gadbee',
 81135340: 'gadder',
 25923424: 'gaddingly',
 15197908: 'gaddish',
 89715604: 'Gade',
 70920982: 'Gadere',
 56484583: 'Gadre',
 73538135: 'gadfly',
 20000635: 'gadhelic',
 62747436: 'gadic',
 71594507: 'gaditanian',
 85957053: 'gadle',
 98058721: 'Gadman',
 23219823: 'Gadoid',
 78983884: 'Gadolinia',
 39351940: 'Gadolinic',
 82859148: 'Gadolinite',
 54992355: 'gadolinium',
 89014334: 'G

#### lemma and embeddings

In [38]:
lemma_embed = {}
for key, value in zip(Letter_G_dict.keys(), encoded_list):
    lemma_embed[key] = value

In [39]:
# viewing the new lemma and embedding dictionary
lemma_embed

{'gum': array([-8.72751921e-02, -4.88030771e-03, -1.16230072e-02,  2.96924151e-02,
        -6.58879280e-02, -5.02167642e-02,  1.37691468e-01,  1.72115024e-02,
         5.38313165e-02,  2.92063300e-02, -2.14753244e-02,  2.84401141e-03,
        -6.76430855e-03,  3.19210961e-02,  1.69527121e-02,  1.77104899e-03,
        -2.32775621e-02,  3.65620814e-02, -6.89980611e-02,  3.62796709e-02,
         5.61528243e-02,  7.44818300e-02,  3.18790302e-02,  3.34497243e-02,
        -1.27110081e-02,  3.65726091e-02, -3.87230329e-02, -7.08698258e-02,
         1.01689696e-02,  1.23084793e-02,  8.13624561e-02,  3.11525930e-02,
         1.26445144e-01, -5.03494330e-02,  2.61264667e-03, -6.79598004e-02,
        -8.10741633e-03,  1.52953966e-02,  7.58641288e-02,  4.35712300e-02,
        -4.41054888e-02, -5.91116212e-03,  2.84701735e-02,  1.91031136e-02,
         4.00610156e-02, -4.41994630e-02, -1.17345313e-02,  5.02010994e-02,
         4.25400473e-02,  1.29827894e-02, -8.17749370e-03, -4.56204042e-02,
     

#### hash ID and embedding index with faiss

In [40]:
# getting the hash IDs in form of a list from the key of the dictionary above
ID_list = [key for key in hash_dict.keys()]

In [41]:
# convert hash ID to type int
ID_int_array = np.array(ID_list).astype(int)

In [42]:
d = embedding.shape[1]

In [43]:
# creating an index file for the hash IDs and the embeddings
index = faiss.IndexFlatL2(d)
indexidmap = faiss.IndexIDMap2(index)

In [44]:
index.is_trained

True

In [45]:
indexidmap.add_with_ids(array_embedding[:3], ID_int_array[:3])