In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import re
from collections import Counter
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import pandas as pd

In [3]:
df = pd.read_csv("/content/papers.csv")

df

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."
...,...,...,...,...,...,...,...
7236,994,1994,Single Transistor Learning Synapses,,994-single-transistor-learning-synapses.pdf,Abstract Missing,Single Transistor Learning Synapses\n\nPaul Ha...
7237,996,1994,"Bias, Variance and the Combination of Least Sq...",,996-bias-variance-and-the-combination-of-least...,Abstract Missing,"Bias, Variance and the Combination of\nLeast S..."
7238,997,1994,A Real Time Clustering CMOS Neural Engine,,997-a-real-time-clustering-cmos-neural-engine.pdf,Abstract Missing,A Real Time Clustering CMOS\nNeural Engine\nT....
7239,998,1994,Learning direction in global motion: two class...,,998-learning-direction-in-global-motion-two-cl...,Abstract Missing,Learning direction in global motion: two\nclas...


In [4]:
nltk.download('stopwords')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
stop_words = set(stopwords.words('english'))
##Creating a list of custom stopwords
new_words = ["fig","figure","image","sample","using",
             "show", "result", "large",
             "also", "one", "two", "three",
             "four", "five", "seven","eight","nine"]
stop_words = list(stop_words.union(new_words))

In [6]:

def pre_process(text):

    # lowercase
    text=text.lower()

    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)

    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)

    ##Convert to list from string
    text = text.split()

    # remove stopwords
    text = [word for word in text if word not in stop_words]

    # remove words less than three letters
    text = [word for word in text if len(word) >= 3]

    # lemmatize
    lmtzr = WordNetLemmatizer()
    text = [lmtzr.lemmatize(word) for word in text]

    return ' '.join(text)

In [7]:
docs = df['paper_text'].iloc[:3000].apply(lambda x:pre_process(x))


In [8]:
docs.shape


(3000,)

In [9]:
sentences = docs.tolist()
len(sentences)

3000

In [10]:
sentences[:2]


['self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding 

In [11]:
text_data = ' '.join(sentences)
text_data[:1000]

'self organization associative database application hisashi suzuki suguru arimoto osaka university toyonaka osaka japan abstract efficient method self organizing associative database proposed together application robot eyesight system proposed database associate input output first half part discussion algorithm self organization proposed aspect hardware produce new style neural network latter half part applicability handwritten letter recognition autonomous mobile robot system demonstrated introduction let mapping given finite infinite set another finite infinite set learning machine observes set pair sampled randomly mean cartesian product computes estimate make small estimation error measure usually say faster decrease estimation error increase number sample better learning machine however expression performance incomplete since lack consideration candidate assumed preliminarily find good learning machine clarify conception let discus type learning machine let advance understanding s

In [12]:
def misc(file_name):
    words = []
    file_name = process_tweet(file_name)
    words = re.findall(r'\w+', file_name)
    return words

def process_tweet(tweet):
      tweet = re.sub(r'\$\w*', '', tweet)
      tweet = re.sub(r'^RT[\s]+', '', tweet)
      tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
      tweet = re.sub(r'#', '', tweet)

      return tweet

In [13]:
words = misc(text_data)
vocab = set(words)
print(f"There are {len(vocab)} unique words in the vocabulary.")

There are 90733 unique words in the vocabulary.


In [39]:
words = re.findall(r'\w+', text_data)
print(len(words))
vocab = set(words)
print(len(vocab))

5482203
90733


In [42]:
def get_count(word_l):
    """
    Input:
        word_l: a set of words representing the corpus.
    Output:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    """
    word_count_dict = {}
    word_count_dict = Counter(word_l)

    return word_count_dict

In [43]:
word_count_dict = get_count(words)


In [44]:
word_count_dict['infinite']


1484

In [45]:
def get_probs(word_count_dict):
    """
    Input:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    Output:
        probs: A dictionary where keys are the words and the values are the probability that a word will occur.
    """
    probs = {}  # return this variable correctly
    m = sum(word_count_dict.values())
    for key in word_count_dict.keys():
        probs[key] = word_count_dict.get(key, 0) / m

    return probs

In [46]:
probs = get_probs(word_count_dict)


# Part 2: String Manipulation


In [20]:
# delete_letter()
def delete_letter(word, verbose=False):
    delete_l = []
    split_l = []

    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    delete_l = [L + R[1:] for L, R in split_l if R]

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")# printing implicitly.

    return delete_l

In [21]:
# checking the function
print(delete_letter(word="cans", verbose=True))

input word cans, 
split_l = [('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's')], 
delete_l = ['ans', 'cns', 'cas', 'can']
['ans', 'cns', 'cas', 'can']


In [22]:
# switch_letter()
def switch_letter(word, verbose=False):
    def swap(c, i, j):
        c = list(c)
        c[i], c[j] = c[j], c[i]
        return ''.join(c)

    switch_l = []
    split_l = []
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    switch_l = [a + b[1] + b[0] + b[2:] for a, b in split_l if len(b) >= 2]

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}")

    return switch_l

In [23]:
print(switch_letter(word="eta", verbose=True))


Input word = eta 
split_l = [('', 'eta'), ('e', 'ta'), ('et', 'a')] 
switch_l = ['tea', 'eat']
['tea', 'eat']


In [24]:
# replace_letter()
def replace_letter(word, verbose=False):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    replace_l = []
    split_l = []

    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    replace_l = [a + l + (b[1:] if len(b) > 1 else '') for a, b in split_l if b for l in letters]
    replace_set = set(replace_l)
    replace_set.remove(word)
    # turn the set back into a list and sort it, for easier viewing
    replace_l = sorted(list(replace_set))

    if verbose:
        print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")

    return replace_l

In [25]:
print(replace_letter(word='can', verbose=True))


Input word = can 
split_l = [('', 'can'), ('c', 'an'), ('ca', 'n')] 
replace_l ['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'han', 'ian', 'jan', 'kan', 'lan', 'man', 'nan', 'oan', 'pan', 'qan', 'ran', 'san', 'tan', 'uan', 'van', 'wan', 'xan', 'yan', 'zan']
['aan', 'ban', 'caa', 'cab', 'cac', 'cad', 'cae', 'caf', 'cag', 'cah', 'cai', 'caj', 'cak', 'cal', 'cam', 'cao', 'cap', 'caq', 'car', 'cas', 'cat', 'cau', 'cav', 'caw', 'cax', 'cay', 'caz', 'cbn', 'ccn', 'cdn', 'cen', 'cfn', 'cgn', 'chn', 'cin', 'cjn', 'ckn', 'cln', 'cmn', 'cnn', 'con', 'cpn', 'cqn', 'crn', 'csn', 'ctn', 'cun', 'cvn', 'cwn', 'cxn', 'cyn', 'czn', 'dan', 'ean', 'fan', 'gan', 'h

In [26]:
#  insert_letter()
def insert_letter(word, verbose=False):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    insert_l = []
    split_l = []
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    insert_l = [ a + l + b for a, b in split_l for l in letters]

    if verbose:
        print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")

    return insert_l

In [27]:
print(insert_letter(word='at', verbose=False))


['aat', 'bat', 'cat', 'dat', 'eat', 'fat', 'gat', 'hat', 'iat', 'jat', 'kat', 'lat', 'mat', 'nat', 'oat', 'pat', 'qat', 'rat', 'sat', 'tat', 'uat', 'vat', 'wat', 'xat', 'yat', 'zat', 'aat', 'abt', 'act', 'adt', 'aet', 'aft', 'agt', 'aht', 'ait', 'ajt', 'akt', 'alt', 'amt', 'ant', 'aot', 'apt', 'aqt', 'art', 'ast', 'att', 'aut', 'avt', 'awt', 'axt', 'ayt', 'azt']


In [28]:
# Combining the edits:
# Now that you have implemented the string manipulations, you will create two functions that,
#  given a string, will return all the possible single and double edits on that string. These will
#  be edit_one_letter() and edit_two_letters().

In [29]:
#  Edit one letter
def edit_one_letter(word, allow_switches=True):

    edit_one_set = set()
    edit_one_set.update(delete_letter(word))
    if allow_switches:
        edit_one_set.update(switch_letter(word))
    edit_one_set.update(replace_letter(word))
    edit_one_set.update(insert_letter(word))

    return edit_one_set

In [30]:
# Edit two letters
def edit_two_letters(word, allow_switches=True):

    edit_two_set = set()
    edit_one = edit_one_letter(word, allow_switches=allow_switches)
    for w in edit_one:
        if w:
            edit_two = edit_one_letter(w, allow_switches=allow_switches)
            edit_two_set.update(edit_two)

    return edit_two_set


In [31]:
# proposed
edit_two_letters('propose')

{'poopfose',
 'yroposu',
 'gpropdose',
 'proposlb',
 'kpronpose',
 'prpotse',
 'prohtpose',
 'progpoyse',
 'bpropoye',
 'cpropcose',
 'proposra',
 'prtofpose',
 'dopropose',
 'propses',
 'pryzpose',
 'prmoposhe',
 'bprwopose',
 'proljpose',
 'proyposve',
 'propdouse',
 'drorose',
 'pxropofe',
 'rroxpose',
 'pnopeose',
 'pyopoase',
 'pjroposoe',
 'lropxse',
 'puroposz',
 'protjse',
 'hwropose',
 'probosd',
 'proposbl',
 'plropcse',
 'prohosye',
 'prrdopose',
 'pjopqose',
 'vrhopose',
 'propxosf',
 'prlozose',
 'vrozpose',
 'propolhe',
 'pronobe',
 'pqxopose',
 'propeosve',
 'ropsse',
 'uproposv',
 'aropise',
 'prvposze',
 'przpyse',
 'kpropaose',
 'upropos',
 'prjopost',
 'provposf',
 'prjcpose',
 'pzroxose',
 'yproplse',
 'prpnose',
 'proopse',
 'propoeqs',
 'prsoprose',
 'pkropode',
 'oropopse',
 'mroxose',
 'proypofe',
 'propmyse',
 'porpzse',
 'vqpropose',
 'oropqose',
 'prjopoke',
 'propousv',
 'pgopohse',
 'ssropose',
 'pvroptse',
 'pnropkose',
 'prqoupose',
 'prxyose',
 'wropoke'

In [32]:
# suggest spelling suggestions
def get_corrections(word, probs, vocab, verbose=False):
    """
    Input:
        word: a user entered string to check for suggestions
        probs: a dictionary that maps each word to its probability in the corpus
        vocab: a set containing all the vocabulary
        n: number of possible word corrections you want returned in the dictionary
    Output:
        n_best: a list of tuples with the most probable n corrected words and their probabilities.
    """

    suggestions = []
    n_best = []
    #suggestions = list((word in vocab) or edit_one_letter(word).intersection(vocab) or
    #                   edit_two_letters(word).intersection(vocab))
    suggestions = list(edit_two_letters(word).intersection(vocab))
    # suggestions = list(edit_two_letters(word, False).intersection(vocab))
    n_best = [[s, probs.get(s, -1)] for s in list(reversed(suggestions))]

    if verbose:
        print("suggestions = ", suggestions)

    return n_best

In [33]:
# Testing
my_word = 'propose'
tmp_corrections = get_corrections(my_word, probs, vocab, verbose=False)
for i, word_prob in enumerate(tmp_corrections):
    print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")

word 0: oppose, probability 0.000000
word 1: proposal, probability 0.000129
word 2: proposed, probability 0.000829
word 3: propoesd, probability 0.000000
word 4: provost, probability 0.000004
word 5: proove, probability 0.000000
word 6: proposer, probability 0.000000
word 7: protos, probability 0.000000
word 8: proposes, probability 0.000030
word 9: purpose, probability 0.000222
word 10: prope, probability 0.000000
word 11: repose, probability 0.000000
word 12: promote, probability 0.000005
word 13: porpoise, probability 0.000000
word 14: proposi, probability 0.000000
word 15: apropos, probability 0.000000
word 16: propo, probability 0.000000
word 17: prose, probability 0.000001
word 18: rpose, probability 0.000000
word 19: propose, probability 0.000376
word 20: promise, probability 0.000021
word 21: prognose, probability 0.000000
word 22: roose, probability 0.000001
word 23: propoi, probability 0.000000
word 24: provoke, probability 0.000001
word 25: propoj, probability 0.000000
word 

In [34]:
# Testing
my_word = 'proose'
tmp_corrections = get_corrections(my_word, probs, vocab, verbose=False)
for i, word_prob in enumerate(tmp_corrections):
    print(f"word {i}: {word_prob[0]}, probability {word_prob[1]:.6f}")

word 0: proposi, probability 0.000000
word 1: procs, probability 0.000001
word 2: proce, probability 0.000001
word 3: prosop, probability 0.000000
word 4: moose, probability 0.000000
word 5: provost, probability 0.000004
word 6: prone, probability 0.000019
word 7: proove, probability 0.000000
word 8: prous, probability 0.000000
word 9: protos, probability 0.000000
word 10: rhose, probability 0.000000
word 11: grose, probability 0.000000
word 12: rose, probability 0.000016
word 13: prosc, probability 0.000000
word 14: prese, probability 0.000000
word 15: purpose, probability 0.000222
word 16: prope, probability 0.000000
word 17: promote, probability 0.000005
word 18: arose, probability 0.000004
word 19: posse, probability 0.000002
word 20: roese, probability 0.000000
word 21: prove, probability 0.000198
word 22: probe, probability 0.000050
word 23: praise, probability 0.000001
word 24: proces, probability 0.000001
word 25: prool, probability 0.000001
word 26: prose, probability 0.000001

# Gradio Interface

In [35]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [49]:
import gradio as gr

# Define helper functions for generating edits
def edit_one_letter(word):
    """Generate all edits that are one edit away from the given word."""
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edit_two_letters(word):
    """Generate all edits that are two edits away from the given word."""
    return set(e2 for e1 in edit_one_letter(word) for e2 in edit_one_letter(e1))

# Define the spelling correction function
def get_corrections(word, probs, vocab, verbose=False):
    """
    Get spelling suggestions for the input word.
    """
    # Generate suggestions from two edits
    suggestions = list(edit_two_letters(word).intersection(vocab))

    # Create a list of suggestions with probabilities
    n_best = [[s, probs.get(s, -1)] for s in suggestions]

    # Sort by probability in descending order
    n_best = sorted(n_best, key=lambda x: x[1], reverse=True)

    # Return the top 10 suggestions
    return n_best[:10]

# Sample data for testing
# Replace these with your actual vocab and probs
vocab = vocab
probs = probs

# Gradio interface
def spelling_corrections(word):
    corrections = get_corrections(word, probs, vocab)
    return corrections

# Create Gradio interface
interface = gr.Interface(
    fn=spelling_corrections,
    inputs=gr.Textbox(label="Enter a misspelled word"),
    outputs=gr.Dataframe(headers=["Word", "Probability"], label="Suggestions"),
    title="Spelling Correction",
    description="Enter a misspelled word to get spelling suggestions and their probabilities."
)

# Launch the interface
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e0a841727cb481826c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [50]:
probs

{'self': 0.00029331274307062326,
 'organization': 0.00012914516299378188,
 'associative': 0.00012403772716916903,
 'database': 0.00034000929918136924,
 'application': 0.0010251353333687206,
 'hisashi': 1.0944505338456092e-06,
 'suzuki': 4.012985290767233e-06,
 'suguru': 1.8240842230760152e-07,
 'arimoto': 1.4592673784608122e-06,
 'osaka': 1.8240842230760152e-06,
 'university': 0.0009498006549556812,
 'toyonaka': 1.8240842230760152e-07,
 'japan': 5.235121720228164e-05,
 'abstract': 0.0006552110529289047,
 'efficient': 0.0005785995155597121,
 'method': 0.003952790511405725,
 'organizing': 6.256608885150733e-05,
 'proposed': 0.0008294110962326641,
 'together': 0.00029240070095908527,
 'robot': 0.0003438398760498289,
 'eyesight': 5.472252669228046e-07,
 'system': 0.0030584420168315546,
 'associate': 6.293090569612252e-05,
 'input': 0.0034582812785298174,
 'output': 0.00201232971489746,
 'first': 0.002491151823454914,
 'half': 0.00020119648980528447,
 'part': 0.0008689937238734137,
 'discus