# 8. Correct spellings in the query using edit distance

In [1]:
import pandas as pd
import os

In [19]:
class PositionalIndex:
    """Positional inverted index over a directory of plain-text documents.

    Attributes (all populated by ``buildIndex``):
        index: dict mapping term -> {docId: [term frequency, [token positions]]}.
        terms: sorted list of the distinct terms found in the indexed documents.
        docs:  lower-cased text of every indexed document; the list position
               is the docId used inside ``index``.
    """

    def __init__(self):
        self.index = {}
        self.terms = []
        self.docs = []

    def buildIndex(self, dataDir='Cranfield Data Set', maxDocs=20):
        """(Re)build the index from the files found in ``dataDir``.

        Args:
            dataDir: directory containing one document per file.
            maxDocs: maximum number of files to index; ``None`` indexes all
                of them. Fewer files than ``maxDocs`` is not an error.

        Tokens are produced by lower-casing the text and splitting on
        whitespace; a token's position is its offset in that token list.
        """
        # Start from a clean slate so that calling buildIndex twice does not
        # double-count postings or append duplicate documents.
        self.index = {}
        self.docs = []

        # os.listdir returns entries in arbitrary order; sorting keeps docIds
        # stable between runs. Sub-directories (e.g. .ipynb_checkpoints) are
        # skipped because they cannot be opened as documents.
        files = sorted(
            name for name in os.listdir(dataDir)
            if os.path.isfile(os.path.join(dataDir, name))
        )
        if maxDocs is not None:
            files = files[:maxDocs]

        for docId, name in enumerate(files):
            with open(os.path.join(dataDir, name), "r") as f:
                text = f.read().lower()
            self.docs.append(text)
            for position, term in enumerate(text.split()):
                # posting layout: [term frequency, [positions]]
                posting = self.index.setdefault(term, {}).setdefault(docId, [0, []])
                posting[0] += 1
                posting[1].append(position)

        # Every token seen is a key of the index, so the vocabulary can be
        # derived once here instead of being re-sorted after each document.
        self.terms = sorted(self.index)

In [23]:
class Query:
    """Phrase query against a positional index, with interactive spell-checking.

    Args:
        query: the raw query string typed by the user.
        index: optional, already-built index object exposing ``index``,
            ``terms`` and ``docs``. When omitted, a ``PositionalIndex`` is
            created and built from its default location.
    """

    def __init__(self, query, index=None):
        self.query = query
        if index is None:
            index = PositionalIndex()
            index.buildIndex()
        self.pos = index

    @staticmethod
    def _editDistanceMatrix(word, word2):
        """Return the full edit-distance table between ``word`` and ``word2``.

        Insertions, deletions and substitutions all cost 1. Cell ``[i][j]``
        holds the distance between ``word[:i]`` and ``word2[:j]``, so the
        bottom-right cell is the distance between the whole words.
        """
        rows, cols = len(word) + 1, len(word2) + 1
        mat = [[0] * cols for _ in range(rows)]
        for i in range(rows):
            mat[i][0] = i
        for j in range(cols):
            mat[0][j] = j
        for i in range(1, rows):
            for j in range(1, cols):
                substitution = mat[i-1][j-1] + (0 if word[i-1] == word2[j-1] else 1)
                mat[i][j] = min(substitution, mat[i-1][j] + 1, mat[i][j-1] + 1)
        return mat

    def spellingCorrection(self, word):
        """Interactively suggest the vocabulary terms closest to ``word``.

        Every term at the minimum edit distance is offered in vocabulary
        order, together with its distance table; the first one the user
        confirms with ``y`` is returned. If all suggestions are rejected (or
        the vocabulary is empty) the original ``word`` is returned unchanged.
        """
        minDist = None
        candidates = []  # (term, distance table) pairs sharing the minimum distance
        for word2 in self.pos.terms:
            mat = self._editDistanceMatrix(word, word2)
            dist = mat[-1][-1]
            if minDist is None or dist < minDist:
                minDist = dist
                candidates = [(word2, mat)]
            elif dist == minDist:
                candidates.append((word2, mat))
        for term, mat in candidates:
            df = pd.DataFrame(mat, index=[''] + list(word), columns=[''] + list(term))
            print(df)
            prompt = input(f"You searched for {word}, did you mean: {term}? (Y/N): ")
            if prompt.strip().lower() == 'y':
                return term
        print("Cannot find term!")
        # Keep the user's word rather than substituting an unrelated term.
        return word

    def preProcessQuery(self):
        """Normalise ``self.query`` and spell-correct terms missing from the index.

        The query is lower-cased because the index stores lower-cased tokens.
        """
        corrected = []
        for term in self.query.lower().split():
            if term not in self.pos.index:
                term = self.spellingCorrection(term)
            corrected.append(term)
        self.query = ' '.join(corrected)

    def _containsPhrase(self, docId, terms):
        """Return True if ``terms`` occur consecutively, in order, in ``docId``."""
        positionSets = []
        for term in terms:
            posting = self.pos.index[term].get(docId)
            if posting is None:
                return False
            positionSets.append(set(posting[1]))
        # The phrase matches when some start position has term k at start+k
        # for every k; all terms must line up on the same start position.
        return any(
            all(start + offset in positions
                for offset, positions in enumerate(positionSets))
            for start in positionSets[0]
        )

    def processPositional(self):
        """Run the phrase query and print every matching document.

        Returns:
            The list of matching docIds; empty when the query is blank or
            contains a term that is not in the index after spell-correction.
        """
        if not self.query.strip():
            return []
        self.preProcessQuery()
        queryPos = self.query.split()
        # A rejected correction leaves an unknown term in the query; such a
        # phrase cannot match any document.
        if any(term not in self.pos.index for term in queryPos):
            return []
        ans = [
            docId for docId in self.pos.index[queryPos[0]]
            if self._containsPhrase(docId, queryPos)
        ]
        print("Documents retrieved:")
        for i in ans:
            print(self.pos.docs[i])
            print()
        return ans

In [25]:
# Read the phrase query from the user (blocks until a line is entered).
query = input("Enter the query: ")
# Constructing a Query also builds the positional index from the data set on disk.
q = Query(query)
# Offers spelling corrections for terms missing from the index, then prints
# the documents that match the query.
q.processPositional()

Enter the query:  fluer of a buckled plata


      f  l  u  i  d
   0  1  2  3  4  5
f  1  0  1  2  3  4
l  2  1  0  1  2  3
u  3  2  1  0  1  2
e  4  3  2  1  1  2
r  5  4  3  2  2  2


You searched for fluer, did you mean: fluid? (Y/N):  N


      f  l  u  t  t  e  r
   0  1  2  3  4  5  6  7
f  1  0  1  2  3  4  5  6
l  2  1  0  1  2  3  4  5
u  3  2  1  0  1  2  3  4
e  4  3  2  1  1  2  2  3
r  5  4  3  2  2  2  3  2


You searched for fluer, did you mean: flutter? (Y/N):  Y


      p  l  a  t  e
   0  1  2  3  4  5
p  1  0  1  2  3  4
l  2  1  0  1  2  3
a  3  2  1  0  1  2
t  4  3  2  1  0  1
a  5  4  3  2  1  1


You searched for plata, did you mean: plate? (Y/N):  Y


Documents retrieved:
theory and experiments of the flutter of a buckled plate are discussed   it is shown that an increase in the initial deviation from flatness or a static pressure differential across the plate raises the critical value of the reduced velocity    the applicability of the galerkin method to the linearized problem of flutter of an unbuckled plate has been questioned by several authors   in this paper the flutter condition was formulated in the form of an integral equation and solved numerically by the method of iteration and the method of matrix approximations thus avoiding the constraint of assumed modes   for a plate (with finite bending rigidity) the results confirm those given by the galerkin method    an approximate analysis of the limiting form and amplitude of the flutter motion for a buckled plate is presented 

