In [1]:
import operator 

# 1. Preprocess function
def preprocess(sentence):
    """Normalize a sentence into a set of unique lowercase word tokens.

    The text is lowercased and split on runs of whitespace, so consecutive
    spaces, tabs, and leading/trailing whitespace (including the trailing
    newline of a line read from a file) never produce empty-string tokens.

    Args:
        sentence: Raw text of a single sentence or query.

    Returns:
        A set of lowercase tokens. The set is empty when the input contains
        no non-whitespace characters.
    """
    # split() with no separator collapses whitespace runs; split(" ") would
    # emit "" for every extra space, and that "" would then count as a word
    # shared between otherwise unrelated sentences.
    return set(sentence.lower().split())

# 2. Indexing function
def indexing(file_name):
    """Read a UTF-8 text file and tokenize each of its lines.

    Args:
        file_name: Path of the text file to index, one sentence per line.

    Returns:
        A list with one entry per line of the file, in file order; each
        entry is the result of ``preprocess`` applied to that line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the file even if preprocessing raises;
    # iterating the handle yields the same lines as readlines().
    with open(file_name, "r", encoding="utf8") as corpus:
        return [preprocess(line) for line in corpus]

# 3. Calculate similarity function
def calc_similarity(preprocessed_query, preprocessed_sentences):
    """Score every sentence against the query by token-set overlap.

    The score is |query AND sentence| / |query OR sentence| (the Jaccard
    index), which ranges from 0.0 (no shared token) to 1.0 (identical sets).

    Args:
        preprocessed_query: Iterable of query tokens.
        preprocessed_sentences: Iterable of token sets, one per sentence.

    Returns:
        A dict mapping each sentence's position in ``preprocessed_sentences``
        to its similarity score as a float.
    """
    query_token_set = set(preprocessed_query)
    score_dict = {}

    for i, sentence_tokens in enumerate(preprocessed_sentences):
        all_tokens = query_token_set | sentence_tokens
        same_tokens = query_token_set & sentence_tokens
        # An empty union means both the query and the sentence have no
        # tokens; score that pair as 0.0 instead of dividing by zero.
        score_dict[i] = len(same_tokens) / len(all_tokens) if all_tokens else 0.0

    return score_dict

# 1. Indexing: tokenize every line of the corpus file.
file_name = "jhe-koen-dev.en"
file_tokens_pairs = indexing(file_name)

# 2. Read the query from the user and tokenize it the same way as the corpus.
query = input("영어 쿼리를 입력하세요 : ")
preprocessed_query = preprocess(query)

# 3. Score every indexed sentence against the query.
score_dict = calc_similarity(preprocessed_query, file_tokens_pairs)

# 4. Order (index, score) pairs from most to least similar. sorted() is
#    stable, so sentences with equal scores stay in file order.
sorted_score_list = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

# 5. Print the result. An empty list means the corpus had no lines at all;
#    checking for it first avoids an IndexError on sorted_score_list[0].
if not sorted_score_list or sorted_score_list[0][1] == 0.0:
    print("There is no similar sentence.")
else:
    print("rank", "Index", "score", "sentence", sep="\t")  # Header of the result table
    # Show the ten best matches. The sentence column is rebuilt from the
    # stored token set, so it is lowercase and not in the original word order.
    for rank, (i, score) in enumerate(sorted_score_list[:10], start=1):
        print(rank, i, score, ' '.join(file_tokens_pairs[i]), sep="\t")


There is no similar sentence.
