# In [6]:
import tkinter as tk
from tkinter import ttk
import os
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
import math



# Create the main application window.
# NOTE: everything in this section runs at module top level, so the window
# and its widgets are created as soon as the file is executed.
root = tk.Tk()
root.title("Simple Search App")

# Create and place widgets: a prompt label followed by the query text box.
query_label = tk.Label(root, text="Enter your query:")
query_label.pack()

# Entry whose contents are read by show_table() when a search is triggered.
query_entry = tk.Entry(root)
query_entry.pack()

# Empty label placed between the entry and the results table; nothing in
# this file updates its text.
result_label = tk.Label(root, text="")
result_label.pack()

# Create Treeview widget used as a two-column results table (headings only,
# no tree column). query_processing() inserts one row per matching document.
tree = ttk.Treeview(root, columns=('Documents', 'Rank Value'), show='headings')
tree.heading('Documents', text='Documents')
tree.heading('Rank Value', text='Rank Value')
tree.pack()


# Initialize inverted index (not populated anywhere in this file).
inverted_index = {}

# Document ids that make up the corpus; each id is used (as a string) to look
# up that document's column in the TF-IDF table.
docs = [1, 2, 3, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26]

# Corpus size used in the idf formula log(N / df). Derived from `docs` so the
# two can never drift apart when documents are added or removed.
N = len(docs)

# Initialize stop words, single alpha characters, and target characters.
# Stop words are dropped before stemming.
stop_words = {'a', 'is', 'the', 'of', 'all', 'and', 'to', 'can', 'be', 'as',
              'once', 'for', 'at', 'am', 'are', 'has', 'have', 'had', 'up', 'his',
              'her', 'in', 'on', 'no', 'we', 'do'}

# Single lowercase letters, also dropped before stemming.
single_alpha = set('abcdefghijklmnopqrstuvwxyz')

# Punctuation marks and digits that preprocess_text() replaces with a space.
target_chars = ['%', '$', '*', "'", '’', '¨', '=', '+', '`', '/', '.', '·', ',',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '|', ':', '(', ')', '>', ';', '&', '“', '”', '[', ']', '@', '?', '}', '{']

# Function to preprocess text
def preprocess_text(text):
    """Clean raw text so it can be tokenized.

    Every character listed in the module-level ``target_chars`` and every
    newline is replaced by a single space. Hyphens and underscores are
    removed outright, so the two halves of a hyphenated word are joined.

    Args:
        text: Raw document or query string.

    Returns:
        The cleaned string; all other characters are left untouched.
    """
    # Build the mapping once per call and apply it in a single pass. The
    # previous implementation ran a full str.replace() over the text for
    # every matching character, which is quadratic on long documents.
    translation = {ord('-'): None, ord('_'): None, ord('\n'): ' '}
    # target_chars is applied last so it keeps precedence over the rules
    # above, exactly as in the original if/elif chain. Entries that are not
    # a single character could never match one character and are skipped.
    translation.update(
        {ord(char): ' ' for char in target_chars if len(char) == 1}
    )
    return text.translate(translation)

# Function to tokenize and stem text
def tokenize_and_stem(text):
    """Tokenize *text*, drop noise tokens and stem what remains.

    Tokens are produced by NLTK's ``word_tokenize``. A token is discarded
    when its lowercase form is in the module-level ``stop_words`` or is a
    single letter from ``single_alpha``; the survivors are stemmed with a
    ``PorterStemmer``.

    Args:
        text: Preprocessed text (see ``preprocess_text``).

    Returns:
        List of stemmed tokens in their original order, duplicates kept.
    """
    porter_stemmer = PorterStemmer()
    stemmed_tokens = []
    for token in word_tokenize(text):
        # Compare case-insensitively for BOTH filters. Previously only the
        # stop-word test was lowercased, so an uppercase single letter such
        # as "B" slipped through and ended up in the index.
        lowered = token.lower()
        if lowered in stop_words or lowered in single_alpha:
            continue
        stemmed_tokens.append(porter_stemmer.stem(token))
    return stemmed_tokens

# Function to calculate term frequency
def calculate_term_frequency(tokens):
    """Return a plain dict mapping each token to its number of occurrences.

    Keys appear in the order in which the tokens were first seen.
    """
    occurrences = Counter(tokens)
    return {term: count for term, count in occurrences.items()}


def cal_tf_idf_and_making_DataFrame():
    """Build the TF-IDF table for the corpus and write it to ``output.csv``.

    Each file in the corpus directory is read, cleaned with
    ``preprocess_text``, tokenized and stemmed with ``tokenize_and_stem`` and
    counted with ``calculate_term_frequency``. The resulting table has one
    row per distinct term and these columns, in order:

    * ``words`` - the stemmed term;
    * one raw term-frequency column per document, named after the file name
      up to its first dot;
    * ``df`` - number of documents containing the term;
    * ``idf`` - ``log(N / df)``;
    * ``tf_idf<doc>`` and ``Normalized tf_idf<doc>`` for every id in the
      module-level ``docs`` list.

    The table is printed and saved as CSV; nothing is returned.

    Raises:
        ValueError: If a file name does not start with an integer followed
            by a four-character suffix such as ``.txt``.
        KeyError: If a document id listed in ``docs`` has no matching file.
    """
    directory = r'C:\Users\PC\Desktop\ResearchPapers\ResearchPapers'

    # Only regular files are documents; they are processed in numeric order
    # of the id that precedes the four-character extension.
    filenames = sorted(
        (name for name in os.listdir(directory)
         if os.path.isfile(os.path.join(directory, name))),
        key=lambda name: int(name[:-4]),
    )

    # Collect the counts in plain dicts first. Growing a DataFrame one row at
    # a time (via the private DataFrame._append) copies the whole frame per
    # word and needs a linear membership scan for every lookup.
    term_freq_by_doc = {}
    vocabulary = {}  # dict used as an insertion-ordered set of terms
    for filename in filenames:
        filepath = os.path.join(directory, filename)

        # Preprocess text
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
        print(f"Preprocessed : {filepath}")
        preprocessed_content = preprocess_text(content)

        # Tokenize and stem
        print(f"Tokenize and stem : {filepath}")
        stemmed_tokens = tokenize_and_stem(preprocessed_content)

        # Calculate term frequency
        print(f"Calculate term frequency : {filepath}")
        term_freq = calculate_term_frequency(stemmed_tokens)

        term_freq_by_doc.setdefault(filename.split('.')[0], {}).update(term_freq)
        for word in term_freq:
            vocabulary.setdefault(word, None)

    # Assemble the term-by-document matrix in one go; absent terms count 0.
    words = list(vocabulary)
    doc_columns = list(term_freq_by_doc)
    table = {'words': words}
    for doc_name in doc_columns:
        counts = term_freq_by_doc[doc_name]
        table[doc_name] = [counts.get(word, 0) for word in words]
    df = pd.DataFrame(table, columns=['words'] + doc_columns)

    # calculating df: count over the actual document columns instead of a
    # hard-coded positional slice.
    df['df'] = (df[doc_columns] != 0).sum(axis=1)
    # calculating idf
    df['idf'] = df['df'].apply(lambda x: math.log(N / x))
    # calculating tf-idf
    for doc in docs:
        df['tf_idf' + str(doc)] = df[str(doc)] * df['idf']
    # normalizing vectors to unit length
    for doc in docs:
        tf_idf = df['tf_idf' + str(doc)]
        norm = math.sqrt((tf_idf ** 2).sum())
        # A document whose weights are all zero has no direction; store a
        # zero vector rather than dividing by zero.
        df['Normalized tf_idf' + str(doc)] = tf_idf / norm if norm else 0.0
    print(df)
    df.to_csv(r'C:\Users\PC\Desktop\ResearchPapers\output.csv', index=False)




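# Uncomment and run once to (re)build output.csv, which query_processing()
# reads on every search:
# cal_tf_idf_and_making_DataFrame()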


def query_processing(query):
    """Rank the corpus against *query* and list the matches in ``tree``.

    The query goes through the same preprocessing, tokenizing/stemming and
    counting steps as the documents. Its term frequencies are weighted with
    the ``idf`` column of the saved TF-IDF table and normalized to unit
    length. Each document's score is the dot product of that vector with the
    document's ``Normalized tf_idf<doc>`` column (cosine similarity).
    Documents scoring at least 0.025 are inserted into the module-level
    ``tree`` widget, best match first.

    Args:
        query: Raw query string typed by the user.

    Returns:
        None. Matching rows are added to ``tree`` as a side effect.
    """
    path = r'C:\Users\PC\Desktop\ResearchPapers\output.csv'
    df = pd.read_csv(path)

    pp_query = preprocess_text(query)
    stem_query = tokenize_and_stem(pp_query)
    term_freq = calculate_term_frequency(stem_query)

    # Query term frequency aligned with the table's rows. Terms that are not
    # in the vocabulary have no idf and cannot match any document, so they
    # are ignored. The old code raised NameError for such terms (it used an
    # undefined `filename`) and KeyError when no 'query' column was created,
    # e.g. for an empty query.
    df['query'] = df['words'].map(dict(term_freq)).fillna(0)

    df = df.fillna(0)
    tf_idf_query = df['query'] * df['idf']

    # normalizing query
    query_norm = math.sqrt((tf_idf_query ** 2).sum())
    if query_norm == 0:
        # No query term carries any weight: nothing can reach the threshold.
        return
    normalized_query = tf_idf_query / query_norm

    # ranking documents finding cosine similarity
    rank = [
        ((normalized_query * df['Normalized tf_idf' + str(doc)]).sum(), str(doc))
        for doc in docs
    ]

    print("")
    rank = sorted(rank, key=lambda x: x[0], reverse=True)
    for r, d in rank:
        # Scores below this cut-off are treated as "not relevant".
        if r >= 0.025:
            tree.insert('', 'end', values=("document-" + str(d), r))
    

# Search-button callback: refresh the results table for the current query
def show_table():
    """Empty the results table, then rank documents for the typed query."""
    # Drop every row left over from the previous search in one call.
    stale_rows = tree.get_children()
    tree.delete(*stale_rows)

    # Hand the current contents of the entry widget to the ranking routine.
    query_processing(query_entry.get())

# Create search button; clicking it calls show_table(), which clears the
# table and runs the query currently typed in query_entry.
search_button = tk.Button(root, text="Search", command=show_table)
search_button.pack()

# Run the application: start the Tk event loop on the root window.
# NOTE: this is executed at module top level, not behind a __main__ guard.
root.mainloop()
    


           



