## Import Libraries

In [None]:
import FileManager
import WordCleaner
import Indexer
import Matcher
from sklearn.feature_extraction.text import TfidfVectorizer
%load_ext autoreload
%autoreload 2

from nltk.corpus import wordnet
import numpy as np

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

## Load Files

In [None]:
# Shared TF-IDF vectorizer: the same instance is later passed to
# Indexer.calculate_tf_idf (corpus) and Indexer.calculate_doc_tf_idf (query).
vectorizer = TfidfVectorizer()
# Load the raw dataset; later cells index it by key (dataset[row]).
dataset = FileManager.csv_to_dict('./csv/testing.csv')
# History of dataset versions: each preprocessing cell appends its output here
# and the following cell picks it up through datasets[-1].
datasets = [dataset]

## Remove stop words

In [None]:
# Remove stop words from every entry of the raw dataset and save each result
# as a row of ./csv/filtered.csv.
filtered_file_writer, file = FileManager.open_csv_writer('./csv/filtered.csv', ['id', 'text'])
filtered_dataset = {}
try:
    for row, text in dataset.items():
        filtered_dataset[row] = WordCleaner.remove_stop_words(text)
        filtered_file_writer.writerow({'id': row, 'text': filtered_dataset[row]})
finally:
    # Close the handle so buffered rows are flushed to disk even if a row fails.
    # Assumes the second value returned by open_csv_writer is the open file
    # object -- TODO confirm against FileManager.
    file.close()
# Publish this version so the next cell can read it via datasets[-1].
datasets.append(filtered_dataset)

## Stem

In [None]:
# Stem the most recently appended dataset version with the 'Snowball' stemmer
# and save each result as a row of ./csv/stemmed.csv.
dataset = datasets[-1]
# NOTE(review): the original comment here read "To clear the contents of the
# file" -- presumably open_csv_writer truncates the target file; confirm.
stemmed_file_writer, file = FileManager.open_csv_writer('./csv/stemmed.csv', ['id', 'text'])
filtered_stemmed_dataset = {}
try:
    for row, text in dataset.items():
        filtered_stemmed_dataset[row] = WordCleaner.stem(text, 'Snowball')
        stemmed_file_writer.writerow({'id': row, 'text': filtered_stemmed_dataset[row]})
finally:
    # Close the handle so buffered rows are flushed to disk even if a row fails.
    # Assumes the second value returned by open_csv_writer is the open file
    # object -- TODO confirm against FileManager.
    file.close()
# Publish this version so the next cell can read it via datasets[-1].
datasets.append(filtered_stemmed_dataset)

## Lemmatize

In [None]:
# Lemmatize the most recently appended dataset version and save each result
# as a row of ./csv/lemmatized.csv.
dataset = datasets[-1]
lemmatized_file_writer, file = FileManager.open_csv_writer('./csv/lemmatized.csv', ['id', 'text'])
filtered_lemmatized_dataset = {}
try:
    for row, text in dataset.items():
        filtered_lemmatized_dataset[row] = WordCleaner.lemmatize(text)
        lemmatized_file_writer.writerow({'id': row, 'text': filtered_lemmatized_dataset[row]})
finally:
    # Close the handle so buffered rows are flushed to disk even if a row fails.
    # Assumes the second value returned by open_csv_writer is the open file
    # object -- TODO confirm against FileManager.
    file.close()
# Publish this version so the next cell can read it via datasets[-1].
datasets.append(filtered_lemmatized_dataset)

## Synonym Mapping

In [None]:
# Work on the most recently appended dataset version.
dataset = datasets[-1]

# Look up table: every word found in the dataset -> the value returned by
# WordCleaner.get_unified_synonym for it.
synonym_dict = {
    word: WordCleaner.get_unified_synonym(word)
    for words in dataset.values()
    for word in words
}
print(synonym_dict)

# Rebuild each entry with its words replaced through the look up table, then
# publish the result as the newest dataset version.
mapped_dataset = {
    key: [synonym_dict[word] for word in words]
    for key, words in dataset.items()
}
datasets.append(mapped_dataset)
print(mapped_dataset)

In [None]:
# Save the synonym-mapped dataset as rows of ./csv/mapped.csv.
mapped_file_writer, file = FileManager.open_csv_writer('./csv/mapped.csv', ['id', 'text'])
try:
    for row, words in mapped_dataset.items():
        mapped_file_writer.writerow({'id': row, 'text': words})
finally:
    # No later cell rebinds `file`, so without an explicit close this handle
    # would stay open for the rest of the session.
    # Assumes the second value returned by open_csv_writer is the open file
    # object -- TODO confirm against FileManager.
    file.close()

## Creating the inverted index

In [None]:
# Build the inverted index from the latest dataset version (datasets[-1]).
inverted_index = Indexer.get_inverted_index(datasets[-1])
# Bare expression on the last line so the notebook displays the value.
inverted_index

## Calculating tf-idf for the document

In [None]:
# Disabled alternative that used the manual TF-IDF implementation:
# (tfidf_matrix, df) = Indexer.calculateManualTF_IDF(datasets[-1])
# df
# Compute TF-IDF for the latest dataset version using the shared vectorizer
# created in the "Load Files" cell.
df = Indexer.calculate_tf_idf(datasets[-1], vectorizer)
# Bare expression on the last line so the notebook displays the value.
df

## Query Manipulation 

### Process query

In [None]:
# Run the raw query through the preprocessing steps applied to the corpus,
# printing the intermediate result after each step.
query = 'US'
query = word_tokenize(query)
print(query)
query = WordCleaner.remove_stop_words(query)
print(query)
# Use the same stemmer as the corpus (the "Stem" cell passes 'Snowball');
# stemming the query with a different algorithm can produce terms that do not
# match the indexed vocabulary.
query = WordCleaner.stem(query, 'Snowball')
print(query)
# NOTE(review): the corpus is lemmatized after stemming, but this step is
# disabled for the query -- confirm whether that is intentional.
# query = WordCleaner.lemmatize(query)
# print(query)
# NOTE(review): the corpus is mapped with WordCleaner.get_unified_synonym while
# the query uses WordCleaner.get_alternative -- confirm both yield the same term.
query = [WordCleaner.get_alternative(word) for word in query]
print(query)

### Calculate TF-IDF

In [None]:
# Disabled alternative that collected the corpus vocabulary by hand and used
# an older three-argument form of calculate_doc_tf_idf:
# all_tokens = []
# for row in datasets[-1]:
#     for token in datasets[-1][row]:
#         if token not in all_tokens:
#             all_tokens.append(token)

# (query_tfidf_matrix, qdf) = Indexer.calculate_doc_tf_idf(datasets[-1],all_tokens,query)
# qdf

# Join the processed query terms into one space-separated string and pass it,
# as a single-document list, together with the shared vectorizer.
# NOTE(review): presumably the vectorizer was already fitted on the corpus by
# Indexer.calculate_tf_idf, so the query shares its vocabulary -- confirm.
qdf = Indexer.calculate_doc_tf_idf([' '.join(query)],vectorizer)
# Bare expression on the last line so the notebook displays the value.
qdf

### Calculate Cosine Similarity

In [None]:
# TODO: merge with redwan's work

# Compare the corpus TF-IDF result (df) against the first row of the query
# result (qdf.iloc[0]); the loop below unpacks each returned item as a
# (doc, sim) pair.
related_docs = Matcher.calcCosSimWithCorpus(df,qdf.iloc[0])

# Print the related documents.
# NOTE(review): the header mentions "similarity > 0.5" and the original comment
# said "sorted", but no filtering or sorting happens in this cell -- presumably
# both are done inside Matcher.calcCosSimWithCorpus; confirm.
print("Related Docs (similarity > 0.5):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

## Tests

### Spell Check

In [None]:
# !pip install textblob

In [None]:
# from textblob import TextBlob

# # word = "henlo cmputr"  # Incorrect spelling
# # print("Original text:", word)
# # corrected_word = TextBlob(word).correct()
# # print("Corrected text:", corrected_word)

# corrected_dataset = {}
# stemmed_file_writer, file = FileManager.openCSVWriter('./csv/stemmed.csv',['id','text'])
# for row in filtered_stemmed_dataset:
#     # Correct spelling using TextBlob
#     corrected_dataset[row] = str(TextBlob(str(filtered_stemmed_dataset[row])).correct())
#     stemmed_file_writer.writerow({'id': row, 'text': corrected_dataset[row]})


### Synonyms Mapping Tests

In [None]:
# Example: list every WordNet synset of the word 'machine' with its lemma names
for synset in wordnet.synsets('machine'):
    print(synset.name())
    print(synset.lemma_names())

# Look at the hypernyms of the first synset found for 'car'.
# `name` is a method (it is called as synset.name() in the loop above), so it
# must be called here too; printing `.name` alone shows the bound method.
# The variable is not called `sys`, to avoid shadowing the stdlib module name.
hypernyms = wordnet.synsets('car')[0].hypernyms()
print(hypernyms[0].name())

In [None]:
def get_synonyms(word, pos=None):
    """Collect the lemma names of every WordNet synset found for *word*.

    Args:
        word: The word to look up in WordNet.
        pos: Part-of-speech filter forwarded to ``wordnet.synsets`` (the
            example usage passes ``"n"``). Defaults to ``None``, which is also
            the default of ``wordnet.synsets`` and applies no filter.

    Returns:
        A set with the lemma names of all matching synsets; empty when
        ``wordnet.synsets`` returns no synset.
    """
    return {
        lemma
        for synset in wordnet.synsets(word, pos)
        for lemma in synset.lemma_names()
    }

# Example usage: collect the noun ("n") synonyms of a few sample words.
# Note: this rebinds `synonym_dict`, the same name the "Synonym Mapping" cell
# uses for its look up table.
synonym_dict = {
    word: list(get_synonyms(word, "n"))
    for word in ("car", "phone")  # Add more words here...
}

print(synonym_dict)


### BM25

In [None]:
# BM25 structure
# <query_id> Q0 <doc_id> <rank> <BM25_score> BM25

In [None]:
# !pip install rank-bm25

In [None]:
# import pandas as pd
# from rank_bm25 import BM25Okapi

# # Example corpus (list of documents from the DataFrame)
# corpus = df_lemma.values.tolist()

# # Initialize BM25 model
# bm25 = BM25Okapi(corpus)

# # Example query
# query = "Introduction Machine Learning Algorithms"
# tokenized_query = query.split()
# print(tokenized_query)

# # Get BM25 scores for documents
# doc_scores = bm25.get_scores(tokenized_query)

# # Write results to BM25.res file
# with open("BM25.res", "w") as f:
#     for rank, (doc_id, score) in enumerate(zip(range(len(doc_scores)), doc_scores)):
#         f.write(f"158491 Q0 {doc_id} {rank} {score:.6f} BM25\n")



### Count words in file

In [None]:
# import pandas as pd

# # Read the CSV file (replace 'your_file.csv' with the actual filename)
# df = pd.read_csv('lemmatized.csv')

# # Assuming you want to count words in the 'text' column
# text_column = df['text']

# # Tokenize and count words
# total_words = sum(len(text.split()) for text in text_column)

# print(f"Total words in the CSV file: {total_words}")
