## Import Libraries

In [None]:
import FileManager
import WordCleaner
import Indexer
%load_ext autoreload
%autoreload 2

## Load Files

In [None]:
# Load the raw corpus from 'testing.csv' through the project's FileManager helper.
# Later cells iterate `dataset` by key and write key/value as the 'id'/'text'
# columns, so it is presumably a dict of document id -> text (verify against
# FileManager.csvToDict).
dataset = FileManager.csvToDict('testing.csv')

## Remove stop words

In [None]:
# Strip stop words from every document, keeping the result in memory
# (`filtered_dataset`) and mirroring each row to 'filtered.csv'.
filtered_dataset = {}
filtered_file_writer, file = FileManager.openCSVWriter('filtered.csv',['id','text'])
try:
    for row in dataset:
        filtered_dataset[row] = WordCleaner.removeStopWords(dataset[row])
        filtered_file_writer.writerow({'id': row, 'text': filtered_dataset[row]})
finally:
    # Close the handle even if cleaning fails part-way, so the rows written so
    # far are flushed to disk instead of sitting in the buffer of an open file.
    # NOTE(review): assumes the second value returned by openCSVWriter is the
    # open file object backing the writer - confirm against FileManager.
    file.close()

## Stem

In [None]:
# Stem every stop-word-filtered document with the 'porter' stemmer, keeping the
# result in memory (`filtered_stemmed_dataset`) and mirroring it to 'stemmed.csv'.
filtered_stemmed_dataset = {}
# Re-opening the writer is intended to clear any previous contents of the file.
stemmed_file_writer, file = FileManager.openCSVWriter('stemmed.csv',['id','text'])
try:
    for row, text in filtered_dataset.items():
        filtered_stemmed_dataset[row] = WordCleaner.stem(text, 'porter')
        stemmed_file_writer.writerow({'id': row, 'text': filtered_stemmed_dataset[row]})
finally:
    # Close the handle even if stemming fails part-way, so the rows written so
    # far are flushed to disk instead of sitting in the buffer of an open file.
    # NOTE(review): assumes the second value returned by openCSVWriter is the
    # open file object backing the writer - confirm against FileManager.
    file.close()

## Lemmatize

In [None]:
# Lemmatize every stop-word-filtered document, keeping the result in memory
# (`filtered_lemmatized_dataset`) and mirroring it to 'lemmatized.csv'.
filtered_lemmatized_dataset = {}
lemmatized_file_writer, file = FileManager.openCSVWriter('lemmatized.csv',['id','text'])
try:
    for row, text in filtered_dataset.items():
        filtered_lemmatized_dataset[row] = WordCleaner.lemmatize(text)
        lemmatized_file_writer.writerow({'id': row, 'text': filtered_lemmatized_dataset[row]})
finally:
    # Close the handle even if lemmatizing fails part-way, so the rows written
    # so far are flushed to disk instead of sitting in the buffer of an open file.
    # NOTE(review): assumes the second value returned by openCSVWriter is the
    # open file object backing the writer - confirm against FileManager.
    file.close()

## Creating the inverted index

In [None]:
# Build the inverted index for the stemmed corpus via the project's Indexer helper.
stemmed_inverted_index = Indexer.getInvertedIndex(filtered_stemmed_dataset)
# Bare expression on the cell's last line so the notebook displays the index.
stemmed_inverted_index

In [None]:
# Build the inverted index for the lemmatized corpus via the project's Indexer helper.
lemmatized_inverted_index = Indexer.getInvertedIndex(filtered_lemmatized_dataset)
# Bare expression on the cell's last line so the notebook displays the index.
lemmatized_inverted_index

## Calculating TF-IDF for the documents

In [None]:
# Compute TF-IDF over the stemmed corpus; calculateTF_IDF returns a pair that is
# unpacked into `tfidf_matrix` and `df_stem`.
# NOTE(review): the lemmatized TF-IDF cell binds the same name `tfidf_matrix`,
# so once both cells have run it no longer refers to the stemmed result.
(tfidf_matrix, df_stem) = Indexer.calculateTF_IDF(filtered_stemmed_dataset)
# `df_stem` is consumed row by row in the cosine-similarity cell (via .iloc and
# .iterrows), so it is presumably a pandas DataFrame with one row per document.
df_stem

In [None]:
# Compute TF-IDF over the lemmatized corpus; calculateTF_IDF returns a pair that
# is unpacked into `tfidf_matrix` and `df_lemma`.
# NOTE(review): this rebinds `tfidf_matrix`, replacing the value produced by the
# stemmed TF-IDF cell that uses the same name.
(tfidf_matrix, df_lemma) = Indexer.calculateTF_IDF(filtered_lemmatized_dataset)
# `df_lemma` is consumed row by row in the cosine-similarity cell (via .iloc and
# .iterrows), so it is presumably a pandas DataFrame with one row per document.
df_lemma

### Count words in file

In [None]:
# import pandas as pd

# # Read the lemmatized CSV file (change the filename to count words in another output)
# df = pd.read_csv('lemmatized.csv')

# # Assuming you want to count words in the 'text' column
# text_column = df['text']

# # Tokenize and count words
# total_words = sum(len(text.split()) for text in text_column)

# print(f"Total words in the CSV file: {total_words}")


### Spell Check

In [None]:
# !pip install textblob

In [None]:
# from textblob import TextBlob

# # word = "henlo cmputr"  # Incorrect spelling
# # print("Original text:", word)
# # corrected_word = TextBlob(word).correct()
# # print("Corrected text:", corrected_word)

# corrected_dataset = {}
# stemmed_file_writer, file = FileManager.openCSVWriter('stemmed.csv',['id','text'])
# for row in filtered_stemmed_dataset:
#     # Correct spelling using TextBlob
#     corrected_dataset[row] = str(TextBlob(str(filtered_stemmed_dataset[row])).correct())
#     stemmed_file_writer.writerow({'id': row, 'text': corrected_dataset[row]})


### Synonyms Mapping

In [None]:
# synonyms_mapping = {
#     "USA": "United States of America",
#     "U.S.": "United States of America",
#     "NYC": "New York City",
#     "1st": "first",
#     "england": "britain"
#     # Add more mappings as needed
# }


# def normalize_term(term):
#     return synonyms_mapping.get(term, term)  # Use the canonical form if available, else keep the original term

# # Usage:
# user_input = "NYC weather forecast"
# normalized_input = " ".join(normalize_term(term) for term in user_input.split())
# print(normalized_input)  # Output: "New York City weather forecast"


## Cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Minimum cosine similarity for a document to be reported as related.
SIMILARITY_THRESHOLD = 0.5

vector_A = df_stem.iloc[0]  # First vector (document 1)

# Convert vector A to a numpy array
A_array = np.array(vector_A)

# Score document 1 against every row of the stemmed TF-IDF table in a single
# call; the result has shape (1, n_docs), so take its only row. This replaces
# one cosine_similarity call (and its input validation) per document.
similarities = cosine_similarity([A_array], df_stem.to_numpy())[0]

# NOTE: document 1 is compared with itself as well, so it is listed among its
# own related documents whenever its vector is non-zero.
related_docs = []
for index, similarity in zip(df_stem.index, similarities):
    print(f"Cosine Similarity between doc 1 and doc {index}: {similarity:.4f}")
    if similarity > SIMILARITY_THRESHOLD:
        related_docs.append((index, similarity))

# Sort related_docs by similarity (highest to lowest)
related_docs.sort(key=lambda x: x[1], reverse=True)

# Print the sorted related documents
print(f"Related Docs (similarity > {SIMILARITY_THRESHOLD}):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Minimum cosine similarity for a document to be reported as related.
SIMILARITY_THRESHOLD = 0.5

vector_A = df_lemma.iloc[0]  # First vector (document 1)

# Convert vector A to a numpy array
A_array = np.array(vector_A)

# Score document 1 against every row of the lemmatized TF-IDF table in a single
# call; the result has shape (1, n_docs), so take its only row. This replaces
# one cosine_similarity call (and its input validation) per document.
similarities = cosine_similarity([A_array], df_lemma.to_numpy())[0]

# NOTE: document 1 is compared with itself as well, so it is listed among its
# own related documents whenever its vector is non-zero.
related_docs = []
for index, similarity in zip(df_lemma.index, similarities):
    print(f"Cosine Similarity between doc 1 and doc {index}: {similarity:.4f}")
    if similarity > SIMILARITY_THRESHOLD:
        related_docs.append((index, similarity))

# Sort related_docs by similarity (highest to lowest)
related_docs.sort(key=lambda x: x[1], reverse=True)

# Print the sorted related documents
print(f"Related Docs (similarity > {SIMILARITY_THRESHOLD}):")
for doc, sim in related_docs:
    print(f"Doc {doc}: Similarity = {sim:.4f}")

## Testing stuff

In [None]:
# BM25 structure
# <query_id> Q0 <doc_id> <rank> <BM25_score> BM25

In [None]:
# !pip install rank-bm25

In [None]:
# import pandas as pd
# from rank_bm25 import BM25Okapi

# # Example corpus (list of documents from the DataFrame)
# corpus = df_lemma.values.tolist()

# # Initialize BM25 model
# bm25 = BM25Okapi(corpus)

# # Example query
# query = "Introduction Machine Learning Algorithms"
# tokenized_query = query.split()
# print(tokenized_query)

# # Get BM25 scores for documents
# doc_scores = bm25.get_scores(tokenized_query)

# # Write results to BM25.res file
# with open("BM25.res", "w") as f:
#     for rank, (doc_id, score) in enumerate(zip(range(len(doc_scores)), doc_scores)):
#         f.write(f"158491 Q0 {doc_id} {rank} {score:.6f} BM25\n")

