In [None]:
# Install required packages
!pip install nltk scikit-learn pandas numpy joblib contractions beautifulsoup4 inflect

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


In [None]:
# Import required libraries
import logging
import re
import unicodedata
import warnings
from collections import defaultdict

import contractions
import inflect
import joblib
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from google.colab import files
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
# WordNetLemmatizer lives in nltk.stem; there is no nltk.lemma module.
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')


In [None]:
# Mount Google Drive
# Colab-only step: exposes the Drive contents under /content/drive, which is
# the prefix used by dataset_path when the dataset is loaded further down.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Text Processing Class
class TextProcessor:
    """String-in / string-out text cleaning helpers.

    The helpers wrap NLTK (tokenizing, stemming, lemmatizing, stopwords,
    WordNet), inflect (numbers to words), BeautifulSoup (HTML stripping) and
    the contractions package. Every method returns a new string and leaves its
    input untouched.
    """

    # Largest integer number_to_words() will spell out; bigger values are
    # replaced by OUT_OF_RANGE_TOKEN.
    MAX_SPELLED_NUMBER = 999999999999999
    OUT_OF_RANGE_TOKEN = '[Number Out of Range]'

    def __init__(self):
        """Create the tokenizer, stemmer, lemmatizer, stopword set and inflect engine."""
        self.tokenizer = nltk.tokenize.word_tokenize
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.inflect_engine = inflect.engine()

    def clean_text(self, text, words_to_remove):
        """Drop every whitespace-separated word that appears in ``words_to_remove``.

        Matching is exact (case-sensitive); the remaining words are re-joined
        with single spaces.
        """
        return ' '.join(word for word in text.split() if word not in words_to_remove)

    def number_to_words(self, text):
        """Spell out integer tokens, e.g. ``42`` -> ``forty-two``.

        Tokens that are not pure decimal digits (ordinary words, decimals such
        as ``3.14``) pass through unchanged. Integers above
        ``MAX_SPELLED_NUMBER``, or integers the inflect engine fails to
        convert, become ``OUT_OF_RANGE_TOKEN``.

        Returns the tokens re-joined with single spaces.
        """
        converted_words = []
        for word in self.tokenizer(text):
            # The original test order sent every numeric token down the
            # "append unchanged" path, so nothing was ever converted.
            # isdecimal() guarantees int(word) succeeds.
            if not word.isdecimal():
                converted_words.append(word)
                continue
            if int(word) > self.MAX_SPELLED_NUMBER:
                converted_words.append(self.OUT_OF_RANGE_TOKEN)
                continue
            try:
                converted_words.append(self.inflect_engine.number_to_words(word))
            except Exception as exc:
                logging.warning('Could not convert %r to words: %s', word, exc)
                converted_words.append(self.OUT_OF_RANGE_TOKEN)
        return ' '.join(converted_words)

    def remove_html_tags(self, text):
        """Return the text content of ``text`` with HTML tags stripped.

        Parsing is skipped when the text contains no ``<`` / ``>`` pair. If
        parsing fails, the failure is logged and the input is returned as-is.
        """
        if '<' not in text or '>' not in text:
            return text
        try:
            return BeautifulSoup(text, 'html.parser').get_text()
        except Exception as exc:
            # Best effort: report the real error instead of a fixed message.
            logging.warning('Failed to strip HTML tags (%s); returning text unchanged.', exc)
            return text

    def normalize_unicode(self, text):
        """Apply Unicode NFKD (compatibility decomposition) normalization."""
        return unicodedata.normalize('NFKD', text)

    def expand_contractions(self, text):
        """Expand contractions via ``contractions.fix``."""
        return contractions.fix(text)

    def cleaned_text(self, text):
        """Replace each non-word character with a space, then collapse whitespace runs."""
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text

    def normalization_example(self, text):
        """Lower-case the text."""
        return text.lower()

    def stemming_example(self, text):
        """Porter-stem every token and re-join with single spaces."""
        return ' '.join(self.stemmer.stem(word) for word in self.tokenizer(text))

    def lemmatization_example(self, text):
        """Lemmatize every token with the lemmatizer's default settings."""
        return ' '.join(self.lemmatizer.lemmatize(word) for word in self.tokenizer(text))

    def remove_stopwords(self, text):
        """Drop tokens whose lower-cased form is an English stopword."""
        return ' '.join(
            word for word in self.tokenizer(text) if word.lower() not in self.stop_words
        )

    def remove_punctuation(self, text):
        """Delete every character that is neither a word character nor whitespace."""
        return re.sub(r'[^\w\s]', '', text)

    def remove_urls(self, text):
        """Delete whitespace-free runs that start with ``http`` or ``www``."""
        return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    def remove_special_characters_and_emojis(self, text):
        """Keep only ASCII letters, digits and whitespace."""
        return re.sub(r'[^A-Za-z0-9\s]+', '', text)

    def replace_synonyms(self, text):
        """Replace every token with the result of ``get_synonym``."""
        return ' '.join(self.get_synonym(word) for word in self.tokenizer(text))

    def get_synonym(self, word):
        """Return the first lemma name of the first WordNet synset for ``word``.

        ``word`` is returned unchanged when WordNet has no synset for it.
        """
        synsets = wordnet.synsets(word)
        if synsets:
            return synsets[0].lemmas()[0].name()
        return word

    def handle_negations(self, text):
        """Fold a negation into the following token as ``NOT_<token>``.

        The negation token itself (``not`` / ``n't``) is dropped. A negation
        with no following token therefore disappears from the output.
        """
        negated_text = []
        negate = False
        for word in self.tokenizer(text):
            if word.lower() in ['not', "n't"]:
                negate = True
            elif negate:
                negated_text.append(f'NOT_{word}')
                negate = False
            else:
                negated_text.append(word)
        return ' '.join(negated_text)

    def remove_non_english_words(self, text):
        """Keep only tokens for which WordNet returns at least one synset."""
        return ' '.join(word for word in self.tokenizer(text) if wordnet.synsets(word))


In [None]:
# Initialize text processor
processor = TextProcessor()

# Custom text processing function
def processed_text(text):
    """Run the full cleaning pipeline on one document and return the result.

    ``None`` is returned unchanged. The same processor steps as before are
    applied, but ordered so that each one still sees the input it needs:

    * URL removal and contraction expansion run while punctuation is intact
      (after non-word characters are stripped they have nothing to match).
    * Negation handling runs before stopword removal, because "not" is itself
      dropped by the stopword filter.
    * Stopword removal and lemmatization run on real words, so stemming is
      the final step.
    """
    if text is None:
        return text
    # Steps that depend on the raw punctuation.
    text = processor.remove_urls(text)
    text = processor.expand_contractions(text)
    text = processor.normalize_unicode(text)
    text = processor.normalization_example(text)
    # Strip non-word characters, then work token by token.
    text = processor.cleaned_text(text)
    text = processor.handle_negations(text)
    text = processor.number_to_words(text)
    # Spelled-out numbers can contain hyphens or commas.
    text = processor.remove_punctuation(text)
    text = processor.remove_stopwords(text)
    text = processor.lemmatization_example(text)
    text = processor.stemming_example(text)
    return text


In [None]:
# Dataset location on the mounted Drive -- edit to match where your copy lives.
dataset_path = '/content/drive/MyDrive/antique_dataset/antique_dataset.tsv'

# Parse the tab-separated file into a DataFrame and show a quick overview.
df = pd.read_csv(dataset_path, delimiter='\t')
print(f'Dataset shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}')
print(df.head())


In [None]:
# Pick out the columns whose (lower-cased) name contains one of the usual
# free-text keywords. Adjust the keyword list to your dataset if needed.
text_columns = [
    col
    for col in df.columns
    if any(keyword in col.lower() for keyword in ['text', 'content', 'description', 'title', 'body', 'document'])
]

print(f'Identified text columns: {text_columns}')

# Nothing matched: fall back to a manually specified column name.
if len(text_columns) == 0:
    text_columns = ['text']  # Replace with your actual column name
    print(f'Using manual text columns: {text_columns}')


In [None]:
# With a single text column use it directly; with several, join them row by
# row (missing values become empty strings) into a new 'combined_text' column.
if len(text_columns) <= 1:
    text_column = text_columns[0]
else:
    text_parts = df[text_columns].fillna('').astype(str)
    df['combined_text'] = text_parts.apply(' '.join, axis=1)
    text_column = 'combined_text'

# Drop missing rows and materialise the corpus as a list of strings.
non_null_text = df[text_column].dropna()
documents = non_null_text.astype(str).tolist()
print(f'Number of documents: {len(documents)}')
print(f'Sample document: {documents[0][:200]}...')


In [None]:
# Run every raw document through processed_text(), reporting progress
# once per 1000 documents.
print('Processing documents with custom cleaning method...')
processed_documents = []

for i, doc in enumerate(documents):
    if not i % 1000:
        print(f'Processed {i}/{len(documents)} documents')
    processed_documents.append(processed_text(doc))

print(f'Text processing complete. Sample processed document: {processed_documents[0][:200]}...')


In [None]:
# Tokenizer handed to TfidfVectorizer: cleans the raw text, then tokenizes it.
def custom_tokenizer(text):
    """Clean ``text`` with processed_text() and split the result into tokens.

    Returns an empty list when the cleaned text is empty or None.
    """
    cleaned = processed_text(text)
    if not cleaned:
        return []
    return processor.tokenizer(cleaned)

# Create TF-IDF vectorizer with custom tokenizer.
# `preprocessor` must be a callable or None -- passing False is rejected (or
# fails with "'bool' object is not callable"). With preprocessor=None and
# lowercase=False the documents are handed to custom_tokenizer untouched.
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=None,   # No custom preprocessor; text reaches the tokenizer as-is
    tokenizer=custom_tokenizer,  # Use our custom tokenizer
    token_pattern=None,  # Disable default token pattern
    lowercase=False,     # We already handle lowercasing in custom processing
    stop_words=None,     # We already handle stopwords in custom processing
    max_features=10000,  # Limit vocabulary size
    min_df=2,           # Minimum document frequency
    max_df=0.95,        # Maximum document frequency
    ngram_range=(1, 2)   # Include unigrams and bigrams
)

print('Fitting TF-IDF vectorizer with custom tokenizer...')
# Fit on the raw documents: custom_tokenizer performs the cleaning itself, so
# feeding already-processed text would run the pipeline twice.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(f'TF-IDF matrix shape: {tfidf_matrix.shape}')
print(f'Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}')


In [None]:
# Build inverted index from TF-IDF matrix: term -> [(doc_idx, tfidf_score), ...]
print('Building inverted index...')

# Get feature names (terms); position i corresponds to column i of the matrix
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create inverted index
inverted_index = defaultdict(list)

# COO format exposes parallel row / col / data arrays, so every non-zero entry
# can be read in a single pass (no per-entry mask over the whole matrix).
coo_matrix = tfidf_matrix.tocoo()

for doc_idx, term_idx, tfidf_score in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
    inverted_index[feature_names[term_idx]].append((doc_idx, tfidf_score))

# Sort each posting list by TF-IDF score (descending)
for postings in inverted_index.values():
    postings.sort(key=lambda posting: posting[1], reverse=True)

print(f'Inverted index created with {len(inverted_index)} terms')
# Guard the sample print so an empty index does not raise.
if inverted_index:
    sample_term = next(iter(inverted_index))
    print(f'Sample term: {sample_term} -> {inverted_index[sample_term][:5]}')


In [None]:
# Pairwise cosine similarity between every pair of document vectors.
print('Calculating document similarity matrix...')

similarity_matrix = cosine_similarity(tfidf_matrix)
print(f'Similarity matrix shape: {similarity_matrix.shape}')

# Summary statistics over the whole matrix. Each document is also compared
# with itself, so those entries are part of the mean / max / min below.
mean_similarity = similarity_matrix.mean()
max_similarity = similarity_matrix.max()
min_similarity = similarity_matrix.min()

print(f'Mean similarity: {mean_similarity:.4f}')
print(f'Max similarity: {max_similarity:.4f}')
print(f'Min similarity: {min_similarity:.4f}')


In [None]:
# Calculate MAP (Mean Average Precision) for IR evaluation
print('Calculating MAP score...')

def calculate_map(similarity_matrix, threshold=0.4):
    """Mean of per-document average precision over a similarity matrix.

    Each row is treated as a query: its self-similarity is zeroed, the other
    scores are ranked in descending order, and a ranked document counts as
    relevant when its score is >= ``threshold``. Average precision is the mean
    of precision@k taken at every relevant rank; a row with no relevant
    document scores 0.0.

    NOTE(review): ranking and relevance are derived from the same score, so
    the relevant documents always sit at the top of the ranking. This is not
    a ground-truth-based MAP.

    Parameters:
        similarity_matrix: square array of pairwise similarities (not modified).
        threshold: minimum similarity for a document to count as relevant.

    Returns:
        The mean of the per-row average precision values.
    """
    per_query_ap = []

    for query_idx in range(similarity_matrix.shape[0]):
        # Work on a copy so the caller's matrix keeps its diagonal.
        scores = similarity_matrix[query_idx].copy()
        scores[query_idx] = 0

        ranked_scores = np.sort(scores)[::-1]
        is_hit = ranked_scores >= threshold
        if not is_hit.any():
            per_query_ap.append(0.0)
            continue

        # precision@k at every relevant rank k: (#hits up to k) / k
        ranks = np.arange(1, is_hit.size + 1)
        precision_at_hits = np.cumsum(is_hit)[is_hit] / ranks[is_hit]
        per_query_ap.append(np.mean(precision_at_hits))

    return np.mean(per_query_ap)

# Headline score, used by the saved metrics and the final summary.
map_score = calculate_map(similarity_matrix, threshold=0.4)
print(f'MAP score (threshold=0.4): {map_score:.4f}')

# Sweep a few thresholds to see how sensitive the score is to the cut-off.
thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
print('\nMAP scores at different thresholds:')
for thresh in thresholds:
    print(f'Threshold {thresh}: MAP = {calculate_map(similarity_matrix, threshold=thresh):.4f}')


In [None]:
# Document-term matrix: densified only when it has fewer than 10,000,000
# cells, otherwise the sparse TF-IDF matrix is reused as-is.
print('Creating document-term matrix...')

if tfidf_matrix.shape[0] * tfidf_matrix.shape[1] >= 10000000:
    doc_term_matrix = tfidf_matrix  # Keep as sparse matrix
    print(f'Document-term matrix shape (sparse): {doc_term_matrix.shape}')
else:
    doc_term_matrix = tfidf_matrix.toarray()
    print(f'Document-term matrix shape: {doc_term_matrix.shape}')

# Per-term totals: the column sums of the TF-IDF matrix, keyed by term.
term_frequencies = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
term_freq_dict = {name: total for name, total in zip(feature_names, term_frequencies)}

# The 20 terms with the largest totals.
top_terms = sorted(term_freq_dict.items(), key=lambda item: item[1], reverse=True)[:20]
print('\nTop 20 terms by frequency:')
for term, freq in top_terms:
    print(f'{term}: {freq:.4f}')


In [None]:
# Persist every artifact with joblib, into the current working directory.
print('Saving all components...')

# (object, file name, confirmation message) -- written in this order.
artifacts = [
    (tfidf_vectorizer, 'tfidf_vectorizer.joblib', 'TF-IDF vectorizer saved'),
    (tfidf_matrix, 'tfidf_matrix.joblib', 'TF-IDF matrix saved'),
    (similarity_matrix, 'similarity_matrix.joblib', 'Similarity matrix saved'),
    # Stored as a plain dict rather than a defaultdict.
    (dict(inverted_index), 'inverted_index.joblib', 'Inverted index saved'),
    (doc_term_matrix, 'doc_term_matrix.joblib', 'Document-term matrix saved'),
    (feature_names, 'feature_names.joblib', 'Feature names saved'),
    (processed_documents, 'processed_documents.joblib', 'Processed documents saved'),
]
for artifact, artifact_path, saved_message in artifacts:
    joblib.dump(artifact, artifact_path)
    print(saved_message)

# Evaluation metrics gathered from the earlier cells.
metrics = {
    'map_score': map_score,
    'mean_similarity': mean_similarity,
    'max_similarity': max_similarity,
    'min_similarity': min_similarity,
    'vocab_size': len(tfidf_vectorizer.vocabulary_),
    'num_documents': len(documents),
    'matrix_shape': tfidf_matrix.shape
}
joblib.dump(metrics, 'ir_metrics.joblib')
print('IR metrics saved')

print('All components saved successfully!')


In [None]:
# Send every saved artifact to the browser via google.colab.files.
print('Downloading files...')

files_to_download = [
    'tfidf_vectorizer.joblib',
    'tfidf_matrix.joblib',
    'similarity_matrix.joblib',
    'inverted_index.joblib',
    'doc_term_matrix.joblib',
    'feature_names.joblib',
    'processed_documents.joblib',
    'ir_metrics.joblib'
]

# A failed download is reported and the loop moves on to the next file.
for file_name in files_to_download:
    try:
        files.download(file_name)
    except Exception as e:
        print(f'Error downloading {file_name}: {e}')
    else:
        print(f'Downloaded: {file_name}')

print('Download complete!')


In [None]:
# Recap of the run plus a check of the MAP score against the 0.4 target.
print('=== SUMMARY ==>')
for summary_line in (
    f'Documents processed: {len(documents)}',
    f'Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}',
    f'TF-IDF matrix shape: {tfidf_matrix.shape}',
    f'Inverted index terms: {len(inverted_index)}',
    f'MAP score: {map_score:.4f}',
    f'Mean document similarity: {mean_similarity:.4f}',
):
    print(summary_line)

if map_score >= 0.4:
    print('✅ MAP score requirement met (>= 0.4)')
else:
    print('❌ MAP score below 0.4. Consider adjusting parameters.')

# Printed instructions for reloading the saved artifacts in another session.
print('\n=== USAGE EXAMPLE ==>')
for usage_line in (
    'To load the saved components:',
    'import joblib',
    'vectorizer = joblib.load("tfidf_vectorizer.joblib")',
    'matrix = joblib.load("tfidf_matrix.joblib")',
    'inverted_index = joblib.load("inverted_index.joblib")',
    'similarity_matrix = joblib.load("similarity_matrix.joblib")',
):
    print(usage_line)
