In [ ]:
import re
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Text normalisation shared by the tokenizer
def custom_clean(text):
    """Return *text* with every non-word character blanked out, lowercased.

    A "word character" is what the ``re`` module treats as one: letters,
    digits and the underscore.  Each other character (punctuation,
    whitespace, symbols) is replaced by exactly one space, so runs of such
    characters become runs of spaces rather than being collapsed.
    Substitution happens before lowercasing.
    """
    return re.sub(r'\W', ' ', text).lower()

# Custom tokenizer
class CustomTokenizer(BaseEstimator, TransformerMixin):
    """Clean text with ``custom_clean`` and split it on whitespace.

    The object can be used in two ways:

    * as a scikit-learn transformer -- ``transform(X)`` tokenizes every
      document of an iterable and returns a list of token lists;
    * as a plain per-document callable -- ``tokenizer(doc)`` returns the
      token list of one document.  This is the form ``TfidfVectorizer``
      needs for its ``tokenizer`` argument, which must be callable.

    The tokenizer is stateless, so ``fit`` learns nothing.
    """

    def __call__(self, doc):
        """Return the list of tokens of the single document string *doc*."""
        return custom_clean(doc).split()

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (present for the transformer API)."""
        return self

    def transform(self, X):
        """Return one token list per document in the iterable *X*."""
        return [self(doc) for doc in X]

# Input CSV locations (replace with actual paths); both files are read into
# DataFrames whose 'text' column is used further down.
docs_path, queries_path = (
    "/content/drive/MyDrive/downloads/docs.csv",
    "/content/drive/MyDrive/downloads/queries.csv",
)
docs = pd.read_csv(filepath_or_buffer=docs_path)
queries = pd.read_csv(filepath_or_buffer=queries_path)

# Vectorization
# TfidfVectorizer calls tokenizer(preprocessor(doc)) for every document, so
# this configuration tokenizes each document as custom_clean(doc).split().
# Both hooks must be callables taking a single document, and both must be
# picklable because the fitted vectorizer is saved with joblib.dump below:
# pickle cannot serialize a lambda, whereas a module-level function and
# str.split are pickled by reference.
# token_pattern=None tells scikit-learn the default regex is intentionally
# unused (it is ignored whenever a tokenizer is supplied).
vectorizer = TfidfVectorizer(
    preprocessor=custom_clean,
    tokenizer=str.split,
    token_pattern=None,
)
# Empty CSV cells are read by pandas as NaN, which is not a valid document;
# treat them as empty text so one missing value does not abort the run.
X_docs = vectorizer.fit_transform(docs['text'].fillna(''))
X_queries = vectorizer.transform(queries['text'].fillna(''))

# Inverted index: term -> sorted list of document row positions whose TF-IDF
# weight for that term is stored (non-zero) in X_docs.  Positions are row
# numbers of X_docs, i.e. positional offsets into docs['text'], not
# DataFrame index labels.
# Column-compressed storage makes "all rows of one column" a contiguous slice
# of .indices delimited by .indptr.
_docs_by_term = X_docs.tocsc()
inverted_index = {
    term: sorted(
        _docs_by_term.indices[
            _docs_by_term.indptr[column]:_docs_by_term.indptr[column + 1]
        ].tolist()
    )
    for term, column in vectorizer.vocabulary_.items()
}

# Persist the fitted vectorizer, the document TF-IDF matrix and the inverted
# index, in that order, each to its own joblib file.
for _artifact, _target in (
    (vectorizer, "/content/drive/MyDrive/downloads/tfidf_vectorizer.joblib"),
    (X_docs, "/content/drive/MyDrive/downloads/tfidf_docs.joblib"),
    (inverted_index, "/content/drive/MyDrive/downloads/inverted_index.joblib"),
):
    joblib.dump(_artifact, _target)

# Calculate and ensure MAP
def calculate_map(query_vectors, doc_vectors, relevance=None):
    """Return the mean average precision (MAP) of dot-product retrieval.

    Every query row is scored against every document row with a dot product,
    documents are ranked by descending score (ties keep document order), and
    the average precision of each ranking is averaged over the queries.
    For L2-normalised rows the dot product equals cosine similarity.

    Args:
        query_vectors: 2-D NumPy array or SciPy sparse matrix, one row per
            query.
        doc_vectors: 2-D NumPy array or SciPy sparse matrix with the same
            number of columns, one row per document.
        relevance: Relevance judgments, one entry per query row and in the
            same order.  Each entry is an iterable of row positions in
            ``doc_vectors`` that are relevant to that query.

    Returns:
        The MAP as a float in ``[0, 1]``.  Queries whose entry in
        ``relevance`` is empty are skipped because average precision is
        undefined for them.

    Raises:
        ValueError: If ``relevance`` is omitted, if its length differs from
            the number of query rows, or if no query has a relevant document.
    """
    import numpy as np

    if relevance is None:
        # MAP cannot be derived from the vectors alone; fail loudly instead
        # of returning None and breaking the caller's comparison.
        raise ValueError(
            "calculate_map needs relevance judgments: pass relevance=<one "
            "iterable of relevant document row positions per query>"
        )

    judgments = [set(relevant) for relevant in relevance]

    scores = query_vectors @ doc_vectors.T
    if hasattr(scores, "toarray"):  # SciPy sparse product -> dense array
        scores = scores.toarray()
    scores = np.asarray(scores, dtype=float)

    if len(judgments) != scores.shape[0]:
        raise ValueError(
            f"relevance has {len(judgments)} entries but there are "
            f"{scores.shape[0]} queries"
        )

    average_precisions = []
    for query_scores, relevant in zip(scores, judgments):
        if not relevant:
            continue
        # Stable sort on negated scores: best first, ties in document order.
        ranking = np.argsort(-query_scores, kind="stable")
        is_hit = np.isin(ranking, list(relevant))
        hit_ranks = np.flatnonzero(is_hit) + 1  # 1-based rank of each hit
        # Precision at the k-th hit is k / (rank of that hit).
        precisions = np.arange(1, hit_ranks.size + 1) / hit_ranks
        average_precisions.append(precisions.sum() / len(relevant))

    if not average_precisions:
        raise ValueError("no query has any relevant document")
    return float(np.mean(average_precisions))

# Quality gate: the run is only accepted when MAP reaches at least 0.3.
# NOTE(review): MAP needs relevance judgments, and this script never loads
# any -- confirm where they are supposed to come from.
map_score = calculate_map(X_queries, X_docs)
if map_score is None:
    # Without this guard the comparison below fails with an opaque TypeError.
    raise RuntimeError("calculate_map returned None instead of a MAP score")
# Explicit raise instead of `assert`, which is stripped under `python -O`.
# `not (>=)` rather than `<` so that a NaN score is rejected as well.
if not (map_score >= 0.3):
    raise AssertionError(f"MAP score is too low: {map_score}")

print(f"MAP score: {map_score}")