In [2]:
from google.colab import drive

In [3]:
# Mount Google Drive at /content/drive so that the files under
# /content/drive/MyDrive/FYP used by later cells (corpus CSV, pickled BM25
# index, query file) are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pandas as pd
import nltk
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
import json

In [17]:
# Download the tokenizer data used by word_tokenize. Recent NLTK releases load
# the Punkt models from the "punkt_tab" package rather than "punkt", so fetch
# both here; nltk.download skips packages that are already up to date.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
# File paths on the mounted Google Drive.
csv_file = '/content/drive/MyDrive/FYP/cleaned_corpus.csv'  # Wikipedia movie dataset; read only when the index has to be built
bm25_index_file = '/content/drive/MyDrive/FYP/bm25_index.pkl'  # Pickle holding the (bm25, df) tuple

In [23]:
# BM25 parameters. They are passed to BM25Okapi only when a new index is
# built; an index loaded from bm25_index_file is used as pickled, so changing
# these values has no effect on it.
k1 = 1.0  # Term frequency impact
b = 1.0   # Length normalization

In [24]:
# Load the cached BM25 index, or build and cache it when no pickle exists yet.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only point bm25_index_file at a file this notebook wrote itself.
try:
    with open(bm25_index_file, "rb") as f:
        bm25, df = pickle.load(f)
except FileNotFoundError:
    print("BM25 index not found. Creating a new one...")
    df = pd.read_csv(csv_file)
    # Lower-case and tokenize every document once; the tokens are kept in the
    # frame so they are pickled together with the index.
    df["tokenized_text"] = df["clean_text"].astype(str).apply(lambda x: word_tokenize(x.lower()))
    bm25 = BM25Okapi(df["tokenized_text"].tolist(), k1=k1, b=b)
    with open(bm25_index_file, "wb") as f:
        pickle.dump((bm25, df), f)
    print("BM25 index saved.")
else:
    print("Loaded BM25 index from file.")
    # A cached index keeps the k1/b it was built with. Surface a mismatch
    # instead of silently evaluating with parameters other than those above.
    _cached_k1 = getattr(bm25, "k1", None)
    _cached_b = getattr(bm25, "b", None)
    if None not in (_cached_k1, _cached_b) and (_cached_k1, _cached_b) != (k1, b):
        print(
            f"Warning: cached index was built with k1={_cached_k1}, b={_cached_b}, "
            f"but k1={k1}, b={b} are configured. "
            f"Delete {bm25_index_file} to rebuild with the configured parameters."
        )


Loaded BM25 index from file.


In [32]:
import pandas as pd
import nltk
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
import json
# NOTE(review): this cell repeats the tokenizer download, path/parameter
# configuration and index loading done by the cells above; its only addition
# is the "punkt_tab" download.

# Download tokenizer data if not available ("punkt_tab" is the package recent
# NLTK releases load the Punkt models from).
nltk.download("punkt")
nltk.download("punkt_tab")

# File paths
csv_file = '/content/drive/MyDrive/FYP/cleaned_corpus.csv'  # Wikipedia movie dataset
bm25_index_file = '/content/drive/MyDrive/FYP/bm25_index.pkl'  # BM25 saved index

# BM25 Parameters (only used when a new index is built)
k1 = 1.0  # Term frequency impact
b = 1.0   # Length normalization

# Load the cached BM25 index, or build and cache it when no pickle exists yet.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only point bm25_index_file at a file this notebook wrote itself.
try:
    with open(bm25_index_file, "rb") as f:
        bm25, df = pickle.load(f)
except FileNotFoundError:
    print("BM25 index not found. Creating a new one...")
    df = pd.read_csv(csv_file)
    df["tokenized_text"] = df["clean_text"].astype(str).apply(lambda x: word_tokenize(x.lower()))
    bm25 = BM25Okapi(df["tokenized_text"].tolist(), k1=k1, b=b)
    with open(bm25_index_file, "wb") as f:
        pickle.dump((bm25, df), f)
    print("BM25 index saved.")
else:
    print("Loaded BM25 index from file.")
    # A cached index keeps the k1/b it was built with. Surface a mismatch
    # instead of silently evaluating with parameters other than those above.
    _cached_k1 = getattr(bm25, "k1", None)
    _cached_b = getattr(bm25, "b", None)
    if None not in (_cached_k1, _cached_b) and (_cached_k1, _cached_b) != (k1, b):
        print(
            f"Warning: cached index was built with k1={_cached_k1}, b={_cached_b}, "
            f"but k1={k1}, b={b} are configured. "
            f"Delete {bm25_index_file} to rebuild with the configured parameters."
        )

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Loaded BM25 index from file.


In [33]:
# Search function
def search_wikipedia(query, top_k=5):
    """Retrieve the top-K corpus rows for a query using BM25.

    Args:
        query: Free-text query; it is lower-cased and tokenized with
            ``word_tokenize`` before being scored against the index.
        top_k: Maximum number of rows to return. Negative values are treated
            as 0 (no rows); ``None`` returns every row.

    Returns:
        A tuple ``(results, top_indices)``. ``results`` is a copy of the
        "title" and "clean_text" columns of the selected ``df`` rows with an
        added "bm25_score" column, ordered by descending score.
        ``top_indices`` is the list of positional row indices into ``df``.
    """
    query_tokens = word_tokenize(query.lower())
    scores = np.asarray(bm25.get_scores(query_tokens))

    # A negative slice bound would mean "all but the last N", which is never
    # what a top-K request intends.
    limit = top_k if top_k is None else max(top_k, 0)

    # Stable argsort on the negated scores ranks by descending score and keeps
    # corpus order among ties (the same order a stable reverse sort gives),
    # without a Python-level key call for every document.
    top_indices = np.argsort(-scores, kind="stable")[:limit].tolist()

    results = df.iloc[top_indices][["title", "clean_text"]].copy()
    results["bm25_score"] = [scores[i] for i in top_indices]
    return results, top_indices

In [34]:
# Evaluation metrics
def discounted_cumulative_gain(relevance_scores):
    """Return the DCG of a ranked list of relevance grades.

    The grade at 0-based position ``i`` is divided by ``log2(i + 2)`` and the
    discounted grades are summed with ``np.sum``.
    """
    discounted = []
    # Start counting at 2 so the first result is divided by log2(2) == 1.
    for position, grade in enumerate(relevance_scores, start=2):
        discounted.append(grade / np.log2(position))
    return np.sum(discounted)

def normalized_dcg(relevance_scores):
    """Return DCG divided by the DCG of the same grades sorted best-first.

    The ideal ordering is derived from ``relevance_scores`` itself. When the
    ideal DCG is not positive the function returns 0.
    """
    best_order = list(relevance_scores)
    best_order.sort(reverse=True)

    best_possible = discounted_cumulative_gain(best_order)
    achieved = discounted_cumulative_gain(relevance_scores)

    if best_possible > 0:
        return achieved / best_possible
    return 0

In [35]:
def reciprocal_rank(relevance_scores):
    """Return 1 / rank of the first entry greater than 0, or 0 if there is none.

    Ranks are 1-based, so a hit in the first position yields 1.0.
    """
    hits = (
        1 / rank
        for rank, rel in enumerate(relevance_scores, start=1)
        if rel > 0
    )
    # The generator is lazy, so only the first hit is ever evaluated.
    return next(hits, 0)

In [36]:
def success_at_k(relevance_scores, k):
    """Return 1 when any of the first ``k`` relevance scores is truthy, else 0."""
    head = relevance_scores[:k]
    if any(head):
        return 1
    return 0

In [37]:
def evaluate_results(query, relevant_titles, top_k=5):
    """Run a BM25 search for ``query`` and score the ranking against ground truth.

    Args:
        query: Free-text query passed to ``search_wikipedia``.
        relevant_titles: Ground-truth title(s); a list of titles, or a single
            title string.
        top_k: Number of results to retrieve.

    Returns:
        Tuple ``(dcg, ndcg, rr, success_at_3)`` computed from a binary
        relevance list (1 where a retrieved title matches a ground-truth
        title, else 0).
    """
    def _normalize_title(title):
        # Compare titles ignoring case and surrounding whitespace so that
        # trivial formatting differences between the query file and the
        # corpus are not counted as misses.
        return str(title).strip().casefold()

    # A bare string would otherwise be matched character by character.
    if isinstance(relevant_titles, str):
        relevant_titles = [relevant_titles]
    relevant = {_normalize_title(title) for title in relevant_titles}

    results, _ = search_wikipedia(query, top_k)

    # Generate relevance scores (1 if title matches ground truth, else 0)
    relevance_scores = [
        1 if _normalize_title(title) in relevant else 0
        for title in results["title"]
    ]

    # Compute evaluation metrics
    dcg_value = discounted_cumulative_gain(relevance_scores)
    ndcg_value = normalized_dcg(relevance_scores)
    rr_value = reciprocal_rank(relevance_scores)
    success_3 = success_at_k(relevance_scores, 3)

    return dcg_value, ndcg_value, rr_value, success_3

In [38]:

def evaluate_test_set(test_file, top_k=5):
    """Evaluate every query in a JSONL test file and print the averaged metrics.

    Each non-blank line of ``test_file`` must be a JSON object; its "text"
    value is used as the query and its "title" value as the single relevant
    title.

    Args:
        test_file: Path to the JSONL file of test queries.
        top_k: Number of results retrieved per query.

    Returns:
        None. The average DCG, nDCG, reciprocal rank and Success@3 are printed.
    """
    dcg_values = []
    ndcg_values = []
    rr_values = []
    success_3_values = []

    # Read as UTF-8 explicitly rather than relying on the platform default.
    with open(test_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # Tolerate blank lines, e.g. a trailing newline
            test_case = json.loads(line)
            query = test_case["text"]  # Use "text" from JSONL
            relevant_titles = [test_case["title"]]  # Use "title" from JSONL

            dcg, ndcg, rr, success_3 = evaluate_results(query, relevant_titles, top_k)

            dcg_values.append(dcg)
            ndcg_values.append(ndcg)
            rr_values.append(rr)
            success_3_values.append(success_3)

    # np.mean of an empty list is nan (with a warning); report it plainly.
    if not dcg_values:
        print("No queries found in the test file; nothing to evaluate.")
        return

    # Print final results
    print(f"Average DCG: {np.mean(dcg_values):.4f}")
    print(f"Average nDCG: {np.mean(ndcg_values):.4f}")
    print(f"Average Reciprocal Rank (RR): {np.mean(rr_values):.4f}")
    print(f"Average Success@3: {np.mean(success_3_values):.4f}")

# Run the evaluation over every query in the JSONL test file and print the
# averaged metrics.
test_file = '/content/drive/MyDrive/FYP/queries.jsonl'  # JSONL file, one JSON query object per line
evaluate_test_set(test_file, top_k=5)

Average DCG: 0.0000
Average nDCG: 0.0000
Average Reciprocal Rank (RR): 0.0000
Average Success@3: 0.0000


In [52]:
import json
import numpy as np
import pandas as pd
import nltk
import pickle
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi

# Load the pickled (bm25, df) tuple. Unlike the load-or-create cell earlier in
# the notebook there is no fallback here: a missing file raises.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load an index file this notebook wrote itself.
bm25_index_file = '/content/drive/MyDrive/FYP/bm25_index.pkl'
with open(bm25_index_file, "rb") as f:
    bm25, df = pickle.load(f)

def retrieve_top_k(query, top_k=5):
    """Return the titles of the ``top_k`` highest-scoring documents for ``query``.

    The query is lower-cased and tokenized with ``word_tokenize``, scored
    against the BM25 index, and the titles of the best-scoring ``df`` rows are
    returned as a list, best first.
    """
    tokens = word_tokenize(query.lower())

    # argsort is ascending, so reverse it to rank from highest score down.
    ranking = np.argsort(bm25.get_scores(tokens))[::-1]
    best_rows = df.iloc[ranking[:top_k]]

    return best_rows['title'].tolist()

def evaluate_results(query, relevant_titles, top_k):
    """Retrieve results for ``query`` and compute ranking metrics.

    Args:
        query: Free-text query passed to ``retrieve_top_k``.
        relevant_titles: Ground-truth title(s); a list of titles, or a single
            title string.
        top_k: Number of titles to retrieve.

    Returns:
        Tuple ``(dcg, ndcg, rr, success_3, retrieved_titles)``. The metrics are
        computed from a binary relevance list (1 where a retrieved title
        matches a ground-truth title, else 0):
          * dcg: sum of ``rel / log2(rank + 1)`` over 1-based ranks.
          * ndcg: dcg divided by the DCG of the same relevance list sorted
            best-first; 0.0 when nothing relevant was retrieved.
          * rr: ``1 / rank`` of the first relevant title, 0.0 if none.
          * success_3: 1 if a relevant title is in the first three, else 0.
    """
    retrieved_titles = retrieve_top_k(query, top_k)  # Get retrieved titles

    def _normalize_title(title):
        # Compare titles ignoring case and surrounding whitespace so that
        # trivial formatting differences are not counted as misses.
        return str(title).strip().casefold()

    # A bare string would otherwise be matched character by character.
    if isinstance(relevant_titles, str):
        relevant_titles = [relevant_titles]
    relevant = {_normalize_title(title) for title in relevant_titles}
    relevance = [1 if _normalize_title(title) in relevant else 0 for title in retrieved_titles]

    # Evaluation functions (deterministic; they operate on the relevance list)
    def compute_dcg(rels):
        return float(sum(rel / np.log2(rank + 2) for rank, rel in enumerate(rels)))

    def compute_ndcg(rels):
        ideal_dcg = compute_dcg(sorted(rels, reverse=True))
        return compute_dcg(rels) / ideal_dcg if ideal_dcg > 0 else 0.0

    def compute_rr(rels):
        return next((1 / rank for rank, rel in enumerate(rels, start=1) if rel > 0), 0.0)

    def compute_success_at_k(rels, k):
        return 1 if any(rels[:k]) else 0

    dcg = compute_dcg(relevance)
    ndcg = compute_ndcg(relevance)
    rr = compute_rr(relevance)
    success_3 = compute_success_at_k(relevance, 3)

    return dcg, ndcg, rr, success_3, retrieved_titles

def evaluate_test_set(test_file, top_k=5):
    """Evaluate every query in a JSONL test file, printing per-query and final results.

    Each non-blank line of ``test_file`` must be a JSON object; its "text"
    value is used as the query and its "title" value as the single relevant
    title.

    Args:
        test_file: Path to the JSONL file of test queries.
        top_k: Number of titles retrieved per query.

    Returns:
        None. Retrieved titles and metrics are printed for each query, followed
        by the averages over all queries.
    """
    dcg_values, ndcg_values, rr_values, success_3_values = [], [], [], []

    # Read as UTF-8 explicitly rather than relying on the platform default,
    # and skip blank lines (e.g. a trailing newline) instead of failing on them.
    with open(test_file, "r", encoding="utf-8") as f:
        queries = [json.loads(line) for line in f if line.strip()]  # Load all queries into memory

    # np.mean of an empty list is nan (with a warning); report it plainly.
    if not queries:
        print("No queries found in the test file; nothing to evaluate.")
        return

    for i, test_case in enumerate(queries):
        query = test_case["text"]
        relevant_titles = [test_case["title"]]

        # Call evaluate_results function
        dcg, ndcg, rr, success_3, retrieved_titles = evaluate_results(query, relevant_titles, top_k)

        dcg_values.append(dcg)
        ndcg_values.append(ndcg)
        rr_values.append(rr)
        success_3_values.append(success_3)

        # Print results for the current query
        print(f"\nQuery {i+1}/{len(queries)}: {query}")
        print(f"Retrieved Titles (Top {top_k}):")
        for j, title in enumerate(retrieved_titles):
            print(f"  {j+1}. {title}")

        print(f"DCG: {dcg:.4f}, nDCG: {ndcg:.4f}, RR: {rr:.4f}, Success@3: {success_3}")

        if (i + 1) % 2 == 0:  # Print progress every 2 queries
            print(f"\nProcessed {i + 1}/{len(queries)} queries...\n")

    # Print final results
    print("\nFinal Evaluation Results:")
    print(f"Average DCG: {np.mean(dcg_values):.4f}")
    print(f"Average nDCG: {np.mean(ndcg_values):.4f}")
    print(f"Average Reciprocal Rank (RR): {np.mean(rr_values):.4f}")
    print(f"Average Success@3: {np.mean(success_3_values):.4f}")

# Run the evaluation over every query in the JSONL test file; per-query
# results and the final averages are printed.
test_file = '/content/drive/MyDrive/FYP/queries.jsonl'
evaluate_test_set(test_file, top_k=5)



Query 1/150: Very rare movie that is scifi/dystopian/experimental/surreal. It’s like Stalker meets el Topo meets Holy Mountain meets Alphaville meets Delicatessen meets Hard to be a God, like Kurosawa, Tarkovsky, and Lynch had a kid together. It was color, possibly Russian, and I don’t really remember the decade but want to say 60s or 70s, though could easily be more recent. It is VERY rare, there is only one crappy partial print of it, and that is what the youtube version is from. Lot of wide shots in a surreal wilderness, winter settings, strange bleeding saturation in some shots. Crazy costumes. Seriously one of the strangest films I’ve ever seen and my favorite films are strange/weird ones. If you’ve ever seen what you’re thinking of on a “best weird movies” or “you’ve never seen this!” list, that’s NOT it. I don’t think this film even has a cult following of ten people. It’s an actual rare gem. Have been looking through selections at 366 Weird Movies and not found it yet (btw the