In [None]:
!pip install faiss-cpu # Install faiss library

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
# prompt: write code to connect to Google Drive

from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook live under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import faiss
import pickle
from sentence_transformers import SentenceTransformer

In [None]:
# Read the Wikipedia dump from the mounted Drive and preview its first rows
# to check the available columns (id, url, title, text).
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/filename.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,url,title,text
0,0,67282749,https://en.wikipedia.org/wiki/Bert%20Meek,Bert Meek,"Bert Bookham Meek (September 2, 1883 - Septemb..."
1,1,67282757,https://en.wikipedia.org/wiki/Pericastor,Pericastor,"In Greek mythology, Pericastor (Ancient Greek:..."
2,2,67282773,https://en.wikipedia.org/wiki/Dead%20Sea%20Museum,Dead Sea Museum,The Dead Sea Museum (Arabic: متحف البحر الميت)...
3,3,67282775,https://en.wikipedia.org/wiki/Massalongia%20rubra,Massalongia rubra,Massalongia rubra is a species of gall midge w...
4,4,67282780,https://en.wikipedia.org/wiki/Federica%20Ferraro,Federica Ferraro,Federica Ferraro (born 18 August 1988) is an I...


In [None]:
# Source CSV on the mounted Drive; the indexing cell reads its "text" column.
csv_file = "/content/drive/MyDrive/filename.csv"
# Cache file for the pickled (FAISS index, DataFrame) pair. This is a relative
# path, so it resolves against the notebook's current working directory.
faiss_index_file = "faiss_index.pkl"
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Small, efficient Hugging Face model

In [None]:
# Run the embedding model on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer(embedding_model_name, device=device)

# Number of leading CSV rows to embed and index. Every message below is derived
# from the real row count, so the log can no longer disagree with the code.
max_rows = 10_000

# Load or Create FAISS Index
try:
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook wrote itself; never one from an untrusted source.
    with open(faiss_index_file, "rb") as f:
        faiss_index, df = pickle.load(f)
    print(f"Loaded precomputed FAISS index for {faiss_index.ntotal} rows.")
except FileNotFoundError:
    print("FAISS index not found. Creating a new one...")

    # Load only the rows that will be indexed instead of the whole file.
    df = pd.read_csv(csv_file, nrows=max_rows)

    # astype(str) guarantees the encoder receives strings (missing values
    # become the literal text "nan").
    movie_texts = df["text"].astype(str).tolist()
    if not movie_texts:
        raise ValueError(f"No rows found in {csv_file!r}; nothing to index.")

    # Encode in batches with a single progress bar. This is much faster than
    # one encode() call per document and does not flood the cell output.
    embeddings = model.encode(
        movie_texts,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    # FAISS expects a C-contiguous float32 matrix of shape (n_rows, dim).
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # Create FAISS Index
    d = embeddings.shape[1]  # Embedding dimension
    faiss_index = faiss.IndexFlatL2(d)  # Exact L2 (Euclidean) nearest-neighbour search
    faiss_index.add(embeddings)

    # Save the index together with the matching DataFrame so that row positions
    # returned by FAISS keep pointing at the right articles after a reload.
    with open(faiss_index_file, "wb") as f:
        pickle.dump((faiss_index, df), f)

    print(f"FAISS index for {faiss_index.ntotal} rows saved.")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Encoding document 5002/50000...
Encoding document 5003/50000...
Encoding document 5004/50000...
Encoding document 5005/50000...
Encoding document 5006/50000...
Encoding document 5007/50000...
Encoding document 5008/50000...
Encoding document 5009/50000...
Encoding document 5010/50000...
Encoding document 5011/50000...
Encoding document 5012/50000...
Encoding document 5013/50000...
Encoding document 5014/50000...
Encoding document 5015/50000...
Encoding document 5016/50000...
Encoding document 5017/50000...
Encoding document 5018/50000...
Encoding document 5019/50000...
Encoding document 5020/50000...
Encoding document 5021/50000...
Encoding document 5022/50000...
Encoding document 5023/50000...
Encoding document 5024/50000...
Encoding document 5025/50000...
Encoding document 5026/50000...
Encoding document 5027/50000...
Encoding document 5028/50000...
Encoding document 5029/50000...
Encoding document 5030/50000...
Encodin

In [None]:
# ** Query Function **
def search_movies(query, top_k=5):
    """Return the rows of ``df`` whose embeddings are nearest to ``query``.

    Relies on the notebook-level globals ``model`` (sentence encoder),
    ``faiss_index`` (index built over the rows of ``df``) and ``df``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to return.

    Returns:
        A DataFrame with the ``title`` and ``text`` columns of the matching
        rows, ordered from nearest to farthest. It has fewer than ``top_k``
        rows when the index holds fewer entries, and is empty when
        ``top_k <= 0`` or the index is empty.
    """
    # Never request more neighbours than the index holds: FAISS pads missing
    # results with the label -1, which ``iloc`` would silently read as
    # "last row" and return as a bogus hit.
    k = min(int(top_k), faiss_index.ntotal)
    if k <= 0:
        return df.iloc[[]][["title", "text"]]

    # Encode the query as a (1, dim) float32 matrix, the layout FAISS expects.
    query_embedding = model.encode([query], convert_to_numpy=True)
    query_embedding = np.ascontiguousarray(query_embedding, dtype="float32")

    # Retrieve nearest neighbours; the distances are not needed here.
    _, indices = faiss_index.search(query_embedding, k)

    # Defensive: drop any remaining -1 padding before positional lookup.
    hits = indices[0]
    hits = hits[hits >= 0]

    return df.iloc[hits][["title", "text"]]

# ** Run a Test Query **


In [None]:
query = "I'm trying to remember the name of this guy who's deeply involved with the LDS Church and also co-founded a major company. He served as a missionary in the Netherlands, earned a law degree from BYU, and co-founded Nu Skin Enterprises, where he was CEO. In 2020, he became the Young Men General President for the church. Who is he?"

# Retrieve the ten nearest articles for the test query.
results = search_movies(query, top_k=10)

# Take the count from the result itself so the header always matches the
# number of rows printed (it previously said "Top-5" for a top_k=10 search).
print(f"\nTop-{len(results)} Similar Wikipedia Movies:")
print(results)


Top-5 Similar Wikipedia Movies:
                                        title  \
95                             Steven J. Lund   
3706                       Camille N. Johnson   
9401                          John J. Nichols   
4907                         Richard G. Moore   
1509                        Morgan Lyon Cotti   
8253                            Andrew Gebara   
354                          Peter J. Lambert   
1873                     Jørgen Watne Frydnes   
6930                           John T. Wilcox   
8912  List of Colorado School of Mines people   

                                                   text  
95    Steven J. Lund (born October 30, 1953) is an A...  
3706  Camille Neddo Johnson has been the 14th Primar...  
9401  John Joseph Nichols is a United States Air For...  
4907  Richard Garner Moore Jr. is a United States Ai...  
1509  Morgan Lyon Cotti is an American political sci...  
8253  Andrew J. Gebara is a United States Air Force ...  
354   Peter J. Lamber