In [2]:
pip install pandas sentence-transformers faiss-cpu transformers torch


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB

In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

# -----------------------
# STEP 1: Load EPL Data
# -----------------------
def load_epl_data(csv_path):
    """Read match results from a CSV file and describe each home fixture in one sentence.

    Only rows whose ``venue`` column equals ``'Home'`` are kept; every
    remaining row becomes a sentence naming the team, the opponent, the
    date, and the score (``gf`` - ``ga``).

    Args:
        csv_path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        list[str]: One summary sentence per home row, in file order.

    Raises:
        ValueError: If any of the columns ``date``, ``team``, ``opponent``,
            ``gf``, ``ga`` or ``venue`` is absent. The message lists the
            missing column names.
    """
    required_columns = ['date', 'team', 'opponent', 'gf', 'ga', 'venue']
    df = pd.read_csv(csv_path)

    # Report exactly which columns are absent so the caller can fix the file.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise ValueError(f"Dataset is missing required columns: {', '.join(missing)}.")

    home = df.loc[df['venue'] == 'Home', ['team', 'opponent', 'date', 'gf', 'ga']]

    # Plain tuples avoid building a Series object for every row.
    return [
        f"{team} played against {opponent} on {date}. The final score was {gf} - {ga}."
        for team, opponent, date, gf, ga in home.itertuples(index=False, name=None)
    ]

# -----------------------
# STEP 2: Embed and Index
# -----------------------
def create_index(summaries, model_name="all-MiniLM-L6-v2"):
    """Embed the summaries and store the vectors in a flat L2 FAISS index.

    Args:
        summaries: Sequence of text snippets to embed; must not be empty.
        model_name: Name of the SentenceTransformer model used for encoding.

    Returns:
        tuple: ``(index, model)`` where ``index`` is a ``faiss.IndexFlatL2``
        holding one vector per summary (same order as ``summaries``) and
        ``model`` is the loaded SentenceTransformer, returned so queries can
        be encoded with the same model.

    Raises:
        ValueError: If ``summaries`` is empty.
    """
    # Fail early, before loading a model, instead of hitting an IndexError
    # on `embeddings[0]` later.
    if len(summaries) == 0:
        raise ValueError("Cannot build an index from an empty list of summaries.")

    model = SentenceTransformer(model_name)
    # FAISS expects a C-contiguous float32 matrix of shape (n, dim).
    embeddings = np.ascontiguousarray(model.encode(summaries), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, model

# -----------------------
# STEP 3: Load Open Hugging Face LLM
# -----------------------
def load_llm(model_name="tiiuae/falcon-7b-instruct"):
    """Load a causal language model and wrap it in a text-generation pipeline.

    The weights are loaded in float16 when CUDA is available and float32
    otherwise, with ``device_map="auto"`` deciding device placement.

    Args:
        model_name: Hugging Face model identifier to load. Defaults to the
            Falcon 7B instruct checkpoint.

    Returns:
        A ``transformers`` text-generation pipeline that samples up to 300
        new tokens at temperature 0.7.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto",
    )
    # The model is already placed on its devices by `from_pretrained`, so no
    # `device_map` is passed to `pipeline` a second time.
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=300,
        # `temperature` only takes effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
    )

# -----------------------
# STEP 4: Query Function
# -----------------------
def query_rag(question, index, summaries, embed_model, llm_pipe, k=3):
    """Answer a question using the nearest match summaries as context.

    The question is embedded, the ``k`` closest summaries are looked up in
    ``index``, and a prompt containing those summaries and the question is
    sent to ``llm_pipe``.

    Args:
        question: The user's question.
        index: Object exposing ``search(vectors, k=...)`` that returns
            ``(distances, ids)``, e.g. the FAISS index built from ``summaries``.
        summaries: Texts in the same order as the vectors stored in ``index``.
        embed_model: Object exposing ``encode(list_of_texts)``.
        llm_pipe: Callable taking a prompt and returning a list whose first
            item is a dict with a ``"generated_text"`` entry.
        k: Number of summaries to retrieve. Defaults to 3.

    Returns:
        str: The generated answer with surrounding whitespace removed.
    """
    q_vec = np.asarray(embed_model.encode([question]), dtype=np.float32)
    _, neighbour_ids = index.search(q_vec, k=k)

    # FAISS fills missing results with -1 when the index holds fewer than k
    # vectors; skip those so -1 does not alias the last summary.
    hits = [summaries[i] for i in neighbour_ids[0] if 0 <= i < len(summaries)]
    context = "\n".join(hits)

    prompt = f"""You are a football expert assistant. Use the following match summaries to answer the question.

Context:
{context}

Question:
{question}

Answer:"""
    response = llm_pipe(prompt)[0]["generated_text"]

    # When the output echoes the prompt, drop exactly that prefix so an
    # "Answer:" occurring inside the reply does not truncate it.
    if response.startswith(prompt):
        return response[len(prompt):].strip()
    # Otherwise fall back to the text after the last 'Answer:' marker.
    return response.split("Answer:")[-1].strip()

# -----------------------
# STEP 5: Main App
# -----------------------
def main(csv_path="dataset/matches.csv"):
    """Run the interactive EPL question-answering loop.

    Loads the match summaries, builds the embedding index, loads the LLM,
    then repeatedly prompts for a question and prints the answer until the
    user types ``exit`` (any letter case), input ends, or the session is
    interrupted.

    Args:
        csv_path: Path to the matches CSV; update it to point at your data.
    """
    summaries = load_epl_data(csv_path)
    index, embed_model = create_index(summaries)
    llm_pipe = load_llm()
    print("\n✅ Ready to answer EPL questions!")

    while True:
        try:
            question = input("\n❓ Your question (or 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupt ends the session without a traceback.
            break
        if not question:
            # Nothing typed: ask again rather than sending an empty prompt to the LLM.
            continue
        if question.lower() == "exit":
            break
        answer = query_rag(question, index, summaries, embed_model, llm_pipe)
        print(f"\n🧠 Answer: {answer}")

# Start the interactive app only when this code runs as the top-level
# program, not when it is imported from another module.
if __name__ == "__main__":
    main()