# Installing required libraries

In [18]:

!pip install -q langchain langchain-community chromadb tiktoken python-dotenv openai pandas


# Importing modules and LLM model

In [19]:
import os

import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Config
# The OpenRouter key is read from the environment so it is never stored in the
# notebook file. Set OPENROUTER_API_KEY before starting the kernel.
# NOTE(review): a key was previously committed in this cell; it should be revoked.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
if not OPENROUTER_API_KEY:
    # Only warn here: loading the CSVs and building the vector store need no key.
    print("WARNING: OPENROUTER_API_KEY is not set; requests to the LLM will fail.")
OPENROUTER_MODEL = "nvidia/llama-3.1-nemotron-ultra-253b-v1:free"  # or another


# Loading and Processing Data


In [22]:
def _opponent_of(match, team):
    """Return the side of an ``'A Vs B'`` fixture that ``team`` played against.

    If ``team`` is the second-listed side the first side is returned; in every
    other case the second side is returned.

    Raises:
        ValueError: if ``match`` does not contain the ``'Vs '`` separator.
    """
    first, separator, second = str(match).partition('Vs ')
    if not separator:
        raise ValueError(f"Cannot parse teams from match {match!r}; expected 'A Vs B'")
    first, second = first.strip(), second.strip()
    if str(team).strip().casefold() == second.casefold():
        return first
    return second


def load_and_process_data_from_csv(data_dir='.'):
    """Build one ``Document`` per row of the four tournament CSV files.

    Args:
        data_dir: Directory holding ``player_info.csv``, ``batting_summary.csv``,
            ``bowling_summary.csv`` and ``match_results.csv``. Defaults to the
            current working directory.

    Returns:
        A list of ``Document`` objects: player rows first, then batting, bowling
        and match-result rows, each group in CSV row order.

    Raises:
        ValueError: if a ``match`` value in the batting or bowling CSV does not
            contain the ``'Vs '`` separator.
    """
    documents = []

    # 1. Player Info
    player_df = pd.read_csv(os.path.join(data_dir, 'player_info.csv'))
    for _, row in player_df.iterrows():
        content = f"{row['name']} ({row['team']}): {row['playingRole']} | Batting: {row['battingStyle']} | Bowling: {row['bowlingStyle']}"
        metadata = {
            "type": "player",
            "name": row['name'],
            "team": row['team'],
            "role": row['playingRole']
        }
        documents.append(Document(page_content=content, metadata=metadata))

    # 2. Batting Summary
    batting_df = pd.read_csv(os.path.join(data_dir, 'batting_summary.csv'))
    for _, row in batting_df.iterrows():
        # The opponent is whichever side of the fixture is NOT the batting team;
        # always taking the second side mislabels rows for the second-listed team.
        opponent = _opponent_of(row['match'], row['teamInnings'])
        content = f"{row['batsmanName']} scored {row['runs']} runs ({row['balls']} balls, SR: {row['SR']}) for {row['teamInnings']} vs {opponent}"
        metadata = {
            "type": "batting",
            "player": row['batsmanName'],
            "match": row['match'],
            "runs": int(row['runs']),
            "team": row['teamInnings']
        }
        documents.append(Document(page_content=content, metadata=metadata))

    # 3. Bowling Summary
    bowling_df = pd.read_csv(os.path.join(data_dir, 'bowling_summary.csv'))
    for _, row in bowling_df.iterrows():
        opponent = _opponent_of(row['match'], row['bowlingTeam'])
        content = f"{row['bowlerName']} took {row['wickets']} wickets for {row['runs']} runs ({row['overs']} overs) for {row['bowlingTeam']} vs {opponent}"
        metadata = {
            "type": "bowling",
            "player": row['bowlerName'],
            "match": row['match'],
            "wickets": int(row['wickets']),
            "economy": float(row['economy']),
            "team": row['bowlingTeam']
        }
        documents.append(Document(page_content=content, metadata=metadata))

    # 4. Match Results
    results_df = pd.read_csv(os.path.join(data_dir, 'match_results.csv'))
    for _, row in results_df.iterrows():
        content = f"{row['team1']} vs {row['team2']}: {row['winner']} won by {row['margin']} on {row['matchDate']} at {row['ground']}"
        metadata = {
            "type": "match",
            "team1": row['team1'],
            "team2": row['team2'],
            "winner": row['winner'],
            "date": row['matchDate'],
            "ground": row['ground']
        }
        documents.append(Document(page_content=content, metadata=metadata))

    return documents


# Prepare and Vectorize Documents

In [23]:
# Load & split
documents = load_and_process_data_from_csv()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(documents)

# Create embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Stable, position-based ids make this cell idempotent: without explicit ids every
# re-run adds a complete duplicate set of documents to the persisted collection,
# and retrieval then returns the same fact several times.
# NOTE(review): relies on Chroma overwriting entries whose id already exists; a
# ./t20_wc_db created by earlier runs still holds the old random-id copies and
# should be deleted once.
split_ids = [f"t20wc-{position}" for position in range(len(splits))]

# Store in Chroma
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=split_ids,
    persist_directory="./t20_wc_db"
)


# Retriever

In [32]:
# MMR first fetches `fetch_k` nearest candidates and then picks `k` diverse ones
# from that pool. With the default fetch_k (20) a k of 50 can never be honoured,
# so the candidate pool is sized explicitly to be larger than k.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 50, "fetch_k": 200}
)


# Prompting and RAG CHAIN

In [33]:
# Chat model reached through OpenRouter's endpoint, configured from the
# OPENROUTER_* constants defined in the config cell.
llm_settings = dict(
    model_name=OPENROUTER_MODEL,
    temperature=0.3,
    max_tokens=1000,
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)
llm = ChatOpenAI(**llm_settings)

# Prompt text with two placeholders: {context} (retrieved documents) and
# {question} (the user's query).
RAG_TEMPLATE = """You are a cricket expert analyzing the 2022 T20 World Cup.
Use only the context below to answer:

{context}

Question: {question}

Answer with these rules:
1. Be specific with numbers (e.g., "scored 82 runs in 53 balls")
2. Mention teams and dates when available, if not available, say "not mentioned"
3. If unsure, say "I couldn't find that in the tournament data"
4. For player comparisons, show stats side-by-side"""

prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# Final RAG Chain
# The incoming question is fanned out: the retriever turns it into context, while
# RunnablePassthrough forwards it unchanged; both feed the prompt, then the LLM,
# and the reply is reduced to a plain string.
chain_inputs = {"context": retriever, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()


# ChatBot

In [35]:
def chat():
    """Run an interactive question/answer loop against ``rag_chain``.

    Reads questions from standard input until the user types ``quit`` (or
    ``exit``), the input stream ends, or the user interrupts the prompt.
    Blank lines are ignored. Errors raised while answering a question are
    printed and the loop continues with the next question.
    """
    print("🏏 T20 World Cup 2022 Chatbot (type 'quit' to exit)\n")
    while True:
        try:
            user_input = input("Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # A closed stdin would otherwise raise on every iteration and, if
            # swallowed by a broad handler, spin forever printing errors.
            print()
            break

        if not user_input:
            # Nothing to ask; do not spend an LLM call on an empty query.
            continue
        if user_input.lower() in ('quit', 'exit'):
            break

        try:
            response = rag_chain.invoke(user_input)
        except Exception as e:
            # Best effort: report the failure and keep the session alive.
            print(f"Error: {str(e)}")
            continue
        print(f"\nBot: {response}\n")

# Start the interactive session; the loop inside chat() runs until 'quit' is typed.
chat()


🏏 T20 World Cup 2022 Chatbot (type 'quit' to exit)

Your question: how many matches india won in the tournamnent

Bot: Based on the provided context, India won the following matches in the 2022 T20 World Cup:

1. **India vs Pakistan** on **Oct 23, 2022** (won by 4 wickets)
2. **India vs Netherlands** on **Oct 27, 2022** (won by 56 runs)
3. **Bangladesh vs India** on **Nov 2, 2022** (won by 5 runs)
4. **India vs Zimbabwe** on **Nov 6, 2022** (won by 71 runs)

**Total matches won by India: 4**

Your question: how much virat scored in each match

Bot: Based on the provided context, here's how much Virat Kohli scored in each match:

1. **India vs Bangladesh**: Scored 64 runs in 44 balls (SR: 145.45), Team: India, Date: Not mentioned
2. **India vs Zimbabwe**: Scored 26 runs in 25 balls (SR: 104.00), Team: India, Date: Not mentioned
3. **Pakistan vs India**: Scored 82 runs in 53 balls (SR: 154.71), Team: India, Date: Not mentioned
4. **India vs South Africa**: Scored 12 runs in 11 balls (SR: