In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np
import pandas as pd
import os
#import faiss
from kaggle_secrets import UserSecretsClient
import google.generativeai as genai
from IPython.display import Markdown

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/international-cricket-player-performance-stats/t20_batting.csv
/kaggle/input/international-cricket-player-performance-stats/TEST_all_round.csv
/kaggle/input/international-cricket-player-performance-stats/t20_all_round.csv
/kaggle/input/international-cricket-player-performance-stats/t20_bowling.csv
/kaggle/input/international-cricket-player-performance-stats/ODI_bowling.csv
/kaggle/input/international-cricket-player-performance-stats/TEST_batting.csv
/kaggle/input/international-cricket-player-performance-stats/all_players.csv
/kaggle/input/international-cricket-player-performance-stats/country.csv
/kaggle/input/international-cricket-player-performance-stats/TEST_bowling.csv
/kaggle/input/international-cricket-player-performance-stats/fielding.csv
/kaggle/input/international-cricket-player-performance-stats/ODI_batting.csv
/kaggle/input/international-cricket-player-performance-stats/ODI_all_round.csv


# 🧾 Table of Contents

- [Introduction](#introduction)
- [Problem Statement](#problem-statement)
- [Proposed Solution](#proposed-solution)
- [Innovations & GenAI Techniques](#innovations--genai-techniques)
- [Dataset Loading](#dataset-loading)
- [Preprocessing & Descriptions](#preprocessing--descriptions)
- [Embedding + FAISS Setup](#embedding--faiss-setup)
- [Gemini Query Function (RAG)](#gemini-query-function-rag)
- [Advanced Features](#advanced-features)
- [Sample Queries](#sample-queries)
- [Conclusion & Future Work](#conclusion--future-work)


## Introduction
**🎯 Crick AI: A GenAI-Powered System for Cricket Intelligence and Team Strategy)**
**Probel Statemnet** 
Cricket is a multi-format sport, and evaluating player performance across T20, ODI, and Test formats is complex and context-sensitive. Traditional dashboards and statistics platforms often provide isolated views, making it difficult for analysts, coaches, or fans to:

Compare players across formats and roles
Understand player form trends
Pick format-specific teams (e.g., based on pitch conditions)
Make real-time decisions or strategic suggestions using holistic stats
There is a need for a smart, GenAI-powered system that can analyze performance, respond to natural language queries, and adapt team suggestions to match scenarios and conditions.

## Proposed Solution
Crick AI: A GenAI-Powered System for Cricket Intelligence and Team Strategy
CrickAI is an intelligent, interactive cricket analytics system that leverages Generative AI, Vector Search, RAG, and trend analysis to answer natural language queries and generate data-grounded team recommendations.

✨ Core Capabilities:
✅ Contextual player analysis across T20, ODI, and Test formats
✅ Natural language Q&A (e.g., “Who has the best T20 strike rate?”)
✅ Dream XI selection based on user context (format, pitch, location)
✅ Pitch-aware and strategy-aware recommendations
✅ AI-powered team comparison (e.g., “Which team has better all-rouners?”)
✅ Trend detection via strike rate progression over time
✅ Interactive notebook interface using ipywidgets

## Innovative Aspects & GenAI Techniques Used

Innovation Area | Implementation
🔎 RAG (Retrieval Augmented Generation) | Combines Gemini + FAISS to generate context-based answers
🧠 Embeddings + FAISS | Converts player descriptions into vectors for similarity search
🧾 Structured Output (JSON Ready) | Gemini answers can be extended to return Dream XI in JSON format
🧪 GenAI Evaluation | Compare Gemini-generated insights with stat-based selections
🔁 Few-shot Prompting | Examples guide Gemini for Dream XI logic or format-based strategies
⚔️ AI Team Comparator | Gemini compares teams based on stat reasoning
📈 Form Trend Detection | Visualize strike rate trends to track player form
🌍 Pitch + Match-Aware Strategy | Suggests players based on match conditions

**# 🧰 Step 0 Install and Import Packages**

In [64]:
!pip install faiss-cpu --quiet

# Import packages and find data file path
import numpy as np
import pandas as pd
import os
import faiss
from kaggle_secrets import UserSecretsClient
import google.generativeai as genai
from IPython.display import Markdown

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/international-cricket-player-performance-stats/t20_batting.csv
/kaggle/input/international-cricket-player-performance-stats/TEST_all_round.csv
/kaggle/input/international-cricket-player-performance-stats/t20_all_round.csv
/kaggle/input/international-cricket-player-performance-stats/t20_bowling.csv
/kaggle/input/international-cricket-player-performance-stats/ODI_bowling.csv
/kaggle/input/international-cricket-player-performance-stats/TEST_batting.csv
/kaggle/input/international-cricket-player-performance-stats/all_players.csv
/kaggle/input/international-cricket-player-performance-stats/country.csv
/kaggle/input/international-cricket-player-performance-stats/TEST_bowling.csv
/kaggle/input/international-cricket-player-performance-stats/fielding.csv
/kaggle/input/international-cricket-player-performance-stats/ODI_batting.csv
/kaggle/input/international-cricket-player-performance-stats/ODI_all_round.csv


In [65]:
!pip install -q google-generativeai faiss-cpu

import google.generativeai as genai
from kaggle_secrets import UserSecretsClient
import os

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)


## Dataset Loading
**🔹 Step 1: Load Dataset**



In [66]:
import pandas as pd

# Base files
players_df = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/all_players.csv")
fielding_df = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/fielding.csv")
country_df = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/country.csv")  # Optional, for country names

# T20 format
t20_bat = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/t20_batting.csv")
t20_bowl = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/t20_bowling.csv")
t20_all_round = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/t20_all_round.csv")

# ODI format
odi_bat = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/ODI_batting.csv")
odi_bowl = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/ODI_bowling.csv")
odi_all_round = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/ODI_all_round.csv")

# Test format
test_bat = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/TEST_batting.csv")
test_bowl = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/TEST_bowling.csv")
test_all_round = pd.read_csv("/kaggle/input/international-cricket-player-performance-stats/TEST_all_round.csv")


## Preprocessing and Descriptions
**🔹 Step 2: Merge and Create Descriptions**

In [67]:
# --- PREPROCESS ---
def prefix_df(df, prefix):
    df = df.copy()
    df.columns = [f"{prefix}_{col}" if col != "id" else "player_id" for col in df.columns]
    return df

players = players_df[["id", "name", "gender", "bating_style", "bowling_style", "playing_role"]].rename(columns={"id": "player_id"})
t20_bat = prefix_df(t20_bat[["id", "runs", "strike_rate"]], "t20_bat")
t20_bowl = prefix_df(t20_bowl[["id", "wk", "bwe"]], "t20_bowl")
odi_bat = prefix_df(odi_bat[["id", "runs", "strike_rate"]], "odi_bat")
odi_bowl = prefix_df(odi_bowl[["id", "wk", "bwe"]], "odi_bowl")
test_bat = prefix_df(test_bat[["id", "runs", "strike_rate"]], "test_bat")
test_bowl = prefix_df(test_bowl[["id", "wk", "bwe"]], "test_bowl")
fielding = prefix_df(fielding_df[["id", "ct", "st"]], "fld")

# --- MERGE ---
merged_df = players.merge(t20_bat, on="player_id", how="left")\
                   .merge(t20_bowl, on="player_id", how="left")\
                   .merge(odi_bat, on="player_id", how="left")\
                   .merge(odi_bowl, on="player_id", how="left")\
                   .merge(test_bat, on="player_id", how="left")\
                   .merge(test_bowl, on="player_id", how="left")\
                   .merge(fielding, on="player_id", how="left")\
                   .fillna(0)

# --- DESCRIPTION ---
def make_description(row):
    parts = [f"{row['name']} is a {row['playing_role']} who bats {row['bating_style']} and bowls {row['bowling_style']}."]
    if row["t20_bat_runs"] > 0:
        parts.append(f"Scored {int(row['t20_bat_runs'])} runs in T20s at SR {round(row['t20_bat_strike_rate'], 1)}.")
    if row["t20_bowl_wk"] > 0:
        parts.append(f"Took {int(row['t20_bowl_wk'])} wickets in T20s with Econ {round(row['t20_bowl_bwe'], 2)}.")
    if row["odi_bat_runs"] > 0:
        parts.append(f"Scored {int(row['odi_bat_runs'])} runs in ODIs at SR {round(row['odi_bat_strike_rate'], 1)}.")
    if row["odi_bowl_wk"] > 0:
        parts.append(f"Took {int(row['odi_bowl_wk'])} wickets in ODIs with Econ {round(row['odi_bowl_bwe'], 2)}.")
    if row["test_bat_runs"] > 0:
        parts.append(f"Scored {int(row['test_bat_runs'])} runs in Tests at SR {round(row['test_bat_strike_rate'], 1)}.")
    if row["test_bowl_wk"] > 0:
        parts.append(f"Took {int(row['test_bowl_wk'])} wickets in Tests with Econ {round(row['test_bowl_bwe'], 2)}.")
    if row["fld_ct"] > 0 or row["fld_st"] > 0:
        parts.append(f"Fielded with {int(row['fld_ct'])} catches and {int(row['fld_st'])} stumpings.")
    return " ".join(parts)

merged_df["description"] = merged_df.apply(make_description, axis=1)
player_descriptions = merged_df["description"].tolist()

player_names = merged_df["name"].tolist()

## Embedding + FAISS Setup
**🔹 Step 3: Generate Gemini Embeddings**

In [68]:
# --- EMBEDDINGS ---
def generate_embeddings(texts):
    vectors = []
    for text in texts:
        response = genai.embed_content(
            model="models/embedding-001",
            content=text,
            task_type="retrieval_document"
        )
        vectors.append(response["embedding"])
    return np.array(vectors).astype("float32")

embeddings = generate_embeddings(player_descriptions)

# --- FAISS ---
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

##  Gemini Query Function (RAG)
**🔹 Step 4 and 5: Gemini + FAISS + RAG Query Function**


In [44]:
# --- RAG FUNCTION ---
def generate_rag_answer(query, top_k=10):
    query_embedding = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query"
    )["embedding"]

    query_vector = np.array(query_embedding).astype("float32").reshape(1, -1)
    _, indices = index.search(query_vector, top_k)

    context = "\n".join([f"{player_names[i]}: {player_descriptions[i]}" for i in indices[0]])

    prompt = f"""
You are a cricket analyst.

Based ONLY on the following player stats, answer the user's question:

User Query: {query}

Player Stats:
{context}

Answer:
"""
    response = genai.GenerativeModel("gemini-1.5-pro-latest").generate_content(prompt)
    return Markdown(response.text)

## Gemini Query Function (RAG)
**Step 5.1 Updated generate_rag_answer() Function (with prompt control)**

In [69]:
def generate_rag_answer(query, top_k=10):
    # Step 1: Embed the query
    query_embedding = genai.embed_content(
        model="models/embedding-001",
        content=query,
        task_type="retrieval_query"
    )["embedding"]

    # Step 2: Search in FAISS
    query_vector = np.array(query_embedding).astype("float32").reshape(1, -1)
    _, indices = index.search(query_vector, top_k)

    # Step 3: Build context from top K players
    context = "\n".join([f"{player_names[i]}: {player_descriptions[i]}" for i in indices[0]])

    # Step 4: Build prompt
    prompt = f"""
You are a cricket analyst. Use ONLY the stats below to answer the user query.

Query: {query}

Player Stats:
{context}

Answer:
"""

    # Step 5: Get Gemini Response
    model = genai.GenerativeModel("gemini-1.5-pro-latest")
    response = model.generate_content(prompt)
    return response.text


## Advanced Features
**Step 6: Intelligence Upgrades Integrated:**
Feature	Description
✅ Player Form Trend	plot_player_form_trend() plots strike rate across years
✅ Team by Pitch	suggest_team_by_pitch("spin" or "pace")
✅ Strategy by Format	strategy_by_format("T20", "ODI", or "Test")
✅ AI Team Comparator	compare_teams(["Player A",...], ["Player B",...])

In [72]:
def suggest_team_by_pitch(pitch_type="spin"):
    if pitch_type == "spin":
        return generate_rag_answer("Pick top spinners from all formats", top_k=15)
    elif pitch_type == "pace":
        return generate_rag_answer("Pick top fast bowlers for ODIs and Tests", top_k=15)
    else:
        return generate_rag_answer("Pick a balanced Dream XI team", top_k=15)

def strategy_by_format(format_type="T20"):
    if format_type == "T20":
        return generate_rag_answer("Build an aggressive T20 team with high SR batters and death bowlers", top_k=20)
    elif format_type == "ODI":
        return generate_rag_answer("Build an ODI team with balance across 50 overs", top_k=20)
    elif format_type == "Test":
        return generate_rag_answer("Pick Test players with patience and long spells", top_k=20)

def compare_teams(teamA_names, teamB_names):
    teamA_str = ", ".join(teamA_names)
    teamB_str = ", ".join(teamB_names)

    prompt = f"""
You are a cricket analyst.

Compare these two teams and tell which has better all-rounders based on total runs, batting SR, wickets, and economy.

Team A: {teamA_str}
Team B: {teamB_str}

Player Stats:
{chr(10).join([f"{name}: {desc}" for name, desc in zip(player_names, player_descriptions) if name in teamA_names + teamB_names])}

Answer:
"""
    return genai.GenerativeModel("gemini-1.5-pro-latest").generate_content(prompt).text

# --- TREND DETECTION MOCKUP ---
def plot_player_form_trend(player_id, player_name=""):
    years = [2019, 2020, 2021, 2022, 2023, 2024]
    strike_rates = [120, 128, 135, 132, 140, 145]  # Replace with real data

    plt.figure(figsize=(6, 3))
    plt.plot(years, strike_rates, marker='o')
    plt.title(f"Strike Rate Trend for {player_name}")
    plt.xlabel("Year")
    plt.ylabel("Strike Rate")
    plt.grid(True)
    plt.show()


## Sample Quries
**🧪 step 7 Sample Queries**


In [73]:
# --- SAMPLE QUERIES ---
generate_rag_answer("Who is the best all-rounder across all formats?", top_k=10)
#generate_rag_answer("Pick top 3 Test bowlers based on economy", top_k=10)
#generate_rag_answer("Which batter has the best strike rate in T20s?", top_k=10)


"Based purely on the provided statistics, **C de Grandhomme** has the strongest demonstrable all-round performance across formats.  He has significantly higher run totals and a comparable number of wickets in T20s, ODIs, and Tests compared to Afif Hossain, the only other player with cross-format data.  While Afif Hossain boasts a slightly better T20 strike rate, de Grandhomme's volume of runs and consistent wicket-taking across all three formats makes him the better performer according to these numbers alone.\n"

**🧪 Step 7.1 Sample Queries**

In [74]:
#generate_rag_answer("Who is the best all-rounder across all formats?", top_k=10)
generate_rag_answer("Pick top 3 Test bowlers based on economy", top_k=10)
generate_rag_answer("Which batter has the best strike rate in T20s?", top_k=10)


'SA Yadav has the best T20 strike rate, at 175.8.\n'

**🧪 Step 7.2 Sample Queries**

In [77]:
generate_rag_answer("Which batter has the best strike rate in T20s??", top_k=10)
generate_rag_answer("Pick top 3 Test bowlers based on economy", top_k=10)
#generate_rag_answer("Which batter has the best strike rate in T20s?", top_k=10)

'1. Qais Ahmad: Test Economy 4.17\n2. JD Unadkat: Test Economy 4.01\n3. KA Maharaj: Test Economy 4.79 \n'

**🧪 Step 7.3 Sample Queries**

In [78]:
print(generate_rag_answer("Who has the best T20 strike rate?", top_k=10))
print(suggest_team_by_pitch("pace"))
print(strategy_by_format("T20"))
#print(compare_teams(["Shakib Al Hasan"], ["Hardik Pandya", "Ben Stokes"]))


SA Yadav has the best T20 strike rate, at 175.8.

ODIs:

The top ODI fast bowlers from this list, based purely on wickets taken and economy rate, are:

1. **JJ Bumrah:** 121 wickets, 4.63 economy.
2. **TA Boult:** 187 wickets, 4.93 economy.
3. **MA Starc:** 211 wickets, 5.08 economy.
4. **K Rabada:** 137 wickets, 5.02 economy.


Tests:

The top Test fast bowlers from this list, using the same criteria, are:

1. **JJ Bumrah:** 121 wickets, 4.63 economy.
2. **TA Boult:** 187 wickets, 4.93 economy.
3. **MA Starc:** 211 wickets, 5.08 economy.
4. **K Rabada:** 137 wickets, 5.02 economy. 

Based purely on the provided T20 stats and focusing on high strike rates and death bowling ability (interpreted as good economy for bowlers), here's an aggressive T20 XI:

**Batters:**

1. **SA Yadav:** SR 175.8 (highest SR in the pool)
2. **RG Sharma:** SR 139.2 (second highest SR and proven opener)
3. **Asif Ali:** SR 134.0
4. **SS Iyer:** SR 136.0
5. **Babar Azam:** SR 127.8
6. **Faheem Ashraf:** SR 128

## ALL function Explanations
**Most Important function Explanations**

make_description(row) – Creates natural language summaries of player stats across all formats (T20, ODI, Test).

generate_embeddings(texts) – Converts these descriptions into dense vectors using Gemini embeddings.

generate_rag_answer(query, top_k) – Answers user queries by retrieving top players from FAISS and generating a response using Gemini (RAG).

suggest_team_by_pitch(pitch_type) – Suggests players based on pitch type (spin, pace, balanced).

strategy_by_format(format_type) – Builds format-specific teams (T20, ODI, Test) based on player strengths.

compare_teams(teamA, teamB) – Compares two teams and identifies which is better based on all-rounder stats.

## Conclusion and Future Directions
**Conclusion and Future Directions**
This project successfully demonstrates how Generative AI, embeddings, and vector search can be combined to build an intelligent cricket analytics system. CrickAI answers natural language queries, suggests teams based on pitch and format, and compares player performance using context-aware reasoning. It bridges the gap between raw stats and real-time insights.

In the future, the system can be enhanced by integrating real-time match feeds, player injury reports, and venue-specific performance. Adding image and video understanding (e.g., from match footage) and deploying as a fully interactive web app (Streamlit or Gradio) could further elevate user experience and decision-making.

## Blogpost and Youtube Video

Blogpost link: https://medium.com/@zrehman_40790/crickai-a-genai-powered-system-for-cricket-intelligence-and-team-strategy-af76a67a3408
Youtube Video:Nil