In [2]:
import requests
from bs4 import BeautifulSoup
import google.generativeai as genai
from dotenv import load_dotenv
import os
import json

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [None]:
def get_wiki_page(params):
    return requests.get('https://en.wikipedia.org/w/api.php', params=params)

In [None]:
def get_NCAA_D1_teams():
    params = {
        'action': 'parse',
        'page': "List of NCAA Division I institutions",
        'format': 'json',
        'prop': 'text',
    }
    res = get_wiki_page(params)
    if res.status_code == 200:
        page = res.json()['parse']['text']['*']
        soup = BeautifulSoup(page, 'html.parser')
        table = soup.find_all('table')[1]
        team_names = []
        for row in table.find_all('tr')[2:]:
            cells = row.find_all('td')
            if len(cells) < 3:
                continue
            nickname = cells[2].find('a')['title']
            team_names.append(nickname)
    return team_names

teams = get_NCAA_D1_teams()

In [None]:
def get_team_data(team_name):
    params = {
        'action': 'query',
        'titles': team_name,
        'format': 'json',
        'prop': 'extracts',
        'explaintext': 'true',
    }
    res = get_wiki_page(params)
    pages = res.json()['query']['pages']
    if pages:
        return pages[next(iter(pages))]['extract']
    

In [10]:
class GeminiAPI:
    def __init__(self) -> None:
        # prepare the large language model
        GEMINI_API_KEY = os.environ.get("GOOGLE_CLOUD_API_KEY")

        # Initialize the Gemini API
        genai.configure(api_key=GEMINI_API_KEY)
        self.model = genai.GenerativeModel(
            "gemini-1.5-pro-latest",
            generation_config=genai.GenerationConfig(
                max_output_tokens=8000,
                temperature=2, # 1.0 is max for gemini 1.0, 2.0 is max for gemini 1.5
            ),
        )

        # configure constants, like the question format and the number of questions
        self.question_count = 10

        self.question_list_prompt = """

        [
            {
                "team": "UCI Anteaters",
                "questions": [
                    {
                        "question": "What is the mascot of UC Irvine?",
                        "answer_options": ["Anteater", "Aggie", "Triton", "Gaucho"]
                        "correct_indices": [0]
                    }
                ]
            }
        ]

        The team should match the inputted team name. The answer_options represent possible answer choices a contestant can choose from. The correct_indices represent the index of the correct answer in the answer_options list.

        """

    def generate_system_prompt(self, team1, context):

        return f"""
    You are a sports trivia host, specializing in College Sports trivia.

    You are known for creating engaging trivia questions, and you will be creating a list of questions.

    You will create a list of {self.question_count} questions about this team:

    {team1}

    Here is an article detailing information about the team that you can use to create your questions:

    {context}

    Output your question list in a valid JSON format.

    Here is an example output. If one of the teams was the UCI Anteaters, the output could look like this:

    {self.question_list_prompt}
    """

    def generate_embeddings_from_list(self, text_list: list):
        result = genai.embed_content(model="models/text-embedding-004", content=text_list, task_type="question_answering")
        for embedding in result['embedding']:
            print(str(embedding)[:50], '... TRIMMED]')
        return result


    def generate_embeddings(self, text: str, verbose: bool = False):
        result = genai.embed_content(model="models/text-embedding-004", content=text, task_type="retrieval_document")
        if verbose:
            # Print just a part of the embedding to keep the output manageable
            print(str(result['embedding'])[:50], '... TRIMMED]')
        return result
    
    # basic function to generate questions from a prompt and return text
    def generate(self, prompt: str):
        response = self.model.generate_content(prompt)
        return response.text

    # Generate the questions, and return as an array of questions
    def generate_and_format_questions(self, team, context):

        # generate the system prompt
        prompt = self.generate_system_prompt(team, context)
        # generate the questions
        print("Generating questions...")
        response = self.generate(prompt)
        # trim, format, and return the questions
        response = response.strip()
        response = response.replace("json", "")
        response = response.replace("`", "")
        # convert to JSON format
        response = json.loads(response)
        return response

    # use this function to save the questions somewhere
    def save_questions(self, questions, team):
        # send question results to backend database
        # FIXME: For now, we will save the questions to a JSON file
        with open(f"{team}_questions.json", "w") as f:
            json.dump(questions, f)

In [None]:
# testing the embeddings?
embedder = GeminiAPI()
team1 = get_team_data(teams[0])
team1_embed_name = embedder.generate_embeddings(teams[0])

In [None]:
import re
sentences = []
def format_data(data):
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', data)
    sentences = list(filter(None, map(str.strip, sentences)))

    for sentence in sentences:
        print(sentence)
    return sentences

sentences = format_data(team1)

In [11]:
from pymilvus import MilvusClient, DataType

class MilvusApp:
    def __init__(self, api_key, api_endpoint):
        self.api_key = api_key
        self.api_endpoint = api_endpoint
        self.client = self.connect()

    def connect(self):
        self.client = MilvusClient(uri=self.api_endpoint, token=self.api_key) 
        if not self.client:
            raise Exception("Client not established.")

    def disconnect(self):
        if self.client:
            self.client.close()


    def create_embedding_schema(self):
        # each entry is one article
        schema = MilvusClient.create_schema(
            auto_id=True,
            enable_dynamic_field=True
        )

        schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
        schema.add_field(field_name="team_name_vector", datatype=DataType.FLOAT_VECTOR, dim=768)
        schema.add_field(field_name="article_json", datatype=DataType.JSON)
        return schema
    

    def format_data(self, team_name, team_name_vector, article_text_list):
        data = {
            "team_name_vector": team_name_vector,
            "article_json": {
                'team_name': team_name,
                'article_content': article_text_list, 
            }  
        }
        return data
    

    def upload_data(self, collection_name, data):
        res = self.client.insert(
            collection_name=collection_name,
            data=data
        )
        print(res)


    def create_collection(self, collection_name, schema=None):
        res = self.client.list_collections()
        print(res)
        if collection_name in res:
            raise Exception(f"'{collection_name}' already exists in cluster.\n{self.client.describe_collection(collection_name=collection_name)}")
        
        index_params = self.client.prepare_index_params()
        
        index_params.add_index(
            field_name="team_name_vector", 
            index_type="AUTOINDEX", # zilliz
            metric_type="L2",
            params={}
        )
        
        if not schema:
            schema = self.create_embedding_schema()
        print(index_params)
        print(schema)
        collection = self.client.create_collection(
                collection_name=collection_name, 
                schema=schema, 
                index_params=index_params)
        return collection


    def load_collection(self, collection_name, verbose=False):
        # for searching entities purpose only 
        self.client.load_collection(
            collection_name=collection_name
        )
        res = self.client.get_load_state(
            collection_name=collection_name
        )
        if verbose:
            print(res)
    

    def drop_collection(self, collection_name):
        self.client.drop_collection(
            collection_name=collection_name
        )


    def upload_data(self, collection_name, data):
        res = self.client.insert(
            collection_name=collection_name,
            data=data
        )
        print(res)
    

    def find_data(self, collection_name, query_vectors: list, top_k=5, verbose=False):
        self.load_collection(collection_name)
        res = self.client.search(
            collection_name=collection_name,
            data=query_vectors,
            limit=top_k, # Max. number of search results to return
            search_params={"metric_type": "L2"}
        )

        result = json.dumps(res, indent=4)
        if verbose:
            print(result)
        
        result_ids = [entry["id"] for entry in res[0]]

        res = self.client.get(
            collection_name=collection_name,
            ids=result_ids
        )

        content = [entry["article_json"]["article_content"] for entry in res]

        return content
            

In [None]:
# example: creating a new collection
MILVUS_API_KEY = os.environ.get('MILVUS_API_KEY')
MILVUS_API_ENDPOINT = "https://in03-df5c1c6a3c98f62.api.gcp-us-west1.zillizcloud.com"
milvus_app = MilvusApp(api_key=MILVUS_API_KEY, api_endpoint=MILVUS_API_ENDPOINT)
milvus_app.connect()

collection_name = "Pactrivia_data"
milvus_app.create_collection(collection_name)

In [None]:
data = milvus_app.format_data(teams[0], team1_embed_name['embedding'], sentences)
data

In [None]:
milvus_app.upload_data("Pactrivia_data", data)  

In [None]:
import random
nq = 1

search_vec = [[random.random() for _ in range(768)] for _ in range(nq)]
print(search_vec)
print(len(search_vec[0]))

dd

In [None]:
result = milvus_app.find_data("Pactrivia_data", [team1_embed_name['embedding']])
print(result)

In [15]:
# integration test: generate questions from the data

# obtain the milvus information
MILVUS_API_KEY = os.environ.get('MILVUS_API_KEY')
MILVUS_API_ENDPOINT = "https://in03-df5c1c6a3c98f62.api.gcp-us-west1.zillizcloud.com"

# initialize the Milvus API
milvus_app = MilvusApp(api_key=MILVUS_API_KEY, api_endpoint=MILVUS_API_ENDPOINT)
milvus_app.connect()

# initialize the Gemini API
gemini = GeminiAPI()

# define the team we want to generate questions for
team = "Abilene Christian University"

# use milvus to find the data
# first, embed the team name
team_embedding = gemini.generate_embeddings(team)
# find the data
data = milvus_app.find_data("Pactrivia_data", [team_embedding['embedding']])

# format the data to plain text for the LLM
# we have an array of arrays
# flatten the array
flatened_context = []
for row in data:
    flatened_context.extend(row)

# join to be a string
full_context = "".join(flatened_context)
print(full_context)

# generate the questions
questions = gemini.generate_and_format_questions(team, full_context)

# save the questions
gemini.save_questions(questions, team)

{'state': <LoadState: Loaded>}
Abilene Christian Wildcats (variously ACU or ACU Wildcats) refers to the sports teams of Abilene Christian University located in Abilene, Texas.The Wildcats joined the Western Athletic Conference (WAC) on July 1, 2021, after having spent the previous eight years in the Southland Conference.The nickname "Wildcat" is derived from the mascot of the team.== Sports sponsored ==


== History ==
A member of the Western Athletic Conference, Abilene Christian sponsors teams in seven men's and eight women's NCAA sanctioned sports.On December 6, 1923, Abilene Christian applied for admission to Texas Intercollegiate Athletic Association during the conference annual meeting in Dallas.The Wildcats had been considering joining the conference for several years but funding for the athletic department preventing them to join the conference sooner.On July 1, 2013, Abilene Christian returned to the Southland Conference as one of four new members.The university, a charter mem

In [None]:
milvus_app.disconnect()