In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
def get_wiki_page(params, timeout=10):
    """Send a GET request to the English Wikipedia ``api.php`` endpoint.

    Args:
        params: Mapping of query-string parameters forwarded to ``requests.get``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook kernel on a stalled connection.

    Returns:
        The ``requests`` response object. The HTTP status is NOT checked here;
        callers are expected to inspect it.
    """
    return requests.get(
        'https://en.wikipedia.org/w/api.php',
        params=params,
        timeout=timeout,
    )

In [5]:
def get_NCAA_D1_teams():
    """Scrape team page titles from the "List of NCAA Division I institutions" article.

    The article HTML is requested through ``get_wiki_page`` (``action=parse``),
    the second ``<table>`` on the page is selected, its first two ``<tr>`` rows
    are skipped, and for every remaining row the ``title`` attribute of the
    first link in the third ``<td>`` is collected.

    Returns:
        A list of link titles (strings). The list is empty when the request
        does not return HTTP 200 or when the page has fewer than two tables.

    Raises:
        KeyError: if the JSON payload lacks the ``parse -> text -> *`` path.
    """
    params = {
        'action': 'parse',
        'page': "List of NCAA Division I institutions",
        'format': 'json',
        'prop': 'text',
    }
    res = get_wiki_page(params)

    # Bound before any branching so that every return path has a value
    # (previously a non-200 response raised UnboundLocalError at `return`).
    team_names = []
    if res.status_code != 200:
        return team_names

    page = res.json()['parse']['text']['*']
    soup = BeautifulSoup(page, 'html.parser')

    tables = soup.find_all('table')
    if len(tables) < 2:
        return team_names
    table = tables[1]

    for row in table.find_all('tr')[2:]:
        cells = row.find_all('td')
        if len(cells) < 3:
            continue
        # Some cells may hold plain text or a link without a title; skip them
        # instead of crashing on `None['title']` / a missing attribute.
        link = cells[2].find('a')
        if link is None or not link.get('title'):
            continue
        team_names.append(link['title'])
    return team_names

# Fetch the team page titles once; later cells index into this list (e.g. teams[0]).
teams = get_NCAA_D1_teams()

In [6]:
def get_team_data(team_name):
    """Fetch the plain-text extract of one Wikipedia page.

    Args:
        team_name: Page title passed as the ``titles`` query parameter.

    Returns:
        The ``extract`` string of the first page in the response, or ``None``
        when the request does not return HTTP 200, the response has no
        ``query -> pages`` section, or the page entry carries no ``extract``
        (for example when the title does not exist).
    """
    params = {
        'action': 'query',
        'titles': team_name,
        'format': 'json',
        'prop': 'extracts',
        'explaintext': 'true',
    }
    res = get_wiki_page(params)
    if res.status_code != 200:
        return None

    pages = res.json().get('query', {}).get('pages')
    if not pages:
        return None

    # Only one title is requested, so the first (and only) entry is the page.
    first_page = next(iter(pages.values()))
    # `.get` instead of indexing: entries for missing pages have no 'extract'.
    return first_page.get('extract')
    

In [7]:
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Called before reading os.environ below; presumably this pulls values from a
# local .env file into the environment — confirm the file exists in your setup.
load_dotenv()

# The Gemini key is read from the GOOGLE_CLOUD_API_KEY variable;
# os.environ.get yields None when that variable is unset.
GEMINI_API_KEY = os.environ.get("GOOGLE_CLOUD_API_KEY")

# Initialize the Gemini API
genai.configure(api_key=GEMINI_API_KEY)
# Generative model handle configured with temperature 0 and an 8000-token output cap.
# NOTE(review): `model` is not referenced by any other cell visible in this notebook.
model = genai.GenerativeModel(
    "gemini-1.5-pro-latest",
    generation_config=genai.GenerationConfig(
        max_output_tokens=8000,
        temperature=0,
    ),
)

def generate_embeddings_from_list(text_list: list):
    """Embed a list of texts with a single ``genai.embed_content`` call.

    The request uses model ``models/text-embedding-004`` with task type
    ``question_answering``. For every entry under the ``'embedding'`` key of
    the response, the first 50 characters of its string form are printed as a
    short preview.

    Args:
        text_list: The texts to embed, passed through as ``content``.

    Returns:
        The unmodified response from ``genai.embed_content``.
    """
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text_list,
        task_type="question_answering",
    )
    previews = (str(vector)[:50] for vector in response['embedding'])
    for preview in previews:
        print(preview, '... TRIMMED]')
    return response


def generate_embeddings(text: str):
    """Embed a single text with ``genai.embed_content``.

    The request uses model ``models/text-embedding-004`` with task type
    ``retrieval_document``.

    Args:
        text: The text to embed, passed through as ``content``.

    Returns:
        The unmodified response from ``genai.embed_content``; the vector is
        read from its ``'embedding'`` key.
    """
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
        task_type="retrieval_document",
    )
    # Only a 50-character preview is shown so the cell output stays short.
    preview = str(response['embedding'])[:50]
    print(preview, '... TRIMMED]')
    return response

In [8]:
# Article text (plain-text extract) for the first scraped team.
team1 = get_team_data(teams[0])
# Embedding of the team's *name* string (teams[0]), not of the article text.
team1_embed_name = generate_embeddings(teams[0])

[0.0170326, 0.020390132, -0.026702601, 0.046677213 ... TRIMMED]


In [9]:
import re
# Placeholder only: rebound to the result of format_data(team1) at the end of this cell.
sentences = []
def format_data(data):
    """Split text into stripped, non-empty sentences, print each, and return them.

    A split happens at a single whitespace character that directly follows
    ``.`` or ``?``, except when the four characters before it match
    ``\\w\\.\\w.`` (e.g. "e.g.") or the three characters before it match
    ``[A-Z][a-z]\\.`` (e.g. "Mr.").

    Args:
        data: The text to split.

    Returns:
        List of sentence strings with surrounding whitespace removed; pieces
        that are empty after stripping are dropped.
    """
    boundary = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s')
    stripped_pieces = (piece.strip() for piece in boundary.split(data))
    sentences = [piece for piece in stripped_pieces if piece]

    for sentence in sentences:
        print(sentence)
    return sentences

# Split the first team's article text into sentences (format_data also prints each one).
sentences = format_data(team1)

Abilene Christian Wildcats (variously ACU or ACU Wildcats) refers to the sports teams of Abilene Christian University located in Abilene, Texas.
The Wildcats joined the Western Athletic Conference (WAC) on July 1, 2021, after having spent the previous eight years in the Southland Conference.
The nickname "Wildcat" is derived from the mascot of the team.
== Sports sponsored ==


== History ==
A member of the Western Athletic Conference, Abilene Christian sponsors teams in seven men's and eight women's NCAA sanctioned sports.
On December 6, 1923, Abilene Christian applied for admission to Texas Intercollegiate Athletic Association during the conference annual meeting in Dallas.
The Wildcats had been considering joining the conference for several years but funding for the athletic department preventing them to join the conference sooner.
On July 1, 2013, Abilene Christian returned to the Southland Conference as one of four new members.
The university, a charter member of the Southland Con

In [54]:
from pymilvus import MilvusClient, DataType
import os
import json
from dotenv import load_dotenv

load_dotenv()


class MilvusApp:
    """Small convenience wrapper around a ``MilvusClient`` connection.

    Rows handled by this class have a 768-dimensional ``team_name_vector`` and
    an ``article_json`` payload holding the team name and article content
    (see ``create_embedding_schema`` and ``format_data``).

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, api_key, api_endpoint):
        """Store the credentials and open the client connection.

        Args:
            api_key: Token passed to ``MilvusClient`` as ``token``.
            api_endpoint: URI passed to ``MilvusClient`` as ``uri``.
        """
        self.api_key = api_key
        self.api_endpoint = api_endpoint
        # connect() returns the client it creates; previously it returned None,
        # which left self.client unset right after construction.
        self.client = self.connect()

    def connect(self):
        """Create a ``MilvusClient`` and store it on ``self.client``.

        Returns:
            The newly created client (also assigned to ``self.client``).

        Raises:
            Exception: if the created client object is falsy.
        """
        client = MilvusClient(uri=self.api_endpoint, token=self.api_key)
        if not client:
            raise Exception("Client not established.")
        self.client = client
        return client

    def disconnect(self):
        """Close the client if one is set and clear the reference."""
        if self.client:
            self.client.close()
            # Drop the closed client so a second disconnect() is a no-op.
            self.client = None

    def create_embedding_schema(self):
        """Build the collection schema: auto-id primary key, vector, JSON payload.

        Returns:
            The schema object produced by ``MilvusClient.create_schema``.
        """
        # each entry is one article
        schema = MilvusClient.create_schema(
            auto_id=True,
            enable_dynamic_field=True
        )

        schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
        schema.add_field(field_name="team_name_vector", datatype=DataType.FLOAT_VECTOR, dim=768)
        schema.add_field(field_name="article_json", datatype=DataType.JSON)
        return schema

    def format_data(self, team_name, team_name_vector, article_text_list):
        """Assemble one row dict in the shape expected by the schema.

        Args:
            team_name: Stored under ``article_json['team_name']``.
            team_name_vector: Stored under ``team_name_vector``.
            article_text_list: Stored under ``article_json['article_content']``.

        Returns:
            A dict with keys ``team_name_vector`` and ``article_json``.
        """
        data = {
            "team_name_vector": team_name_vector,
            "article_json": {
                'team_name': team_name,
                'article_content': article_text_list,
            }
        }
        return data

    def upload_data(self, collection_name, data):
        """Insert ``data`` into ``collection_name`` and print the insert result.

        This method was previously defined twice with identical bodies; only
        one definition is kept.
        """
        res = self.client.insert(
            collection_name=collection_name,
            data=data
        )
        print(res)

    def create_collection(self, collection_name, schema=None):
        """Create a collection with an index on ``team_name_vector``.

        Args:
            collection_name: Name of the collection to create.
            schema: Optional schema; when falsy, ``create_embedding_schema()``
                is used.

        Returns:
            Whatever ``MilvusClient.create_collection`` returns.

        Raises:
            Exception: if a collection with that name is already listed.
        """
        res = self.client.list_collections()
        print(res)
        if collection_name in res:
            raise Exception(f"'{collection_name}' already exists in cluster.\n{self.client.describe_collection(collection_name=collection_name)}")

        index_params = self.client.prepare_index_params()

        index_params.add_index(
            field_name="team_name_vector",
            index_type="AUTOINDEX", # zilliz
            metric_type="L2",
            params={}
        )

        if not schema:
            schema = self.create_embedding_schema()
        print(index_params)
        print(schema)
        collection = self.client.create_collection(
                collection_name=collection_name,
                schema=schema,
                index_params=index_params)
        return collection

    def load_collection(self, collection_name):
        """Load the collection and print its load state."""
        # for searching entities purpose only
        self.client.load_collection(
            collection_name=collection_name
        )
        res = self.client.get_load_state(
            collection_name=collection_name
        )
        print(res)

    def drop_collection(self, collection_name):
        """Drop the named collection."""
        self.client.drop_collection(
            collection_name=collection_name
        )

    def find_data(self, collection_name, query_vectors: list, top_k=5):
        """Search by vector and return the stored article content of the hits.

        The collection is loaded, a search with the ``L2`` metric is run, the
        raw search result is printed as JSON, and the rows for the hit ids of
        the FIRST query vector are then fetched by id.

        Args:
            collection_name: Collection to search.
            query_vectors: List of query vectors passed as ``data``.
            top_k: Maximum number of hits requested per query vector.

        Returns:
            A list with the ``article_json['article_content']`` value of each
            fetched row; empty when the search yields no hits.
        """
        self.load_collection(collection_name)
        res = self.client.search(
            collection_name=collection_name,
            data=query_vectors,
            limit=top_k, # Max. number of search results to return
            search_params={"metric_type": "L2"}
        )

        result = json.dumps(res, indent=4)
        print(result)

        # No result sets, or no hits for the first query: nothing to fetch.
        if not res or not res[0]:
            return []

        result_ids = [entry["id"] for entry in res[0]]

        # NOTE(review): rows are returned in whatever order get() yields them,
        # which may differ from the search ranking — confirm if order matters.
        res = self.client.get(
            collection_name=collection_name,
            ids=result_ids
        )

        content = [entry["article_json"]["article_content"] for entry in res]

        return content
            

# API token comes from the environment; os.environ.get yields None when MILVUS_API_KEY is unset.
MILVUS_API_KEY = os.environ.get('MILVUS_API_KEY')
# Cluster endpoint is fixed here rather than read from the environment.
MILVUS_API_ENDPOINT = "https://in03-df5c1c6a3c98f62.api.gcp-us-west1.zillizcloud.com"
milvus_app = MilvusApp(api_key=MILVUS_API_KEY, api_endpoint=MILVUS_API_ENDPOINT)
# Explicit connect() call: (re)assigns milvus_app.client to a freshly created MilvusClient.
milvus_app.connect()

collection_name = "Pactrivia_data"
# create_collection raises when the name is already listed, so re-running this
# cell against an existing collection fails (as in the recorded output).
milvus_app.create_collection(collection_name)

['medium_articles', 'Pactrivia_data']


Exception: 'Pactrivia_data' already exists in cluster.
{'collection_name': 'Pactrivia_data', 'auto_id': True, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'auto_id': True, 'is_primary': True}, {'field_id': 101, 'name': 'team_name_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}, {'field_id': 102, 'name': 'article_json', 'description': '', 'type': <DataType.JSON: 23>, 'params': {}}], 'aliases': [], 'collection_id': 449686286190462718, 'consistency_level': 2, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': True}

In [35]:
# Build one row: the team-name embedding as the vector, plus the team name and
# the sentence list from the article as the JSON payload.
data = milvus_app.format_data(teams[0], team1_embed_name['embedding'], sentences)
# Bare expression: displays the whole row, including the full embedding vector.
data

{'team_name_vector': [0.0170326,
  0.020390132,
  -0.026702601,
  0.046677213,
  0.039016515,
  0.007283795,
  0.009143837,
  0.029600112,
  -0.026886836,
  0.025534192,
  0.025696287,
  -0.007201744,
  0.012390414,
  0.009116536,
  -0.004104345,
  -0.0038682777,
  0.024002297,
  0.042020816,
  -0.116137594,
  -0.012162356,
  -0.0050529717,
  -0.044144507,
  0.03629511,
  0.015217971,
  -0.015468548,
  -0.03336388,
  0.024729151,
  -0.04968274,
  -0.036115717,
  -0.003124638,
  0.017102432,
  0.061775114,
  0.035321817,
  -0.07389817,
  -0.021242108,
  0.051807176,
  0.0006799167,
  -0.00852795,
  0.09410862,
  -0.047147274,
  -0.076967046,
  0.04205411,
  -0.038590286,
  0.014241451,
  0.0065736067,
  -0.023374798,
  -0.013462055,
  0.0382384,
  -0.05141897,
  0.04079918,
  -0.010313023,
  0.087015696,
  -0.06790516,
  0.024281228,
  -0.01561109,
  -0.01944819,
  -0.03475458,
  -0.06838631,
  0.0062594297,
  -0.03233706,
  0.023643918,
  0.0037873492,
  0.03271537,
  0.0041422443,
  0

In [36]:
# Inserts the row built above. The schema uses auto_id, so every run of this
# cell adds another row with a new id rather than replacing the previous one.
milvus_app.upload_data("Pactrivia_data", data)  

{'insert_count': 1, 'ids': [449686286190256014]}


In [37]:
import random
# Number of query vectors to generate.
nq = 1

# nq random vectors of length 768 (matching the dim of team_name_vector).
# NOTE(review): no random seed is set, so values differ on every run, and the
# find_data call visible later in this notebook does not use search_vec.
search_vec = [[random.random() for _ in range(768)] for _ in range(nq)]
print(search_vec)
print(len(search_vec[0]))

[[0.3217914390351295, 0.07762192541663149, 0.9292594905290869, 0.5416767213404207, 0.16949902088128654, 0.8219537499109135, 0.9063444911706319, 0.2600075153083907, 0.2541801952846926, 0.9367131651644572, 0.33988376920294594, 0.9319843975329516, 0.6840328964949662, 0.2045770321923056, 0.8435896506625774, 0.6345956416985522, 0.4444178370346211, 0.5527586184471586, 0.13993474588538557, 0.053043477015384366, 0.3162211080147793, 0.7228033191769008, 0.9034727000968198, 0.25525394355890796, 0.6945321203131273, 0.8090007074277619, 0.652984836425525, 0.14732273514680805, 0.7415924394009343, 0.5542666147830039, 0.8288799177821338, 0.4881872133024957, 0.9273854656587275, 0.9528095006168474, 0.7628827447244171, 0.34669446700137385, 0.04234286282003685, 0.21269463708202796, 0.5976454578118289, 0.30099131627868136, 0.273010812260598, 0.5120503329228646, 0.030461762433460993, 0.3622987753909075, 0.4827413234936426, 0.04871266063196089, 0.496994732945763, 0.9843309356856722, 0.9041543094311407, 0.4210

dd

In [55]:
# Search with the stored team-name embedding itself as the single query vector;
# find_data returns the article_content of each row fetched for the hits.
result = milvus_app.find_data("Pactrivia_data", [team1_embed_name['embedding']])
print(result)

{'state': <LoadState: Loaded>}
[
    [
        {
            "id": 449686286190256014,
            "distance": 0.0,
            "entity": {}
        },
        {
            "id": 449686286190255884,
            "distance": 0.0,
            "entity": {}
        }
    ]
]
2
[['Abilene Christian Wildcats (variously ACU or ACU Wildcats) refers to the sports teams of Abilene Christian University located in Abilene, Texas.', 'The Wildcats joined the Western Athletic Conference (WAC) on July 1, 2021, after having spent the previous eight years in the Southland Conference.', 'The nickname "Wildcat" is derived from the mascot of the team.', "== Sports sponsored ==\n\n\n== History ==\nA member of the Western Athletic Conference, Abilene Christian sponsors teams in seven men's and eight women's NCAA sanctioned sports.", 'On December 6, 1923, Abilene Christian applied for admission to Texas Intercollegiate Athletic Association during the conference annual meeting in Dallas.', 'The Wildcats had be

In [26]:
# Close the Milvus client connection once the notebook is done with it.
# NOTE(review): this cell's execution count (26) is lower than the cells above
# it, so it was last run out of order — re-run the notebook top to bottom.
milvus_app.disconnect()