### **Implementation of Retrieval-Augmented Generation (RAG) using GPT-4 for IPL 2024 news articles.**

### Azure OpenAI Configuration

In [1]:
%%capture
!pip3 install openai --upgrade

In [2]:
import os
from openai import AzureOpenAI

In [3]:
from google.colab import userdata
key= userdata.get('OAIKEY')

In [4]:
client = AzureOpenAI(
    api_key=key,
    api_version="2024-02-01",
    azure_endpoint = "https://ragprojectv1.openai.azure.com/"
)

gpt_four = "gpt-four-oai"
emd_deployment_name = "adaembedoai" # embedding model

In [5]:
# Test Connection
prompt = "Tell me a funny joke"

response = client.chat.completions.create(
    model= gpt_four, # model = "deployment_name".
    messages=[
        {"role": "system", "content": "Act as a standup comdeian"},
        {"role": "user", "content": prompt}
    ], max_tokens= 25, temperature= 0
)

print(response.choices[0].message.content)

Sure, here's one for you:

Why don't scientists trust atoms?

Because they make up everything!


In [6]:
response = client.embeddings.create(
    input = "Your text string goes here",
    model= emd_deployment_name  # model = "deployment_name".
)

In [7]:
len(response.data[0].embedding)

1536

### Data Ingestion and Processing

In [8]:
%%capture
!pip3 install -qU langchain-community \
  langchain-core \
  pinecone-client \
  langchain-pinecone \
  newspaper3k \
  tiktoken\
  evaluate

In [9]:
# Define the URLs of the articles
url = [ "https://www.financialexpress.com/sports/ipl/kkr-vs-srh-qualifier-1-live-scorecard-ipl-2024-match-71-kolkata-knight-riders-vs-sunrisers-hyderabad-live-score/3495970/",
        "https://www.financialexpress.com/sports/ipl/rr-vs-rcb-live-match-score-ipl-2024-rajasthan-royals-vs-royal-challengers-bengaluru-eliminator-live-match-updates-scorecard/3497628/",
        "https://www.financialexpress.com/sports/ipl/srh-vs-rr-live-score-sunrisers-hyderabad-vs-rajasthan-royals-scorecard-qualifier-2-may-24-ipl-match-today-live-updates/3500393/",
        "https://www.financialexpress.com/sports/ipl/kkr-vs-srh-live-score-ipl-2024-final-match-live-updates-kolkata-knight-riders-vs-sunrisers-hyderabad-ipl-final-may-26-today-scorecard-latest-updates/3501995/"
]

In [10]:
# Import necessary modules
from newspaper import Article
import pandas as pd

# Function to extract article text from a given URL
def extract_article_text(url):
    """Fetch the page at ``url`` with newspaper's ``Article`` and return its text.

    Runs ``download()`` followed by ``parse()`` on the article object and
    returns the resulting ``text`` attribute.
    """
    page = Article(url)
    page.download()
    page.parse()
    body_text = page.text
    return body_text

# Extract text for each article.
# NOTE: the loop variable is deliberately not named `url`. Reusing that name
# rebinds the module-level `url` list to the last URL string, so re-running
# this cell would iterate over the characters of that string.
data = {
    'source': [],
    'text': []
}

for article_url in url:
    text = extract_article_text(article_url)
    data['source'].append(article_url)
    data['text'].append(text)

# Create a DataFrame with one row per article
df = pd.DataFrame(data)

In [11]:
import re

def clean_ipl_text(text):
    """Normalise scraped article text into a single line of plain ASCII.

    Substitutions are applied in this order:
      1. drop anything that looks like an HTML tag,
      2. turn each of the characters @ # | ) ' ( into a space,
      3. drop pic.twitter.com/... links,
      4. drop every run of non-ASCII characters (this removes emojis, but
         also curly quotes, dashes and accented letters),
      5. squeeze each run of whitespace into one space.
    The result is stripped of leading/trailing whitespace.
    """
    substitutions = (
        (r'<.*?>', ''),
        (r"[@#|)'(]", ' '),
        (r'pic\.twitter\.com/[\w\d]+', ''),
        (r'[^\x00-\x7F]+', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [12]:
df['text'] = df['text'].apply(clean_ipl_text)

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [14]:
# Splitting text into chunks

def chunk_text(text, chunk_size=500, chunk_overlap= 50):
    """Split ``text`` into overlapping chunks and return them as a list.

    ``chunk_size`` and ``chunk_overlap`` are passed straight through to
    ``RecursiveCharacterTextSplitter``.
    """
    # Explicit separator list, tried in order. The library default is
    # ["\n\n", "\n", " ", ""]; here the final "" fallback is left out.
    return RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " "],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

df['chunks'] = df['text'].apply(chunk_text)

In [15]:
# Exploded the dataframe for embeddings
flattened_df = df.explode('chunks')

In [16]:
# Added two new columns for creation of id
flattened_df['year'] = '2024'
flattened_df['no'] =  range(1, len(flattened_df)+1)

In [17]:
flattened_df.shape

(42, 5)

In [18]:
# Select by position, not label: after explode() every chunk keeps the index
# label of its parent article, so label-based [0] only works for chunks that
# belong to the first article and raises KeyError for any other chunk number.
flattened_df.loc[flattened_df['no'] == 3, 'chunks'].iloc[0]

'but SRHs batting collapsed. Pat Cummins added a crucial 30 off 24, partnering with Vijayakanth Viyaskanth for a 33-run last-wicket stand to push SRH past 150. Starcs opening spell of 3-0-22-3 set the tone for KKRs dominance. In reply, KKR skipper Shreyas Iyer and top order batter Venkatesh Iyer added 97 runs for the third wicket stand that sealed the finale spot for the Purple Army! KKR owner Shah Rukh Khan was seen greeting the audience with lorbo, korbo, jeetbo theme song playing at Narendra'

In [19]:
def create_embeddings(text, model=emd_deployment_name):
    """Return the embedding vector for ``text`` from the given deployment.

    Sends ``text`` as the ``input`` of a single ``client.embeddings.create``
    call and returns the ``embedding`` of the first item in the response data.
    """
    api_response = client.embeddings.create(input=text, model=model)
    first_item = api_response.data[0]
    return first_item.embedding

In [None]:
# Embed every chunk (one create_embeddings call per chunk), then keep the
# vectors next to their chunks in the dataframe.
embeddings = [create_embeddings(chunk) for chunk in flattened_df['chunks']]

flattened_df['embeddings'] = embeddings

In [None]:
# create a id and metadata columns
flattened_df['id'] = flattened_df['year'].astype(str) + '_' + flattened_df['no'].astype(str)
flattened_df['metadata'] = flattened_df.apply(lambda x: { 'text': x['chunks'],'source': x['source']}, axis=1)

In [None]:
flattened_df.head()

Unnamed: 0,source,text,chunks,year,no,embeddings,id,metadata
0,https://www.financialexpress.com/sports/ipl/kk...,Kolkata Knight Riders vs Sunrisers Hyderabad Q...,Kolkata Knight Riders vs Sunrisers Hyderabad Q...,2024,1,"[0.007586794439703226, -0.020128775388002396, ...",2024_1,{'text': 'Kolkata Knight Riders vs Sunrisers H...
0,https://www.financialexpress.com/sports/ipl/kk...,Kolkata Knight Riders vs Sunrisers Hyderabad Q...,"Mitchell Starc, IPLs costliest buy, delivered ...",2024,2,"[-0.005107360891997814, 0.008798280730843544, ...",2024_2,"{'text': 'Mitchell Starc, IPLs costliest buy, ..."
0,https://www.financialexpress.com/sports/ipl/kk...,Kolkata Knight Riders vs Sunrisers Hyderabad Q...,but SRHs batting collapsed. Pat Cummins added ...,2024,3,"[0.00671883812174201, -0.003964687697589397, 0...",2024_3,{'text': 'but SRHs batting collapsed. Pat Cumm...
0,https://www.financialexpress.com/sports/ipl/kk...,Kolkata Knight Riders vs Sunrisers Hyderabad Q...,"korbo, jeetbo theme song playing at Narendra M...",2024,4,"[-0.013291221112012863, -0.026689574122428894,...",2024_4,"{'text': 'korbo, jeetbo theme song playing at ..."
0,https://www.financialexpress.com/sports/ipl/kk...,Kolkata Knight Riders vs Sunrisers Hyderabad Q...,of RCB fixtures and list of matches with venue...,2024,5,"[-0.0010939022758975625, -0.010952485725283623...",2024_5,{'text': 'of RCB fixtures and list of matches ...


### Pinecone Index Configuration

In [20]:
PC_KEY = userdata.get('PC_TOKEN')

In [21]:
from pinecone import Pinecone
from pinecone import ServerlessSpec

In [22]:
# Configure Pinecone Vectorbase Client
pc = Pinecone(api_key=PC_KEY)

# Config Pinecone ServerlessSpec
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

# Create a index in Pinecone

index_name = 'ipl-rag-2024'

In [23]:
# Create the index on first use only; on later runs the existing one is reused.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    # `response` is expected to still be the embeddings response from the
    # connection test above, so its vector length gives the index dimension.
    embedding_dim = len(response.data[0].embedding)
    pc.create_index(
        index_name,
        dimension=embedding_dim,
        metric='cosine',
        spec=spec
    )

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 42}},
 'total_vector_count': 42}

In [None]:
# Upserting vectors and metadata in index.
# Records are sent in batches instead of one request per row, which cuts the
# number of network round-trips while keeping each request small.
UPSERT_BATCH_SIZE = 100

records = [
    {"id": row["id"], "values": row["embeddings"], "metadata": row["metadata"]}
    for _, row in flattened_df.iterrows()
]

for start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(vectors=records[start:start + UPSERT_BATCH_SIZE])

In [None]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 42}},
 'total_vector_count': 42}

### Retrieval Of Relevant Documents

In [24]:
query = "Who won RR vs RCB eliminator IPL match?"

query_vectors = create_embeddings(query)

In [25]:
# get relevant contexts (including the questions)
result = index.query(vector=query_vectors, top_k= 3, include_metadata=True)

In [26]:
result

{'matches': [{'id': '2024_12',
              'metadata': {'source': 'https://www.financialexpress.com/sports/ipl/rr-vs-rcb-live-match-score-ipl-2024-rajasthan-royals-vs-royal-challengers-bengaluru-eliminator-live-match-updates-scorecard/3497628/',
                           'text': 'the Rajasthan Royals to a 4-wicket victory '
                                   'over the Royal Challengers Bangalore. '
                                   'Chasing a target of 173 runs set by RCB, '
                                   'RR successfully reached the mark. With '
                                   'this win, RR moves to the second '
                                   'eliminator against SRH on Friday, while '
                                   'RCBs journey in this IPL season comes to '
                                   'an end. 23:21 IST 22 May 2024 RR vs RCB '
                                   'Live Score, IPL 2024 Eliminator: RR still '
                                   'on game Rovman Po

In [None]:
# get list of retrieved text
contexts = [item['metadata']['text'] for item in result['matches']]

relevnat_docs = "\n\n---\n\n".join(contexts)+"\n\n-----\n\n"+query

In [None]:
print(relevnat_docs)

the Rajasthan Royals to a 4-wicket victory over the Royal Challengers Bangalore. Chasing a target of 173 runs set by RCB, RR successfully reached the mark. With this win, RR moves to the second eliminator against SRH on Friday, while RCBs journey in this IPL season comes to an end. 23:21 IST 22 May 2024 RR vs RCB Live Score, IPL 2024 Eliminator: RR still on game Rovman Powell hit 4 twice. 5 More to go. 23:20 IST 22 May 2024 RR vs RCB Live Score, IPL 2024 Eliminator: RCB takes another wicket

---

consecutive wickets, RR proved themselves again. Match Ended Indian Premier League, 2024 Rajasthan Royals 174/6 19.0 vs Royal Challengers Bengaluru 172/8 20.0 Match Ended Day Eliminator Rajasthan Royals beat Royal Challengers Bengaluru by 4 wickets View Scorecard HIGHLIGHTS of RR vs RCB Eliminator Match 72 Live Updates 23:25 IST 22 May 2024 RR vs RCB Live Score, IPL 2024 Eliminator: RR win by 4 wickets Riyan Parag 36 and Shimron Hetmyer 26 guided the Rajasthan Royals to a 4-wicket victory over

### Ask a question to get an answer

In [27]:
# Initialize a global cache dictionary
cache = {}

def generate_answer(user_input, top_k=3, max_tokens=50):
    """Answer ``user_input`` from the indexed articles (retrieve, then generate).

    Args:
        user_input: The question to answer.
        top_k: Number of chunks to retrieve from the index as context.
        max_tokens: Upper bound on generated tokens. Longer answers are cut
            off mid-sentence, so raise it for summary-style questions.

    Returns:
        The answer text produced by the chat model.

    Results are memoised in the module-level ``cache`` dict, keyed by
    ``(user_input, top_k, max_tokens)`` so that a call with different
    retrieval/generation settings is never served a stale answer.
    """
    # Check if this exact request is already in the cache
    cache_key = (user_input, top_k, max_tokens)
    if cache_key in cache:
        print("Cache hit, Found the Answer!")
        return cache[cache_key]

    # Convert the question to a query vector
    query_vector = create_embeddings(user_input)

    # get relevant contexts to answer question
    result = index.query(vector=query_vector, top_k=top_k, include_metadata=True)

    # Join the retrieved chunks and append the question after a separator
    context_data = [item['metadata']['text'] for item in result['matches']]
    context = "\n\n---\n\n".join(context_data) + "\n\n-----\n\n" + user_input

    # create a message object
    messages = [
        {"role": "system", "content": "You are an AI assistant that answers based on the given context. Please don't answer if you don't know."},
        {"role": "user", "content": context}
    ]

    # use chat completion to generate a response
    response = client.chat.completions.create(
        model=gpt_four,
        temperature=0,
        max_tokens=max_tokens,
        messages=messages
    )

    answer = response.choices[0].message.content

    # Store the response in the cache
    cache[cache_key] = answer

    return answer


In [None]:
user_input = "Who won RR vs RCB eliminator match"

generate_answer(user_input)

'The Rajasthan Royals (RR) won the eliminator match against the Royal Challengers Bangalore (RCB) by 4 wickets.'

An in-memory cache dictionary stores each generated answer so that an exactly repeated query is served straight from the cache, avoiding redundant embedding, retrieval, and generation calls and reducing overall cost.

In [None]:
user_input = "Who won RR vs RCB eliminator match"

generate_answer(user_input)

Cache hit, Found the Answer!


'The Rajasthan Royals (RR) won the eliminator match against the Royal Challengers Bangalore (RCB) by 4 wickets.'

In [111]:
user_input = "Give me a short summary of IPL 2024 final match between KKR vs SRH"

generate_answer(user_input)

'In the IPL 2024 final, the Kolkata Knight Riders (KKR) emerged victorious against the Sunrisers Hyderabad (SRH) by 8 wickets. SRH scored 113 runs in 18.3 overs, while KKR chased'

In [112]:
user_input = "Give me a short summary of IPL 2024 final match between KKR vs SRH"

generate_answer(user_input)

Cache hit, Found the Answer!


'In the IPL 2024 final, the Kolkata Knight Riders (KKR) emerged victorious against the Sunrisers Hyderabad (SRH) by 8 wickets. SRH scored 113 runs in 18.3 overs, while KKR chased'

In [32]:
user_input = "Give me a short summary of IPL Qualifier IPL match happedn between SRH vs RR"

generate_answer(user_input)

"In the IPL 2024 Qualifier 2 match between Sunrisers Hyderabad (SRH) and Rajasthan Royals (RR) held at the MA Chidambaram Stadium, RR's skipper Sanju Samson won the toss and opted to field first"

In [34]:
user_input = "Who won Qualifier 2 IPl match between SRH and RR "

generate_answer(user_input)

'Sunrisers Hyderabad (SRH) won the Qualifier 2 IPL match against Rajasthan Royals (RR) by 36 runs.'

In [36]:
user_input = "Who won first qualifier match played between KKR vs SRH"

generate_answer(user_input)

'Kolkata Knight Riders (KKR) won the first qualifier match against Sunrisers Hyderabad (SRH) by 8 wickets with 38 balls to spare.'

### Evaluation of RAG implementation

In [30]:
!pip3 -q install rouge_score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [31]:
import evaluate
rouge = evaluate.load('rouge')

In [50]:
generated_responses = ["The Rajasthan Royals (RR) won the eliminator match against the Royal Challengers Bangalore (RCB) by 4 wickets",
                       "In the IPL 2024 final, the Kolkata Knight Riders (KKR) emerged victorious against the Sunrisers Hyderabad (SRH) by 8 wickets. SRH scored 113 runs in 18.3 overs, while KKR chased.",
                       "In the IPL 2024 Qualifier 2 match between Sunrisers Hyderabad (SRH) and Rajasthan Royals (RR) held at the MA Chidambaram Stadium, RR's skipper Sanju Samson won the toss and opted to field first, Sunrisers Hyderabad (SRH) won the Qualifier 2 IPL match against Rajasthan Royals (RR) by 36 runs.",
                       "Kolkata Knight Riders (KKR) won the first qualifier match against Sunrisers Hyderabad (SRH) by 8 wickets with 38 balls to spare."
                       ]

In [53]:
reference_responses = ["Rajasthan Royals (RR) secured victory in the eliminator match against the Royal Challengers Bangalore (RCB) with a 4-wickets in hand.",
                       "Kolkata Knight Riders (KKR) triumphed over Sunrisers Hyderabad (SRH) by 8 wickets. SRH managed 113 runs in 18.3 overs, and KKR successfully chased the target.",
                       "In the IPL 2024 Qualifier 2 at MA Chidambaram Stadium, Rajasthan Royals (RR), led by skipper Sanju Samson, elected to field after winning the toss. Sunrisers Hyderabad (SRH) emerged victorious by 36 runs against RR",
                       "Kolkata Knight Riders (KKR) vs Sunrisers Hyderabad (SRH) IPL 2024 Qualifier 1 match won by KKR defeating SRH by 8 wickets to reach final"]

In [66]:
def eval_response(list1, list2):
    """Print the ROUGE-L score of predictions ``list1`` against references ``list2``."""
    scores = rouge.compute(predictions=list1, references=list2)
    rouge_l = scores['rougeL']
    print("The RougeL score generated by the OpenAI LLM is:", rouge_l)

In [67]:
eval_response(generated_responses, reference_responses)

The RougeL score generated by the OpenAI LLM is: 0.6025111799105607
