In [None]:
# Import required libraries
%%capture
!pip3 install -qU \
  transformers==4.31.0 \
  sentence-transformers==2.2.2 \
  accelerate==0.21.0 \
  einops==0.6.1 \
  xformers==0.0.20 \
  bitsandbytes==0.41.0 \
  langchain-community \
  langchain-core \
  pinecone-client \
  langchain-pinecone \
  newspaper3k

In [None]:
import os
from pinecone import Pinecone
from pinecone import ServerlessSpec
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain.llms import HuggingFacePipeline

In [None]:
from google.colab import userdata

# Look up the Hugging Face and Pinecone credentials through Colab's `userdata`
# helper (by secret name) so the raw values never appear in the notebook.
HF_KEY, PC_KEY = (userdata.get(secret_name) for secret_name in ('HF_TOKEN', 'PC_TOKEN'))

In [None]:
# Configuration of embedding model
from torch import cuda
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'

# Run on the currently selected CUDA device when one is available, else on CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# The same device string is passed both when loading the model and when encoding;
# encoding is configured to process 32 texts per batch.
embed_model = HuggingFaceEmbeddings(
    model_name=embed_model_id,
    model_kwargs=dict(device=device),
    encode_kwargs=dict(device=device, batch_size=32),
)

.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Set up the Pinecone vector database client
pc = Pinecone(api_key=PC_KEY)

# Serverless deployment target: read from the environment when set,
# otherwise default to AWS / us-east-1
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = 'text-gen-ipl'

# Create the index only when it is missing (expected on the very first run).
# dimension=384 has to equal the length of the vectors that get upserted later.
existing_index_names = pc.list_indexes().names()
index_is_missing = index_name not in existing_index_names
if index_is_missing:
    pc.create_index(index_name, dimension=384, metric='cosine', spec=spec)

# Open a handle to the index and display its current statistics
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
# Import necessary modules
from newspaper import Article
import pandas as pd

# URLs of the IPL 2024 match articles to ingest (MI vs CSK, CSK vs SRH, RCB vs CSK).
# Each URL is also stored as the 'source' of the chunks derived from it.
urls = [
    "https://www.hindustantimes.com/cricket/ipl-live-score-2024-mi-vs-csk-match-29-mumbai-indians-vs-chennai-super-kings-in-wankhede-stadium-ipl-match-14-april-101713084478438.html",
    "https://www.hindustantimes.com/cricket/csk-vs-srh-ipl-live-score-2024-match-46-chennai-super-kings-vs-sunrisers-hyderabad-ipl-match-live-28-april-101714294567393.html",
    "https://www.livemint.com/sports/cricket-news/rcb-vs-csk-live-score-ipl-2024-match-weather-rain-18-may-head-to-head-dream-11-bengaluru-vs-chennai-kohli-dhoni-11716021227736-page-96.html"
]

# Helper that fetches one article and returns its body text
def extract_article_text(url):
    """Download the page at ``url`` with newspaper's ``Article``, parse it, and return its text."""
    page = Article(url)
    page.download()
    page.parse()  # fills in page.text from the downloaded page
    return page.text

# Fetch every article, keeping each URL aligned with the text extracted from it
data = {
    'source': list(urls),
    'text': [extract_article_text(link) for link in urls],
}

# One row per article: columns 'source' and 'text'
df = pd.DataFrame(data)


In [None]:
# Splitting text into chunks

def chunk_text(text, chunk_size=350, chunk_overlap=15):
    """Split ``text`` into overlapping chunks with a RecursiveCharacterTextSplitter.

    Args:
        text: The string to split.
        chunk_size: Target maximum size of a chunk, forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the splitter.

    Returns:
        The list of chunk strings produced by ``split_text``.
    """
    # Separators are tried in order: paragraph break, line break, then a space.
    # (When none are given, the splitter falls back to ["\n\n", "\n", " ", ""].)
    split_points = ["\n\n", "\n", " "]
    return RecursiveCharacterTextSplitter(
        separators=split_points,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_text(text)

# Store the list of chunks for each article in a new 'chunks' column
df['chunks'] = df['text'].apply(chunk_text)

In [None]:
# Explode the 'chunks' lists so that every chunk gets its own row (one row per
# text to embed); the other columns are repeated for each chunk of an article.
flattened_df = df.explode('chunks')

In [None]:
# Add the two helper columns that are later joined into the vector id:
# a constant year string and a 1-based running number across all chunk rows.
flattened_df['year'] = '2024'
flattened_df['no'] =  range(1, len(flattened_df)+1)

In [None]:
# Embed all chunks with a single embed_documents call. Calling it once per row
# (one-element lists) never lets the embedding model batch its inputs, so the
# batch_size configured on embed_model would go unused.
chunk_texts = flattened_df['chunks'].tolist()
flattened_df['embeddings'] = embed_model.embed_documents(chunk_texts)  # one vector per chunk, same order

# Vector id in the form "<year>_<running number>", e.g. "2024_1"
flattened_df['id'] = flattened_df['year'].astype(str) + '_' + flattened_df['no'].astype(str)

# Metadata stored alongside each vector: the chunk text and the article URL it came from
flattened_df['metadata'] = flattened_df.apply(lambda x: { 'text': x['chunks'],'source': x['source']}, axis=1)

In [None]:
# (rows, columns) of the flattened frame — one row per chunk
flattened_df.shape

(24, 8)

In [None]:
# Preview the first five chunk rows together with their embeddings, ids and metadata
flattened_df.head(5)

Unnamed: 0,source,text,chunks,year,no,embeddings,id,metadata
0,https://www.hindustantimes.com/cricket/ipl-liv...,IPL 2024 MI vs CSK Highlights: An extraordinar...,IPL 2024 MI vs CSK Highlights: An extraordinar...,2024,1,"[-0.11759593337774277, 0.1136249527335167, -0....",2024_1,{'text': 'IPL 2024 MI vs CSK Highlights: An ex...
0,https://www.hindustantimes.com/cricket/ipl-liv...,IPL 2024 MI vs CSK Highlights: An extraordinar...,Ishan Kishan putting up an opening partnership...,2024,2,"[0.0033631916157901287, 0.10162778943777084, -...",2024_2,{'text': 'Ishan Kishan putting up an opening p...
0,https://www.hindustantimes.com/cricket/ipl-liv...,IPL 2024 MI vs CSK Highlights: An extraordinar...,MI were then given a boost thanks to a 60-run ...,2024,3,"[-0.0158567875623703, 0.034705981612205505, -0...",2024_3,{'text': 'MI were then given a boost thanks to...
0,https://www.hindustantimes.com/cricket/ipl-liv...,IPL 2024 MI vs CSK Highlights: An extraordinar...,in the 18th over. Rohit ended up scoring a cen...,2024,4,"[0.0503312349319458, 0.04003886133432388, -0.0...",2024_4,{'text': 'in the 18th over. Rohit ended up sco...
0,https://www.hindustantimes.com/cricket/ipl-liv...,IPL 2024 MI vs CSK Highlights: An extraordinar...,"Earlier, MI seemed to have pulled things back ...",2024,5,"[-0.020593110471963882, 0.10594867914915085, -...",2024_5,"{'text': 'Earlier, MI seemed to have pulled th..."


In [None]:
# For big dataset use batch method to upsert the data
# batch_size = 500
# for i in range(0, len(flattened_df), batch_size):
#     i_end = min(len(flattened_df), i+batch_size)
#     batch = flattened_df.iloc[i:i_end]
#     ids = [f"{x['year']}_{x['no']}" for i, x in batch.iterrows()]
#     texts = [x['chunks'] for i, x in batch.iterrows()]
#     embeds = [x['embeddings'] for i, x in batch.iterrows()]
#     # get metadata to store in Pinecone
#     metadata = [{'text': x['chunks'],
#                  'source': x['source']} for i, x in batch.iterrows()
#     ]
#     # add to Pinecone
#     index.upsert(vectors=zip(ids, embeds, metadata))

In [None]:
# Upsert vectors and metadata into the index.
# Records are sent in batches instead of one request per vector, which keeps the
# number of network round-trips small as the dataset grows.
UPSERT_BATCH_SIZE = 100

# Build every record once, straight from the columns (no per-row iterrows overhead).
records = [
    {"id": vector_id, "values": values, "metadata": metadata}
    for vector_id, values, metadata in zip(
        flattened_df["id"], flattened_df["embeddings"], flattened_df["metadata"]
    )
]

for start in range(0, len(records), UPSERT_BATCH_SIZE):
    index.upsert(vectors=records[start:start + UPSERT_BATCH_SIZE])

In [None]:
# Query the index statistics again now that the upsert has run,
# to see how many vectors the index reports
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 24}},
 'total_vector_count': 24}

In [None]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-13b-chat-hf'

# Use the currently selected CUDA device when one is available, else the CPU.
if cuda.is_available():
    device = f'cuda:{cuda.current_device()}'
else:
    device = 'cpu'

# Quantization settings so the large model loads with less GPU memory
# (this requires the `bitsandbytes` library):
#   - load_in_4bit: load the weights in 4-bit precision
#   - bnb_4bit_quant_type='nf4': use the NF4 4-bit quantization data type
#   - bnb_4bit_use_double_quant: nested quantization, for additional memory savings
#   - bnb_4bit_compute_dtype: run the computation in bfloat16 instead of float32
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16,
)

# begin initializing HF items, need auth token for these
hf_auth = HF_KEY
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# Load the quantized causal LM.
# `trust_remote_code` is intentionally NOT enabled: the Llama architecture ships
# with the pinned transformers release, so no code from the model repository is
# needed, and enabling the flag would allow repository-supplied Python to run.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',  # let the library decide where the weights are placed
    use_auth_token=hf_auth
)
model.eval()  # inference only
# NOTE(review): `device` is the current CUDA device (or 'cpu') computed earlier;
# with device_map='auto' the actual placement is chosen by the library.
print(f"Model loaded on {device}")



config.json:   0%|          | 0.00/587 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [None]:
# Load the tokenizer that belongs to the same model id as the LLM
# (the gated repository needs the Hugging Face auth token here as well)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Build the text-generation pipeline from the model and tokenizer
generate_text = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    # Return only the newly generated continuation. With the full text returned,
    # the LangChain chain's "answer" starts with a verbatim copy of the whole
    # prompt (instructions + retrieved context) before the model's actual reply.
    return_full_text=False,
    task='text-generation',
    # we pass model parameters here too
    temperature=0.3,  # lower values make sampling less random (only relevant when sampling is enabled)
    max_new_tokens=300,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [None]:
# Smoke-test the generation pipeline on its own with a standalone prompt.
# The pipeline returns a list with one dict per generation; print its text.
res = generate_text("If 1 and 1 is 11. If 2 and 2 is 22 Then What is 3 and 3?")
print(res[0]["generated_text"])

If 1 and 1 is 11. If 2 and 2 is 22 Then What is 3 and 3?

This is a classic lateral thinking puzzle that requires you to think outside the box. The answer is not a simple mathematical calculation, but rather a clever play on words.

The key to solving this puzzle is to understand that the numbers 1, 2, and 3 are being used as labels for objects, rather than as numerical values. In this case, the objects are "one" "two" and "three" themselves.

So, if 1 and 1 is 11, then 2 and 2 is 22, and 3 and 3 is 33.

Therefore, the answer to the question "What is 3 and 3?" is "33".


In [None]:
# Wrap the Llama-2 generation pipeline in LangChain's HuggingFacePipeline
# so it can be used as the LLM inside LangChain chains
llm = HuggingFacePipeline(pipeline=generate_text)

In [None]:
# Vector retrieval: expose the Pinecone index as a LangChain vector store
from langchain_pinecone import PineconeVectorStore
# Metadata key under which each chunk's text was stored at upsert time
text_field = "text"
# Positional arguments: the Pinecone index, the embedding model, and the metadata text key
vectorstore = PineconeVectorStore( index, embed_model, text_field )

In [None]:
# Prompt template and retrieval chain to retrieve relevant information from the vector DB.
# The template has two placeholders: {context} and {input}.
prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

{context}

Question: {input}""")

# Chain that combines the prompt with the LLM
document_chain = create_stuff_documents_chain(llm, prompt)

# Retriever over the Pinecone vector store, combined with the document chain
retriever = vectorstore.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [None]:
# Ask a question through the retrieval chain: the question goes in under the
# "input" key and the generated text is read back from the "answer" key
response = retrieval_chain.invoke({"input": "Who won CSK vs RCB IPL 2024 match?"})
print(response["answer"])

Human: Answer the following question based only on the provided context:

RCB vs CSK Highlights, IPL 2024: Royal Challengers Bengaluru triumphed against Chennai Super Kings in the clash of titans and made their way to the IPL 2024 playoffs. Faf du Plessis team defeated Ruturaj Gaikwad-led squad by 27 runs to join Kolkata Knight Riders, Rajasthan Royals and Sunrisers Hyderabad in the IPL 2024 playoffs.

All you need to know about CSK vs SRH clash in IPL 2024:

-Ruturaj Gaikwad was named the Player of the Match for his 54-ball 98.

-Chennai Super Kings (212/3) beat Sunrisers Hyderabad (134) by 78 runs in Chennai.

-MS Dhoni recorded his 150th win in the IPL.

-Tushar Deshpande recorded his career-best figures (4 for 27) in 3 overs.

The stage is set for one of the most crucial matches in the 2024 edition of the Indian Premier League (IPL) as Royal Challengers Bengaluru (RCB) goes against Chennai Super Kings on 18 May at the M. Chinnaswamy Stadium, Bengaluru. The match will decide if RCB'