In [11]:
import json
import re

input_file = 'news-sample.jsonl'  #Input JSON
output_file = 'news-sample-1.jsonl'  #Output JSON

#Removes non ASCII chars
non_ascii_re = re.compile(r'[^\x00-\x7F]')

with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        record = json.loads(line)
        cleaned_record = {k: non_ascii_re.sub(' ', v) if isinstance(v, str) else v for k, v in record.items()}
        outfile.write(json.dumps(cleaned_record) + '\n')


In [16]:
import json
#from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import PineconeException
import pinecone

PINECONE_API_KEY = "63651404-1b59-483a-9c49-0b5c6b3aa9ee"
pc = Pinecone(api_key = PINECONE_API_KEY)
#pinecone.init(api_key=PINECONE_API_KEY, environment="us-east-1-aws")
index_name = "news-articles"

index = pc.Index(index_name)

embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

documents_file = "news-sample-1.jsonl"
with open(documents_file, 'r') as file:
    data = [json.loads(line) for line in file]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

chunks_with_metadata = []
print("docs loaded.")
for doc in data:
    metadata = {k: doc[k] for k in doc}
    chunks = text_splitter.split_text(doc["content"])
    
    for chunk in chunks:
        chunks_with_metadata.append({
            "text": chunk,
            "metadata": metadata
        })
print("chunks created.")

for chunk_data in chunks_with_metadata:
    chunk_text = chunk_data["text"]
    metadata = chunk_data["metadata"]

    try:
        chunk_embedding = embedding_model.encode(chunk_text).tolist()
        
        #Storing the vector in Pinecone
        index.upsert(vectors=[{
            "id": f"{metadata['title']}_{hash(chunk_text)}",
            "values": chunk_embedding,
            "metadata": metadata
        }])
    except Exception as e:
        print(f"Skipping chunk due to metadata size limit: {e}")
        continue

print("Chunks and metadata have been successfully stored in Pinecone.")


docs loaded.
chunks created.
Skipping chunk due to metadata size limit: UNKNOWN:Error received from peer  {created_time:"2024-11-01T02:50:57.86133232+00:00", grpc_status:3, grpc_message:"Metadata size is 53722 bytes, which exceeds the limit of 40960 bytes per vector"}
Skipping chunk due to metadata size limit: UNKNOWN:Error received from peer  {grpc_message:"Metadata size is 53722 bytes, which exceeds the limit of 40960 bytes per vector", grpc_status:3, created_time:"2024-11-01T02:50:57.89564727+00:00"}
Skipping chunk due to metadata size limit: UNKNOWN:Error received from peer  {created_time:"2024-11-01T02:50:57.928909414+00:00", grpc_status:3, grpc_message:"Metadata size is 53722 bytes, which exceeds the limit of 40960 bytes per vector"}
Skipping chunk due to metadata size limit: UNKNOWN:Error received from peer  {grpc_message:"Metadata size is 53722 bytes, which exceeds the limit of 40960 bytes per vector", grpc_status:3, created_time:"2024-11-01T02:50:57.962014533+00:00"}
Skipping 

In [35]:
# query = "Give me news about machine learning"
# query = "Give me news about machine learning from the source MyInforms"
query = "Give me news about machine learning from the source MyInforms about the company Baidu"
query_embedding = embedding_model.encode(query).tolist()
top_k = 5
similar_docs = index.query(vector = query_embedding, top_k = top_k, include_metadata = True)

print("Top similar documents are:------>")
context = ""
for match in similar_docs["matches"]:
    print(f"ID: {match['id']}, Score: {match['score']}")
    context = context + match["metadata"]["content"] + "\n";
    print("Metadata:", match["metadata"]["content"])
    print("-----------------------------------------------------------------------------------")

# print(context)


Top similar documents are:------>
ID: Baidu take on Siri and Google Now with Duer AI assistant_3243387392327520846, Score: 0.66172814
Metadata: Chinese giant Baidu is getting in on the phone personal assistant game with the launch of Duer, marking a major improvement on the previous system launched on the Baidu app three years ago.
 he post appeared first on Silicon  
Chinese giant Baidu is getting in on the phone personal assistant game with the launch of Duer, marking a major improvement on the previous system launched on the Baidu app three years ago. 

Baidu s Duer, which effectively translates to  Du Secretary  was given a major demonstration with aims of using artificial intelligence (AI) and machine learning to challenge the systems developed by Apple and Google. 

Much has been made of its development considering that one of the driving forces behind its machine learning technology is world-renowned expert in the field, Andrew Ng, who joined the company back in 2014. 

Much lik

In [36]:
import ollama

# print(context)

prompt = f"""
**###IMPORTANT**: Restrict yourself to the provided context and do not include any information that is not present in the context.
You are an expert in answering queries by using the provided context. Strictly use the provided context and query to create a news summary.
Follow the procdure below:-
- while generating each sentence, verify whether the generated content does not have any information which is not present in the context.
query: {query}
context: {context}"""


stream = ollama.chat(
    model='llama3.1',
    messages=[{'role': 'user', 'content': prompt}],
    stream=True,
)

response_content = ''
for chunk in stream:
    response_content += chunk['message']['content']

print("--------------------------------------------------------------")
print(response_content)




--------------------------------------------------------------
There are two articles here. I'll summarize each one:

**Article 1: Apple's Machine Learning Efforts**

Apple is trying to challenge Google's popular Google Now service by improving Siri's capabilities through machine learning. To do this, Apple is courting key researchers from Google, Facebook, Amazon, and other companies. The goal is to improve Siri's various functions, such as anticipating what users want to do. However, this may require Apple to compromise its commitment to consumer privacy.

**Article 2: Quantcast Expands in Texas**

Quantcast, a big data company, is expanding its presence in the US with a new office in Austin, Texas. The company processes real-time data and provides insights for brands and publishers. With over 600 employees, Quantcast aims to grow quickly and cost-effectively, driving top-line revenue and margin.

**Article 3: Baidu Launches Phone Personal Assistant**

Baidu, the Chinese giant, has l

In [37]:
from sklearn.metrics.pairwise import cosine_similarity

response_embedding = embedding_model.encode(response_content).tolist()

cosine_sim = cosine_similarity([query_embedding], [response_embedding])

print(f"Cosine Similarity: {cosine_sim[0][0]}")


Cosine Similarity: 0.5146266939736258


In [42]:

Cosine Similarity of query 1 : 0.37691730043795424
Cosine Similarity of query 2: 0.3240980821559353
Cosine Similarity of query 3: 0.5146266939736258

SyntaxError: invalid syntax (2837872663.py, line 1)