In [1]:
# #@ Installing dependencies
# !pip install --user langchain
# !pip install --user wikipedia
# !pip install --user openai

In [2]:
# %pip install --upgrade --quiet  wikipedia
# %pip install --user tiktoken

# Getting Wikipedia data

In [3]:
#@ Loading data from openai wikipedia page
from langchain_community.document_loaders import WikipediaLoader
from langchain.text_splitter import CharacterTextSplitter

# Fetch the Wikipedia pages returned for the "openai" search query.
data = WikipediaLoader(query="openai").load()

# Chunk the pages with a character splitter built through the tiktoken
# encoder factory (chunk size 1000, overlap 20).
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=20,
    chunk_size=1000,
)
splits = splitter.split_documents(data)



In [4]:
#@ deleting summary data
for split in splits:
    # pop() with a default keeps this cell re-runnable: `del` raised KeyError
    # as soon as the key had already been removed (or was never present).
    split.metadata.pop("summary", None)

In [5]:
#@ Inspecting source
# One line per chunk: the value stored under the "source" metadata key.
for chunk in splits:
    source_url = chunk.metadata["source"]
    print(source_url)

https://en.wikipedia.org/wiki/OpenAI
https://en.wikipedia.org/wiki/OpenAI_Codex
https://en.wikipedia.org/wiki/ChatGPT
https://en.wikipedia.org/wiki/GPT-3
https://en.wikipedia.org/wiki/Generative_pre-trained_transformer
https://en.wikipedia.org/wiki/Greg_Brockman
https://en.wikipedia.org/wiki/Ilya_Sutskever
https://en.wikipedia.org/wiki/Sam_Altman
https://en.wikipedia.org/wiki/GPT-4
https://en.wikipedia.org/wiki/DALL-E
https://en.wikipedia.org/wiki/OpenAI_Five
https://en.wikipedia.org/wiki/Mira_Murati
https://en.wikipedia.org/wiki/GitHub_Copilot
https://en.wikipedia.org/wiki/Emmett_Shear
https://en.wikipedia.org/wiki/Removal_of_Sam_Altman_from_OpenAI
https://en.wikipedia.org/wiki/Whisper_(speech_recognition_system)
https://en.wikipedia.org/wiki/Gemini_(language_model)
https://en.wikipedia.org/wiki/GPT-2
https://en.wikipedia.org/wiki/Auto-GPT
https://en.wikipedia.org/wiki/Helen_Toner
https://en.wikipedia.org/wiki/Bard_(chatbot)
https://en.wikipedia.org/wiki/Reinforcement_learning_from_hu

In [6]:
#@ removing irrelevant sources
# The positions are applied one after another, so each index refers to the
# list as it stands after the previous removal, not to the listing printed by
# the cell above.
# NOTE(review): this mutates `splits` in place; running the cell a second time
# drops three more entries. Confirm the three positions hit the intended pages.
for position in (1, 11, 12):
    splits.remove(splits[position])



# Generating summaries of data and relationship extraction from database

In [7]:
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
import os

# Initialize the text splitter
rtext_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

# Initialize LLM
# The API key is taken from the OPENAI_API_KEY environment variable instead of
# being pasted into the notebook, so the secret is never saved with the file.
_openai_api_key = os.environ.get("OPENAI_API_KEY", "")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this cell."
    )
llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo", openai_api_key=_openai_api_key)

# --- Map stage -------------------------------------------------------------
# Prompt applied to each chunk: summarise it while keeping relationships and
# the dates they occurred on.
map_template = """The following is a set of documents
{all_data}
Based on this list of docs, please perform concise summaries while extracting essential relationships for relationships analysis later, please do include dates of actions or events, which are very important for timeline analysis later. Example: "Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, Friday)", which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(prompt=map_prompt, llm=llm)

# --- Input documents -------------------------------------------------------
# `all_data` is an alias of the chunk list; `all_text_data` holds only the
# text of each chunk.
all_data = splits
all_text_data = [chunk.page_content for chunk in all_data]

# --- Reduce stage ----------------------------------------------------------
# Prompt that merges the per-chunk summaries into the final summary.
reduce_template = """The following is set of summaries:
{all_data}
Take these and distill it into concise summaries of the articles while containing important relationships and events (including the timeline). Example: "Sam gets fired by the OpenAI board on 11/17/2023 or (Nov. 17th, Friday)", which showcases not only the relationship between Sam and OpenAI, but also when it happens.
Helpful Answer:"""
reduce_prompt = PromptTemplate.from_template(reduce_template)
reduce_chain = LLMChain(prompt=reduce_prompt, llm=llm)

# Feeds the documents it receives to `reduce_chain` through the `all_data`
# prompt variable.
combine_documents_chain = StuffDocumentsChain(
    document_variable_name="all_data",  # must match the placeholder in reduce_prompt
    llm_chain=reduce_chain,
)

# Combines and iteratively reduces the mapped documents.
reduce_documents_chain = ReduceDocumentsChain(
    token_max=4000,                                    # maximum number of tokens to group documents into
    collapse_documents_chain=combine_documents_chain,  # used if documents exceed the context of the stuff chain
    combine_documents_chain=combine_documents_chain,   # final chain that is called
)

# Maps `map_chain` over the documents, then combines the results.
map_reduce_chain = MapReduceDocumentsChain(
    return_intermediate_steps=False,                # do not return the map-step outputs
    document_variable_name="all_data",              # variable in the map prompt that receives the documents
    reduce_documents_chain=reduce_documents_chain,  # reduce chain
    llm_chain=map_chain,                            # map chain
)

# Re-split the documents (chunk size 1000, no overlap) before summarising.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_overlap=0, chunk_size=1000)
split_docs = text_splitter.split_documents(all_data)

# Execute the map-reduce chain over the re-split documents.
summarization_results = map_reduce_chain.run(split_docs)

  warn_deprecated(
  warn_deprecated(


In [8]:
# Total number of characters across all chunk texts, i.e. the input size
# before summarization.
length = sum(len(text) for text in all_text_data)
print(length)

78462


In [9]:
len(summarization_results)                                 # length after summarization

2397

In [10]:
summarization_results                                      # the summary text itself (its length is shown by the previous cell)

"1. OpenAI, a U.S.-based AI research organization, was founded in December 2015 with the goal of developing safe and beneficial artificial general intelligence. It consists of the non-profit OpenAI, Inc. and its for-profit subsidiary OpenAI Global, LLC.\n2. Microsoft made significant investments in OpenAI, providing $1 billion in 2019 and $10 billion in 2023, including compute resources on Microsoft's Azure cloud service.\n3. On November 17, 2023, Sam Altman was fired by the OpenAI board, but later returned as CEO. Greg Brockman was also removed as chairman and resigned as president, but returned to the company. Most of the board members resigned, and Bret Taylor became the new chairman.\n4. OpenAI developed GPT-3, a large language model released in 2020 with 175 billion parameters. GPT-4 was released in March 2023.\n5. OpenAI launched ChatGPT, a chatbot, on November 30, 2022, and gained over 100 million users by January 2023. It contributed to OpenAI's valuation of $29 billion.\n6. Op

In [11]:
#@ Storing summarization results
# Write as UTF-8 explicitly: the platform default encoding may be unable to
# represent every character in the generated summary.
with open('summary.txt', 'w', encoding='utf-8') as file:
    file.write(str(summarization_results))

# Extracting entities and relationships for knowledge graph

In [12]:
# # Integrates Large Language Models (LLMs) into spaCy pipelines, featuring a modular system for fast prototyping and prompting
# !pip install spacy-llm

In [13]:
!pip install --user spacy




In [25]:
from spacy.cli import download
# Fetch the en_core_web_md spaCy model package.
# NOTE(review): the visible cells only load en_core_web_sm — confirm this model is still needed.
download('en_core_web_md')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [26]:
# Fetch en_core_web_sm, the model that split_document_sent() loads for sentence splitting.
download('en_core_web_sm')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [47]:
import os
import json
import spacy
from collections import Counter
from pathlib import Path
from wasabi import msg
from spacy_llm.util import assemble
# Only fall back to the placeholder below when the variable is not already
# set, so a key exported in the environment is never overwritten with an
# empty string.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = ""                                           # insert your api key here
# Sentence splitting with a regular spaCy pipeline (en_core_web_sm)
def split_document_sent(text):
    """Split ``text`` into sentences using spaCy's ``en_core_web_sm`` pipeline.

    Args:
        text: Raw text to segment.

    Returns:
        A list with one whitespace-stripped string per detected sentence.
    """
    sentence_model = spacy.load("en_core_web_sm")  # the model is loaded on every call
    stripped_sentences = []
    for sentence in sentence_model(text).sents:
        stripped_sentences.append(sentence.text.strip())
    return stripped_sentences

# spacy-llm relationship extraction
def process_text(nlp, text, verbose=False):
    """Run ``text`` through ``nlp`` and optionally log what was extracted.

    Args:
        nlp: Pipeline object; it is invoked as ``nlp(text)``.
        text: Text to process.
        verbose: When true, print the text, the entities and the relations
            of the resulting doc through ``msg.text``.

    Returns:
        The object returned by ``nlp(text)``.
    """
    doc = nlp(text)
    if not verbose:
        return doc

    msg.text(f"Text: {doc.text}")
    msg.text(f"Entities: {[(ent.text, ent.label_) for ent in doc.ents]}")
    msg.text("Relations:")
    # Each relation refers to its two entities by index (dep, dest) into doc.ents.
    for r in doc._.rel:
        msg.text(f"  - {doc.ents[r.dep]} [{r.relation}] {doc.ents[r.dest]}")
    return doc

def run_pipeline(config_path, examples_path=None, verbose=False, text=None):
    """Extract entities and relations sentence by sentence and export them.

    The text is split into sentences with ``split_document_sent``; each
    sentence is run through the pipeline assembled from ``config_path``, and
    the collected results are written to ``processed_data.json``.

    Args:
        config_path: Path of the config file passed to ``assemble``.
        examples_path: Accepted for interface compatibility; not used.
        verbose: Forwarded to ``process_text`` to log each sentence.
        text: Text to process. When ``None`` (the default) the notebook-level
            ``summarization_results`` is used, as before.

    Returns:
        The list of per-sentence dicts (``text``, ``entities``,
        ``relations``) that was written to ``processed_data.json``.
    """
    if not os.getenv("OPENAI_API_KEY"):
        msg.fail("OPENAI_API_KEY env variable was not found. Set it and try again.", exits=1)

    # Fall back to the notebook-level summary only when no text is supplied,
    # so the function can also be used on other inputs.
    if text is None:
        text = summarization_results

    nlp = assemble(config_path, overrides={})

    # Per-sentence results plus running label/relation tallies.
    processed_data = []
    entity_counts = Counter()
    relation_counts = Counter()

    for sent in split_document_sent(text):
        doc = process_text(nlp, sent, verbose)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        # r.dep / r.dest are indices into doc.ents.
        relations = [(doc.ents[r.dep].text, r.relation, doc.ents[r.dest].text) for r in doc._.rel]

        processed_data.append({'text': doc.text, 'entities': entities, 'relations': relations})

        entity_counts.update(label for _, label in entities)
        relation_counts.update(relation for _, relation, _ in relations)

    # Export to JSON (explicit encoding so the result does not depend on the
    # platform default).
    with open('processed_data.json', 'w', encoding='utf-8') as f:
        json.dump(processed_data, f)

    # Display summary
    msg.text(f"Entity counts: {entity_counts}")
    msg.text(f"Relation counts: {relation_counts}")

    # Hand the results back so the caller's assignment receives the data.
    return processed_data

# Pipeline settings
config_path = Path("zeroshot.cfg")  # config file that run_pipeline passes to assemble()
examples_path = None                # no few-shot examples file is used
verbose = True                      # log every processed sentence

# Execute the extraction pipeline with the settings above.
file = run_pipeline(config_path, examples_path, verbose)

Text: 1[ENT0:CARDINAL]. OpenAI, a U.S.-based AI[ENT1:ORG] research organization,
was founded in December 2015[ENT2:DATE] with the goal of developing safe and
beneficial artificial general intelligence.
Entities: [('1', 'CARDINAL'), ('AI', 'ORG'), ('December 2015', 'DATE')]
Relations:
  - 1 [ORG] AI
  - 1 [DATE] December 2015
Text: It consists of the non-profit OpenAI, Inc.[ENT0:ORG] and its for-profit
subsidiary OpenAI Global, LLC[ENT1:ORG].
Entities: [('OpenAI, Inc.', 'ORG'), ('OpenAI Global, LLC', 'ORG')]
Relations:
  - OpenAI, Inc. [subsidiary of] OpenAI Global, LLC
Text: 2[ENT0:CARDINAL].
Entities: [('2', 'CARDINAL')]
Relations:
Text: Microsoft[ENT0:ORG] made significant investments in OpenAI[ENT1:ORG],
providing $1 billion[ENT2:MONEY] in 2019[ENT3:DATE] and $10 billion[ENT4:MONEY]
in 2023[ENT5:DATE], including compute resources on Microsoft[ENT6:ORG]'s
Azure[ENT7:ORG] cloud service.
Entities: [('Microsoft', 'ORG'), ('OpenAI', 'ORG'), ('$1 billion', 'MONEY'),
('2019', 'DATE'), ('$1