## Synthetic data creation (simplified)

#### Dependencies

In [1]:
%%bash
# install requirements
# NOTE: the %%bash cell magic has to be the very first line of the cell,
# otherwise IPython does not recognise it as a cell magic.
pip install --upgrade pip -q
pip install transformers~=4.37.2
pip install huggingface_hub~=0.20.3
pip install datasets~=2.16.1
pip install scikit-learn
pip install pandas
pip install tqdm
pip install python-dotenv
pip install vllm kaleido python-multipart typing-extensions==4.5.0 torch==2.1.0
pip install openai
pip install megablocks
pip install accelerate

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 26.0 MB/s eta 0:00:00
Collecting transformers~=4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 129.4/129.4 kB 2.6 MB/s eta 0:00:00
Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.4/8.4 MB 89.8 MB/s eta 0:00:00
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.2
    Uninstalling transformers-4.38.2:
      Successfully uninstalled transformers-4.38.2
Successfully installed transformers-4.37.2
Collecting datasets~=2.16.1
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets~=2.16.1)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting xxhash (from datasets~=2.16.1)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.met

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sqlalchemy 2.0.28 requires typing-extensions>=4.6.0, but you have typing-extensions 4.5.0 which is incompatible.
pydantic-core 2.16.3 requires typing-extensions!=4.7.0,>=4.6.0, but you have typing-extensions 4.5.0 which is incompatible.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 2.1.0 which is incompatible.
torchtext 0.17.1 requires torch==2.2.1, but you have torch 2.1.0 which is incompatible.
torchvision 0.17.1+cu121 requires torch==2.2.1, but you have torch 2.1.0 which is incompatible.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.2.1+cu121 requires torch==2.2.1, but you have torch 2.1.0 which is incompatible.
torchtext 0.17.1 requires torch==2.2.1, b

### Import

In [1]:
import os
from tqdm import tqdm
import ast
import numpy as np
import pandas as pd
import random
import json
from datetime import datetime
import os
import requests
from datasets import load_dataset
import random

print("Notebook running")

Notebook running


In [2]:
# log in to the Hugging Face Hub with your HF token
# you need a Hugging Face account and a token created here: https://huggingface.co/settings/tokens
# once logged in, the token can be retrieved with huggingface_hub.get_token()
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from google.colab import drive
# Mount Google Drive so generated summaries can be persisted across sessions.
drive.mount('/content/drive')
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs("/content/drive/MyDrive/new_summary", exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load and prepare dataset

In [4]:
# Seed Python's `random` module for this session.
SEED = 42
random.seed(SEED)

# Load the train split and keep only shard 0 of 100 so generation stays fast.
dataset = (
    load_dataset("BEE-spoke-data/financial-news-articles-filtered")["train"]
    .shard(num_shards=100, index=0)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
dataset

Dataset({
    features: ['title', 'text', 'url', 'word_count'],
    num_rows: 1998
})

In [6]:
dataset[0]

{'title': 'Global Markets: Asia shares reach decade top on China data, dollar in doldrums',
 'text': "NEW YORK (Reuters) - European stocks closed lower on Tuesday, the first trading day of 2018, while Wall Street advanced and the U.S. dollar fell to its weakest in over three months against key currencies.\nA trader works on the floor of the New York Stock Exchange shortly after the opening bell in New York, U.S., January 2, 2018. REUTERS/Lucas Jackson MSCI’s gauge of stocks across the globe .MIWD PUS gained 0.61 percent. The index had set scores of record highs and rose by one-fifth in value in 2017.\nMajor stock indexes closed 2017 with their best performance since 2013. In the U.S. market, the advance came amid strong economic growth and corporate earnings, low interest rates and hopes, now realized, of U.S. corporate tax cuts.\nU.S. equity indexes advanced on Tuesday, buoyed by gains in technology and consumer discretionary stocks.\nIncreases in Apple ( AAPL.O ), Facebook ( FB.O ), 

In [7]:
def full_article_text(example):
    """Add a ``full_article`` field made of the title, a blank line, and the text.

    The mapping is updated in place and also returned, which is the shape
    ``Dataset.map`` expects from a per-example function.
    """
    headline = example["title"]
    body = example["text"]
    example["full_article"] = "\n\n".join((headline, body))
    return example

# Apply full_article_text to every example; the result gains a `full_article` column.
dataset = dataset.map(full_article_text)

# Print the dataset summary to confirm the new column is present.
print(dataset)

Dataset({
    features: ['title', 'text', 'url', 'word_count', 'full_article'],
    num_rows: 1998
})


### Simple Prompts / Instructions

In [8]:
# Instruction template for the plain summarisation task.
# `{article}` is a str.format placeholder that is filled in per article later on.
prompt = """You are a highly qualified expert trained financial agent.

Your task is to generate a short summary of a financial news \
article published on a blog post. Please include all the important financial information in the summary.

Summarize the news article below, delimited by triple \
backticks, in at most 100 words.

Article: ```{article}```
Summary:
"""

from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
# Wrap the template as a single user turn and render it with the model's chat
# template. tokenize=False returns a string rather than token ids.
chat_financial = [{"role": "user", "content": prompt}]
# The literal "<s>" is stripped from the rendered string — presumably because the
# tokenizer adds the BOS token again when the prompt is encoded (TODO confirm).
prompt_financial = tokenizer.apply_chat_template(chat_financial, tokenize=False).replace("<s>", "")


In [9]:
prompt_financial

'[INST] You are a highly qualified expert trained financial agent.\n\nYour task is to generate a short summary of a financial news article published on a blog post. Please include all the important financial information in the summary.\n\nSummarize the news article below, delimited by triple backticks, in at most 100 words.\n\nArticle: ```{article}```\nSummary:\n [/INST]'

In [10]:
# torch_dtype="auto" loads the weights in the precision stored in the checkpoint
# config instead of the float32 default, which would roughly double GPU memory use.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.1",
    device_map="cuda",
    torch_dtype="auto",
)

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

### Test simplified code for blog

In [11]:
# docs on the parameters accepted by `model.generate`:
# https://huggingface.co/docs/transformers/main_classes/text_generation
generation_params = dict(
    # `temperature` only has an effect in sampling mode; without do_sample=True
    # generate() decodes greedily and ignores it. Note that sampled outputs are
    # not reproducible unless the torch RNG is seeded as well.
    do_sample=True,
    temperature=0.8,
    max_new_tokens=200,
    num_return_sequences=1,
    # The tokenizer's EOS id doubles as the padding id here.
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

def generate_text(prompt=None, generation_params=None,model=model,tokenizer=tokenizer):
    """Generate a completion for ``prompt`` and return only the newly generated text.

    Args:
        prompt: Fully formatted prompt string to feed to the model.
        generation_params: Optional dict of keyword arguments forwarded to
            ``model.generate``. ``None`` means no extra arguments.
        model: Causal language model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.

    Returns:
        The decoded completion of the first returned sequence, with special
        tokens removed and surrounding whitespace stripped.
    """
    # Put the inputs on whatever device the model lives on instead of
    # hard-coding "cuda".
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    response = model.generate(**encoded, **(generation_params or {}))
    # The returned sequences start with the prompt tokens; drop them by length.
    # This is more reliable than string-replacing the prompt in the decoded
    # text, which silently fails whenever decoding does not reproduce the
    # prompt character-for-character.
    prompt_length = encoded["input_ids"].shape[1]
    completion_ids = response[:, prompt_length:]
    decoded = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)
    return decoded[0].strip()


In [12]:
output_simple = []
# Slice the dataset before selecting the column so only the five rows that are
# needed get read, rather than materialising the whole column first.
for text in tqdm(dataset[:5]["full_article"]):
    # add text into the prompt template
    prompt_formatted = prompt_financial.format(article=text)
    # run the locally loaded model on the formatted prompt
    output = generate_text(
        prompt=prompt_formatted, generation_params=generation_params
    )
    output_simple.append(output.strip())


100%|██████████| 5/5 [00:41<00:00,  8.24s/it]


In [15]:
dataset["full_article"][0]

"Global Markets: Asia shares reach decade top on China data, dollar in doldrums\n\nNEW YORK (Reuters) - European stocks closed lower on Tuesday, the first trading day of 2018, while Wall Street advanced and the U.S. dollar fell to its weakest in over three months against key currencies.\nA trader works on the floor of the New York Stock Exchange shortly after the opening bell in New York, U.S., January 2, 2018. REUTERS/Lucas Jackson MSCI’s gauge of stocks across the globe .MIWD PUS gained 0.61 percent. The index had set scores of record highs and rose by one-fifth in value in 2017.\nMajor stock indexes closed 2017 with their best performance since 2013. In the U.S. market, the advance came amid strong economic growth and corporate earnings, low interest rates and hopes, now realized, of U.S. corporate tax cuts.\nU.S. equity indexes advanced on Tuesday, buoyed by gains in technology and consumer discretionary stocks.\nIncreases in Apple ( AAPL.O ), Facebook ( FB.O ), Alphabet ( GOOGL.O 

In [14]:
output_simple[0]

"The global stock markets reached their highest point in a decade in 2017, driven by strong economic growth, corporate earnings, low interest rates, and hopes of U.S. corporate tax cuts. The S&P 500, Dow Jones Industrial Average, and Nasdaq Composite all gained on Tuesday, with technology and consumer discretionary stocks leading the way. The pan-European STOXX 600 index fell slightly, while the Shanghai blue chips and MSCI's emerging market stock index rose. The dollar fell to its weakest in over three months against key currencies, with expectations of a slower pace of interest rate increases by the Federal Reserve and a tepid U.S. inflation picture. Other currencies, including the euro and Japanese yen, strengthened, while U.S. Treasury yields rose in line with European government yields. Oil prices fell slightly, while copper, gold, and other"

In [16]:
dataset["full_article"][1]

'OGSystems’ CEO to Transition to Chairman of the Board\n\nCompany Announces New Executive Leadership Team\nCHANTILLY, Va.--(BUSINESS WIRE)-- OGSystems, a leader in technology innovation for the Department of Defense (DoD) and Intelligence Community (IC), today announced that co-founder Omar Balkissoon will transition from Chief Executive Officer (CEO) to Chairman of the Board. Garrett Pagon, President and co-founder, will step into the role of CEO. Balkissoon will focus on further building the technology and organization of OGSystems’ recent tech start-up spinout, GeoSpark Analytics, serving as the new company’s CEO ( http://bizj.us/1pgb99 ). OGSystems also announced the recent promotion of Steve Martin as the Chief Operating Officer (COO) and Dan Ehrmantraut to Chief Financial Officer (CFO).\nThis press release features multimedia. View the full release here: http://www.businesswire.com/news/home/20180109005313/en/\nOGSystems announces new executive leadership team. (Pictured left to 

In [17]:
output_simple[1]

"OGSystems, a leader in technology innovation for the Department of Defense (DoD) and Intelligence Community (IC), announced that co-founder Omar Balkissoon will transition from Chief Executive Officer (CEO) to Chairman of the Board. Garrett Pagon, President and co-founder, will step into the role of CEO. Balkissoon will focus on further building the technology and organization of OGSystems' recent tech start-up spinout, GeoSpark Analytics, serving as the new company's CEO. OGSystems also announced the recent promotion of Steve Martin as the Chief Operating Officer (COO) and Dan Ehrmantraut to Chief Financial Officer (CFO)."

### SPR prompts

https://github.com/daveshap/SparsePrimingRepresentations

In [22]:
# Instruction template for the Sparse Priming Representation (SPR) task.
# `{article}` is a str.format placeholder that is filled in per article later on.
# Note: this rebinds `prompt`, replacing the summarisation template defined earlier.
prompt = """# MISSION
You are a Sparse Priming Representation (SPR) writer. An SPR is a particular kind of use of language for advanced NLP, NLU, and NLG tasks, particularly useful for the latest generation of Large Language Models (LLMs). You will be given information by the USER which you are to render as an SPR.

# THEORY
LLMs are a kind of deep neural network. They have been demonstrated to embed knowledge, abilities, and concepts, ranging from reasoning to planning, and even to theory of mind. These are called latent abilities and latent content, collectively referred to as latent space. The latent space of an LLM can be activated with the correct series of words as inputs, which will create a useful internal state of the neural network. This is not unlike how the right shorthand cues can prime a human mind to think in a certain way. Like human minds, LLMs are associative, meaning you only need to use the correct associations to "prime" another model to think in the same way.

# METHODOLOGY
Render the input as a distilled list of succinct statements, assertions, associations, concepts, analogies, and metaphors. The idea is to capture as much, conceptually, as possible but with as few words as possible. Write it in a way that makes sense to you, as the future audience will be another language model, not a human. Use complete sentences.

Generate a SPR for the following article. Format output SPR as numbered list:
Input: ```{article}```
SPR:
"""

# Wrap the template as a single user turn and render it with the chat template
# as a plain string (tokenize=False), then strip the literal "<s>" from it.
chat_financial_SPR = [{"role": "user", "content": prompt}]
prompt_financial_SPR = tokenizer.apply_chat_template(chat_financial_SPR, tokenize=False).replace("<s>", "")


In [19]:
output_spr = []
# Slice the dataset before selecting the column so only the two rows that are
# needed get read, rather than materialising the whole column first.
for text in tqdm(dataset[:2]["full_article"]):
    # add text into the prompt template
    prompt_formatted = prompt_financial_SPR.format(article=text)
    # run the locally loaded model on the formatted prompt
    output = generate_text(
        prompt=prompt_formatted, generation_params=generation_params
    )
    output_spr.append(output.strip())


100%|██████████| 2/2 [00:18<00:00,  9.05s/it]


In [20]:
print(output_spr[0])

1. Global Markets:
	* Asia shares reached a decade top on China data.
	* European stocks closed lower on the first trading day of 2018.
	* Wall Street advanced and the U.S. dollar fell to its weakest in over three months against key currencies.
	* MSCI's gauge of stocks across the globe gained 0.61 percent.
	* Major stock indexes closed 2017 with their best performance since 2013.
	* U.S. equity indexes advanced on Tuesday, buoyed by gains in technology and consumer discretionary stocks.
	* Increases in Apple, Facebook, Alphabet, and Microsoft shares pulled the S&P 500 index higher on Tuesday.
	* The Dow Jones Industrial Average rose 59.79 points, or 0.24 percent, to 24


In [21]:
print(output_spr[1])

1. OGSystems CEO to transition to Chairman of the Board.
2. New executive leadership team announced.
3. Balkissoon to focus on GeoSpark Analytics as CEO.
4. Pagon to become CEO of OGSystems.
5. Martin and Ehrmantraut promoted to COO and CFO, respectively.
6. Balkissoon and Pagon co-founded OGSystems in 2005.
7. Pagon served as an intelligence officer for the US Air Force before founding OGSystems.
8. OGSystems has become a recognized and trusted prime contractor for the IC.
9. Pagon is responsible for scaling the company while maintaining its innovative spirit.
10. Martin has overseen increases in key metrics such as employee retention.
11. Ehrmantraut established the first in-house corporate finance and


# vLLM for concurrent queries

In [1]:
from datasets import load_dataset
import random
from tqdm import tqdm
# Seed Python's `random` module for this session.
SEED = 42
random.seed(SEED)

# Load the train split and keep only shard 0 of 100 so generation stays fast.
dataset = (
    load_dataset("BEE-spoke-data/financial-news-articles-filtered")["train"]
    .shard(num_shards=100, index=0)
)
def full_article_text(example):
    """Add a ``full_article`` field made of the title, a blank line, and the text.

    The mapping is updated in place and also returned, which is the shape
    ``Dataset.map`` expects from a per-example function.
    """
    headline = example["title"]
    body = example["text"]
    example["full_article"] = "\n\n".join((headline, body))
    return example

# Apply full_article_text to every example; the result gains a `full_article` column.
dataset = dataset.map(full_article_text)

# Print the dataset summary to confirm the new column is present.
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset({
    features: ['title', 'text', 'url', 'word_count', 'full_article'],
    num_rows: 1998
})


In [2]:
from vllm import LLM, SamplingParams
import torch
# vLLM engine for the same Mistral instruct checkpoint, with bfloat16 weights.
llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    dtype=torch.bfloat16,
)
# Very low temperature and a generous cap of 4000 generated tokens per request.
sampling_params = SamplingParams(
    temperature=0.01,
    max_tokens=4000,
)

INFO 03-18 01:13:09 llm_engine.py:73] Initializing an LLM engine with config: model='mistralai/Mistral-7B-Instruct-v0.1', tokenizer='mistralai/Mistral-7B-Instruct-v0.1', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, seed=0)
INFO 03-18 01:13:20 llm_engine.py:222] # GPU blocks: 9460, # CPU blocks: 2048


In [3]:
# Instruction template for the Sparse Priming Representation (SPR) task.
# `{article}` is a str.format placeholder that is filled in per article later on.
# The final instruction asks for a numbered list and nothing but the SPR itself.
prompt = """# MISSION
You are a Sparse Priming Representation (SPR) writer. An SPR is a particular kind of use of language for advanced NLP, NLU, and NLG tasks, particularly useful for the latest generation of Large Language Models (LLMs). You will be given information by the USER which you are to render as an SPR.

# THEORY
LLMs are a kind of deep neural network. They have been demonstrated to embed knowledge, abilities, and concepts, ranging from reasoning to planning, and even to theory of mind. These are called latent abilities and latent content, collectively referred to as latent space. The latent space of an LLM can be activated with the correct series of words as inputs, which will create a useful internal state of the neural network. This is not unlike how the right shorthand cues can prime a human mind to think in a certain way. Like human minds, LLMs are associative, meaning you only need to use the correct associations to "prime" another model to think in the same way.

# METHODOLOGY
Render the input as a distilled list of succinct statements, assertions, associations, concepts, analogies, and metaphors. The idea is to capture as much, conceptually, as possible but with as few words as possible. Write it in a way that makes sense to you, as the future audience will be another language model, not a human. Use complete sentences.

Generate a SPR for the following article. Format output SPR as numbered list like "1." and only generate the output SPR do not generate anything else:
Input: ```{article}```
SPR:
"""
from transformers import AutoTokenizer
# The tokenizer is only used here to render the chat template as text.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
# Wrap the template as a single user turn and render it as a plain string
# (tokenize=False), then strip the literal "<s>" from it.
chat_financial_SPR = [{"role": "user", "content": prompt}]
prompt_financial_SPR = tokenizer.apply_chat_template(chat_financial_SPR, tokenize=False).replace("<s>", "")

In [4]:
batch_size = 32
current_batch = []
vllm_responses = []
# Read the column once. Indexing dataset["full_article"] inside the loop would
# rebuild the full list on every iteration just to take its length.
articles = dataset["full_article"]
last_index = len(articles) - 1
# Wrapping the list (not the enumerate object) lets tqdm know the total and
# show a real progress bar with an ETA.
for i, text in enumerate(tqdm(articles)):
    # add text into the prompt template
    prompt_formatted = prompt_financial_SPR.format(article=text)
    current_batch.append(prompt_formatted)
    # Flush when the batch is full, and once more for the final partial batch.
    if len(current_batch) == batch_size or i == last_index:
        vllm_responses.extend(llm.generate(current_batch, sampling_params, use_tqdm=False))
        current_batch = []

1998it [1:06:25,  1.99s/it]


In [6]:
from google.colab import drive
import os
# Mount Google Drive so generated summaries can be persisted across sessions.
drive.mount('/content/drive')
# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs("/content/drive/MyDrive/new_summary", exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
def clean_response(x):
  """Return the text of the first entry in ``x.outputs``, stripped of surrounding whitespace."""
  first_completion = x.outputs[0]
  return first_completion.text.strip()
# Convert the dataset to a DataFrame and attach one cleaned response per row.
df = dataset.to_pandas()
df["vllm_response"] = [clean_response(response) for response in vllm_responses]

In [17]:
df

Unnamed: 0,title,text,url,word_count,full_article,vllm_response
0,Global Markets: Asia shares reach decade top o...,NEW YORK (Reuters) - European stocks closed lo...,https://in.reuters.com/article/global-markets/...,782,Global Markets: Asia shares reach decade top o...,1. Global Markets:\na. Asia shares reach decad...
1,OGSystems’ CEO to Transition to Chairman of th...,Company Announces New Executive Leadership Tea...,http://www.cnbc.com/2018/01/09/business-wire-o...,689,OGSystems’ CEO to Transition to Chairman of th...,1. OGSystems CEO to transition to Chairman of ...
2,House fire kills seven children in UAE,DUBAI (Reuters) - A house fire killed seven ch...,https://www.reuters.com/article/us-emirates-fi...,193,House fire kills seven children in UAE\n\nDUBA...,1. House fire in UAE kills seven children.\n2....
3,How John Kelly’s White House made “build a wal...,Getty Images White House Chief of Staff John K...,https://www.cnbc.com/2018/01/21/how-john-kelly...,1428,How John Kelly’s White House made “build a wal...,"1. The White House is divided on immigration, ..."
4,"Stephen M. Cutler, Former General Counsel of J...",NEW YORK--(BUSINESS WIRE)-- Simpson Thacher & ...,http://www.cnbc.com/2018/01/31/business-wire-s...,634,"Stephen M. Cutler, Former General Counsel of J...","1. Stephen M. Cutler, former General Counsel o..."
...,...,...,...,...,...,...
1993,TherapeuticsMD to Host First Quarter Financial...,"BOCA RATON, Fla.--(BUSINESS WIRE)-- Therapeuti...",http://www.cnbc.com/2018/05/01/business-wire-t...,872,TherapeuticsMD to Host First Quarter Financial...,1. TherapeuticsMD to Host First Quarter Financ...
1994,Indian child killer sentenced to death 23 days...,Indian child killer sentenced to death 23 days...,https://www.reuters.com/video/2018/05/25/india...,148,Indian child killer sentenced to death 23 days...,"1. Naveen Gadke, an Indian man, was arrested o..."
1995,"Vertex Energy, Inc. Announces 2018 First Quart...",Revenue Rose 19% Year-Over-Year; Gross Profit ...,http://www.cnbc.com/2018/05/15/globe-newswire-...,2500,"Vertex Energy, Inc. Announces 2018 First Quart...","1,00000000000000000000000000000000000000000000..."
1996,Cryptocurrencies and blockchain are becoming a...,The cryptocurrency industry is getting so hot ...,https://www.cnbc.com/2018/05/04/cryptocurrenci...,570,Cryptocurrencies and blockchain are becoming a...,1. Cryptocurrencies and blockchain are becomin...


In [18]:
df.iloc[1995].vllm_response

'1,0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

In [24]:
df[df.vllm_response.apply(lambda x : len(x.split("\n")) == 1)]

Unnamed: 0,title,text,url,word_count,full_article,vllm_response
34,UMC Reports Fourth Quarter 2017 Results,Full-year foundry revenue in USD increased 7%Y...,http://www.cnbc.com/2018/01/24/business-wire-u...,5528,UMC Reports Fourth Quarter 2017 Results\n\nFul...,S
100,Central Valley Community Bancorp Reports Earni...,"FRESNO, Calif.--(BUSINESS WIRE)-- The Board of...",http://www.cnbc.com/2018/01/24/business-wire-c...,6946,Central Valley Community Bancorp Reports Earni...,Central
132,CNBC TRANSCRIPT: JPMORGAN CHASE CHAIRMAN & CEO...,"WHEN: Today, Wednesday, January 24, 2018\nWHER...",http://www.cnbc.com/2018/01/24/cnbc-transcript...,4866,CNBC TRANSCRIPT: JPMORGAN CHASE CHAIRMAN & CEO...,The THAT THAT THAT THAT THAT THAT THAT THAT TH...
281,LIVE MARKETS-Reasons to smile about European e...,Jan 22 (Reuters) - Welcome to the home for rea...,https://www.reuters.com/article/europe-stocks/...,2614,LIVE MARKETS-Reasons to smile about European e...,2
387,Broadwind Energy Announces Q4 and Full Year 20...,Highlights:\n2017 orders of $88 million includ...,http://www.cnbc.com/2018/02/27/globe-newswire-...,4779,Broadwind Energy Announces Q4 and Full Year 20...,S 1 2 1 1 2 2 2 1 2 1 1 1 1 1 2 1 1 1 1 1 1 1 ...
...,...,...,...,...,...,...
1907,Dream Alternatives Reports First Quarter Resul...,This press release contains forward-looking in...,http://www.cnbc.com/2018/05/07/globe-newswire-...,4864,Dream Alternatives Reports First Quarter Resul...,The information and
1927,Ctrip Reports Unaudited First Quarter of 2018 ...,"SHANGHAI, May 22, 2018 /PRNewswire/ -- Ctrip.c...",http://www.cnbc.com/2018/05/22/pr-newswire-ctr...,4338,Ctrip Reports Unaudited First Quarter of 2018 ...,The
1930,FMC Corporation Announces First Quarter 2018 R...,"PHILADELPHIA, May 2, 2018 /PRNewswire/ --\nFir...",http://www.cnbc.com/2018/05/02/pr-newswire-fmc...,3221,FMC Corporation Announces First Quarter 2018 R...,``````````````````````````````````````````````...
1946,AdvanSix Announces First Quarter 2018 Financia...,"Sales of $359 million, down 5% versus prior ye...",http://www.cnbc.com/2018/05/04/business-wire-a...,2559,AdvanSix Announces First Quarter 2018 Financia...,


In [25]:
# Keep only the rows whose response contains at least one newline,
# i.e. spans more than a single line.
is_multiline = df["vllm_response"].map(lambda response: "\n" in response)
df = df[is_multiline]

In [26]:
# Write the filtered summaries to Google Drive, without the index column.
summaries_csv_path = "/content/drive/MyDrive/new_summary/financial_summaries.csv"
df.to_csv(summaries_csv_path, index=False)

In [35]:
df.sample(1).vllm_response.iloc[0]

"1. Giants win 9-5 over Rockies in wild game.\n2. Belt and Hundley hit back-to-back homers in seventh inning.\n3. Giants salvage split of four-game series.\n4. Rockies kept from moving into first place in NL West.\n5. Gorkys Hernandez led off with a two-run homer in fifth.\n6. Longoria drew a one-out walk in seventh.\n7. McGee entered to face left-handed-hitting Belt.\n8. Belt homered on a 1-0 pitch for his first round-tripper against a southpaw this season.\n9. Hundley homered on a 3-2 pitch to add final blow.\n10. Longoria's sacrifice fly gave Giants a first-inning lead.\n11. Rockies scored twice apiece in fourth and fifth thanks to Story and Cuevas.\n12. Giants wasted no time beginning comeback after Hernandez's homer.\n13. Tomlinson gave San Francisco the lead with a two-RBI triple in sixth.\n14. Arenado tied the game again with an RBI single in top of seventh.\n15. Giants answered with winning rally in bottom of inning.\n16. Dyson earned the win despite giving up tying run in seve