In [1]:
!pip install pymupdf



In [2]:
import os
print(os.listdir('/content/'))

['.config', 'api.env', 'Lupin_India_2023.pdf', 'cleaned_texts', 'tokenized_data.jsonl', 'structured_data.jsonl', 'mistral_lupin_lora_final', 'extracted_texts', 'Lupin_India_2024.pdf', 'Lupin_India_2022.pdf', 'sample_data']


In [3]:
# pdf extraction

import fitz
def extract_text_from_pdf(pdf_path: str) -> str:
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}")
        return ""
def save_text_to_file(text: str, output_path: str):
    try:
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(text)
        print(f"Extracted text saved to: {output_path}")
    except Exception as e:
        print(f"Error saving text to {output_path}: {e}")
if __name__ == '__main__':
    pdf_files = {
        "Lupin_India_2022": "/content/Lupin_India_2022.pdf",
        "Lupin_India_2023": "/content/Lupin_India_2023.pdf",
        "Lupin_India_2024": "/content/Lupin_India_2024.pdf"
    }

    output_dir = "extracted_texts"
    os.makedirs(output_dir, exist_ok=True)
    for name, pdf_path in pdf_files.items():
        extracted_text = extract_text_from_pdf(pdf_path)
        if extracted_text:
            output_file = os.path.join(output_dir, f"{name}.txt")
            save_text_to_file(extracted_text, output_file)
        else:
            print(f"No text extracted from {pdf_path}")

Extracted text saved to: extracted_texts/Lupin_India_2022.txt
Extracted text saved to: extracted_texts/Lupin_India_2023.txt
Extracted text saved to: extracted_texts/Lupin_India_2024.txt


In [4]:
# text_cleaning

import re
def clean_text(text: str) -> str:
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\f', '', text)
    text = re.sub(r'[^a-zA-Z0-9.,?!()\-\'\s]', '', text)
    return text.strip()
def clean_and_save_text(input_file: str, output_file: str):
    try:
        with open(input_file, "r", encoding="utf-8") as infile:
            raw_text = infile.read()
        cleaned_text = clean_text(raw_text)
        with open(output_file, "w", encoding="utf-8") as outfile:
            outfile.write(cleaned_text)
        print(f"Cleaned text saved to: {output_file}")
    except FileNotFoundError:
        print(f"Error: Input file not found: {input_file}")
    except Exception as e:
        print(f"Error processing {input_file}: {e}")
if __name__ == '__main__':
    input_dir = "extracted_texts"
    output_dir = "cleaned_texts"
    os.makedirs(output_dir, exist_ok=True)
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            input_file = os.path.join(input_dir, filename)
            output_file = os.path.join(output_dir, f"cleaned_{filename}")
            clean_and_save_text(input_file, output_file)
    print("Cleaning process complete!")

Cleaned text saved to: cleaned_texts/cleaned_Lupin_India_2024.txt
Cleaned text saved to: cleaned_texts/cleaned_Lupin_India_2022.txt
Cleaned text saved to: cleaned_texts/cleaned_Lupin_India_2023.txt
Cleaning process complete!


In [5]:
# data structuring

import os
from typing import List, Dict
def structure_data(text: str, year: int) -> Dict:
    cleaned_text = text
    try:
        start_index = cleaned_text.find("India")
        end_index = cleaned_text.find("Other regions")
        india_section = cleaned_text[start_index:end_index]

        if not india_section:
            print(f"Warning: No 'India' section found in report for {year}. Skipping.")
            return None
    except:
        print(f"Warning: Error while finding India section in report for {year}. Skipping.")
        return None
    return {"year": year, "text": india_section}
if __name__ == '__main__':
    input_dir = "cleaned_texts"
    years = [2022, 2023, 2024]
    structured_data = []
    file_index = 0
    for filename in os.listdir(input_dir):
        if filename.startswith("cleaned_") and filename.endswith(".txt"):
            input_file = os.path.join(input_dir, filename)
            try:
                with open(input_file, "r", encoding="utf-8") as infile:
                    cleaned_text = infile.read()
                year = years[file_index]
                file_index += 1
                structured_item = structure_data(cleaned_text, year)
                if structured_item:
                    structured_data.append(structured_item)
            except FileNotFoundError:
                print(f"Error: Input file not found: {input_file}")
            except IndexError:
                print("Error: More data files than years provided.")
                break
            except Exception as e:
                print(f"Error processing {input_file}: {e}")
    print("Structured data:")
    for item in structured_data:
        print(item)

Structured data:
{'year': 2022, 'text': 'India business has been a strong success story within the Indian Pharma Market (IPM) especially in the chronic and fast growing therapeutic areas. With sales of 60,759 Mn, this segment contributes 37 to Lupins overall sales, driving high profitability and creating substantial sustainable value for the company. Having a portfolio of high-quality and affordable drugs, coupled with a robust customer engagement strategy, Lupin is the preferred partner for medical practitioners across India. The companys achievements are exemplified by its sixth rank in the Indian Pharmaceutical Market (IPM) as of MAT March 2023. Notably, Lupins branded generics sales witnessed 6.5 increase in FY23, attaining a five-year compound annual growth rate (CAGR) of 10.4, surpassing the market CAGR of 9.9. These results have propelled Lupins market share to 3.45 in FY23. 80,000 70,000 60,000 50,000 40,000 30,000 FY17 39,675 43,506 49,324 54,531 56,651 65,241 69,668 (IQVIA re

In [6]:
# jsonl saving

import json
from typing import List, Dict
def save_to_jsonl(data: List[Dict], output_file: str):
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(json.dumps(item, ensure_ascii=False) + '\n')
        print(f"Successfully saved data to {output_file}")
    except Exception as e:
        print(f"Error saving to {output_file}: {e}")
if __name__ == '__main__':
    example_data = [{"year": 2022, "text": "Example data", "tokens": [1, 2, 3]}]
    output_file = "structured_data.jsonl"
    save_to_jsonl(example_data, output_file)
    print(f"Example data saved to {output_file}")

Successfully saved data to structured_data.jsonl
Example data saved to structured_data.jsonl


In [7]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import os
print(os.path.exists('/content/structured_data.jsonl'))

True


In [9]:
if __name__ == '__main__':
    input_file = "structured_data.jsonl"
    output_file =  "tokenized_data.jsonl"

In [10]:
# tokenization
from transformers import AutoTokenizer
from typing import List, Dict
import torch
import os
import json

def tokenize_text(data: List[Dict], model_name: str = "mistralai/Mistral-7B-v0.1") -> List[Dict]:
    tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right")
    tokenizer.pad_token = tokenizer.eos_token
    for item in data:
        item["tokens"] = tokenizer(item["text"], truncation=True, padding=True, max_length=512, return_tensors="pt").input_ids
    return data

def save_tokenized_data(data: List[Dict], output_file: str):
    """Saves the tokenized data to a JSONL file, converting tensors to lists."""
    try:
        with open(output_file, 'w', encoding='utf-8') as outfile:
            for item in data:
                item['tokens'] = item['tokens'].tolist()
                json.dump(item, outfile, ensure_ascii=False)
                outfile.write('\n')
        print(f"Tokenized data saved to: {output_file}")
    except Exception as e:
        print(f"Error saving tokenized data: {e}")

if __name__ == '__main__':

    input_file = "structured_data.jsonl"
    output_file = "tokenized_data.jsonl"

    if not os.path.exists(input_file):
        print(f"ERROR: Input file '{input_file}' does NOT exist. Make sure data_structuring.py ran successfully.")
        exit()

    try:
        with open(input_file, 'r', encoding='utf-8') as infile:
            structured_data = [json.loads(line) for line in infile]
    except Exception as e:
        print(f"Error loading structured data from '{input_file}': {e}")
        structured_data = []

    if structured_data:
        tokenized_data = tokenize_text(structured_data)
        save_tokenized_data(tokenized_data, output_file)
    else:
        print("No data to tokenize (either file was empty or loading failed).")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenized data saved to: tokenized_data.jsonl


In [11]:
import sys
sys.path.insert(0, '/content/')

In [12]:
!pip install datasets
!pip install trl



In [13]:
# !pip uninstall -y bitsandbytes

In [13]:
!pip install -U bitsandbytes --no-cache-dir




In [14]:
import bitsandbytes as bnb
print("bitsandbytes Version:", bnb.__version__)


bitsandbytes Version: 0.45.3


In [15]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda)
print("CUDA Device:", torch.cuda.get_device_name(0))


CUDA Available: True
CUDA Version: 12.4
CUDA Device: Tesla T4


In [None]:
# import os
# os._exit(00)


In [16]:
!pip install peft



In [17]:
!pip install -U accelerate



In [17]:
# fine tuning

import torch
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model
import json

def load_data_from_jsonl(file_path: str, max_seq_length = 512):

    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            item["tokens"] = torch.tensor(item["tokens"][:max_seq_length])
            data.append(item)
    return data

def fine_tune_mistral(model_name="mistralai/Mistral-7B-v0.1",
                      dataset_name="tokenized_data.jsonl",
                      output_dir="./mistral_lupin_lora_final"):

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    def load_data_from_jsonl(file_path: str, max_seq_length = 512):
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                item = json.loads(line)
                item["tokens"] = torch.tensor(item["tokens"][:max_seq_length])
                data.append(item)
        return data

    dataset = load_data_from_jsonl(dataset_name)
    dataset = Dataset.from_list(dataset)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q_proj", "v_proj"],
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, lora_config)

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
        optim="paged_adamw_32bit",
        save_strategy="steps",
        save_steps=500,
        logging_steps=100,
        learning_rate=2e-4,
        max_steps=1000,
        fp16=True,
        report_to="none",
        push_to_hub=False
    )

    data_collator = DataCollatorForCompletionOnlyLM(
        response_template="Lupin Report: ",
        tokenizer=tokenizer
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        peft_config=lora_config,
        data_collator=data_collator
    )

    trainer.train()

    trainer.save_model(output_dir)
    print(f"Fine-tuning complete. Model saved to {output_dir}")

if __name__ == "__main__":
    fine_tune_mistral()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Converting train dataset to ChatML:   0%|          | 0/1 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]



Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0




Fine-tuning complete. Model saved to ./mistral_lupin_lora_final


In [19]:
!pip install transformers



In [20]:
# testing model

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import gc
import warnings

warnings.filterwarnings("ignore")

def clear_memory():
    gc.collect()
    torch.cuda.empty_cache()
    print("Cleared CUDA Memory!")

def generate_text(prompt, model_path="./mistral_lupin_lora_final", model_name="mistralai/Mistral-7B-v0.1"):
    clear_memory()
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    print("Loading base model with optimized memory...")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )
    print("Loading LoRA fine-tuned model...")
    try:
        model = PeftModel.from_pretrained(model, model_path)
        print("LoRA weights loaded successfully!")

        model = model.merge_and_unload()
        print("LoRA weights merged successfully!")

    except Exception as e:
        print(f"Error merging LoRA model: {e}")
        return None
    model = model.to(device)
    model.eval()
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    print("Generating text...")
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=inputs["input_ids"],
            max_new_tokens=200,
            do_sample=True,
            top_p=0.9,
            temperature=0.9,
            repetition_penalty=1.15,
        )
    output = tokenizer.decode(generation_output[0], skip_special_tokens=True)
    return output
if __name__ == "__main__":
    prompt = "Write a report about Lupin Limited"
    generated_text = generate_text(prompt)
    if generated_text:
        print("\n**Generated Report:**\n")
        print(generated_text)
    else:
        print("\n**Failed to generate text. Please check for LoRA model issues.**")


Cleared CUDA Memory!
Loading base model with optimized memory...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading LoRA fine-tuned model...
LoRA weights loaded successfully!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


LoRA weights merged successfully!
Generating text...

**Generated Report:**

Write a report about Lupin Limited. (MH 5)

## Introduction:

Lupin is one of the largest pharmaceutical companies based in India. The company was founded by Mr. Desh Bandhu Gujral in 1968, with just one tablet manufacturing machine in his rented garage. Today it has evolved into a multi-billion dollar organization and the fifth largest generics manufacturer across all countries combined. It employs over 20,300 people as on 31st March 2017 in its facilities spread globally, to help patients suffering from debilitating diseases live life to their fullest potential, through cutting edge medicine and high quality service delivery. Over the years, the group’s research efforts have resulted in an expansive portfolio of products with novel therapeutic indications and new routes/formulations that can transform treatment outcomes for many chronic and rare disease conditions. The Group continues to grow in the global m

In [18]:
!pip install peft



In [None]:
# !pip uninstall -y jax jaxlib bitsandbytes transformers accelerate peft

[0mFound existing installation: bitsandbytes 0.45.3
Uninstalling bitsandbytes-0.45.3:
  Successfully uninstalled bitsandbytes-0.45.3
Found existing installation: transformers 4.49.0
Uninstalling transformers-4.49.0:
  Successfully uninstalled transformers-4.49.0
Found existing installation: accelerate 1.4.0
Uninstalling accelerate-1.4.0:
  Successfully uninstalled accelerate-1.4.0
Found existing installation: peft 0.14.0
Uninstalling peft-0.14.0:
  Successfully uninstalled peft-0.14.0


In [21]:
!pip install requests beautifulsoup4 python-dotenv



In [22]:
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

In [23]:
#  SerpAPI key
load_dotenv()
SERPAPI_API_KEY = os.getenv("454937d3dbe6df9f8379a0e138a1a77d1ebd922775e7e95e3f764f2cc01226e5")


In [24]:
pip install requests python-dotenv




In [25]:

dotenv_path = "/content/api.env"
load_dotenv(dotenv_path)

SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")

if SERPAPI_API_KEY is None:
    print("Error: SERPAPI_API_KEY not found. Make sure the .env file is uploaded and loaded correctly.")


In [26]:

load_dotenv()
SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")

def scrape_google_news(query="Lupin Limited", serpapi_key=SERPAPI_API_KEY, num_results=5):

    url = "https://serpapi.com/search.json"
    params = {
        "engine": "google_news",
        "q": query,
        "api_key": serpapi_key,
        "num": num_results
    }

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        results = response.json()

        articles = []
        if "news_results" in results:
            for entry in results["news_results"]:
                articles.append({
                    "title": entry.get("title", "N/A"),
                    "link": entry.get("link", "N/A"),
                    "source": entry.get("source", "N/A"),
                    "date": entry.get("date", "N/A"),
                    "snippet": entry.get("snippet", "N/A")
                })
        else:
            print("No news results found.")

        return articles
    except requests.exceptions.RequestException as e:
        print(f"Error scraping Google News: {e}")
        return []

news_articles = scrape_google_news()
for article in news_articles:
    print(article)


{'title': 'Lupin Ltd spurts 0.59%, up for five straight sessions', 'link': 'https://www.business-standard.com/markets/capital-market-news/lupin-ltd-spurts-0-59-up-for-five-straight-sessions-125030600442_1.html', 'source': {'name': 'Business Standard', 'icon': 'https://lh3.googleusercontent.com/AT8iZRL6eUcqE6QveDcDvxkWRmUmyT_OzkkVmZ20YRlylJqSMypckPekPjhINXiSTA47GzKicg'}, 'date': '03/06/2025, 07:30 AM, +0000 UTC', 'snippet': 'N/A'}
{'title': 'Investors Still Waiting For A Pull Back In Lupin Limited (NSE:LUPIN)', 'link': 'https://simplywall.st/stocks/in/pharmaceuticals-biotech/nse-lupin/lupin-shares/news/investors-still-waiting-for-a-pull-back-in-lupin-limited-nse', 'source': {'name': 'Simply Wall St', 'icon': 'https://encrypted-tbn0.gstatic.com/faviconV2?url=https://simplywall.st&client=NEWS_360&size=96&type=FAVICON&fallback_opts=TYPE,SIZE,URL'}, 'date': '03/04/2025, 12:54 AM, +0000 UTC', 'snippet': 'N/A'}
{'title': 'Wilson Sonsini Advises Aytu BioPharma in Exclusive Agreement with Lupin

In [27]:
!pip install transformers -y
!pip install --no-cache-dir transformers



Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -y


In [29]:
from transformers.pipelines import pipeline


In [30]:
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-125M")

Device set to use cuda:0


In [34]:
torch.cuda.empty_cache()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="balanced_low_0",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
output = generator(
    "Lupin Limited is a leading pharmaceutical company known for",
    max_length=30,
    truncation=True,
    pad_token_id=tokenizer.eos_token_id
)

print(output)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


[{'generated_text': 'Lupin Limited is a leading pharmaceutical company known for its innovative and affordable medicines. The company has a strong presence in India and'}]


In [32]:
pip install -U --no-cache-dir bitsandbytes




In [33]:
import bitsandbytes as bnb
print(f"bitsandbytes version: {bnb.__version__}")


bitsandbytes version: 0.45.3


In [None]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    use_auth_token=True
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from accelerate import infer_auto_device_map
from transformers import AutoModelForCausalLM

# ✅ Load the model first
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# ✅ Now infer the device map from the model object
device_map = infer_auto_device_map(model)

# ✅ Reload model with the correct device map
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map=device_map
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [45]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

output = generator(
    "Lupin Limited is a leading pharmaceutical company known for",
    max_length=50,
    truncation=True,
    pad_token_id=tokenizer.eos_token_id
)

print(output)


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
import torch
torch.cuda.empty_cache()


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch

torch.cuda.empty_cache()

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=bnb_config,
    device_map="sequential",
    torch_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

output = generator(
    "Lupin Limited is a leading pharmaceutical company known for",
    max_length=30,
    truncation=True,
    pad_token_id=tokenizer.eos_token_id
)

print(output)


In [None]:
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-v0.1",
    device_map="auto"
)

output = generator(
    "Lupin Limited is a leading pharmaceutical company known for",
    max_length=50,
    truncation=True,
    pad_token_id=generator.tokenizer.eos_token_id
)

print(output)


In [None]:
# report_generator.py

from transformers import pipeline
import torch
import json

def load_generator(model_path="./mistral_lupin_lora_final"):
    """Loads the fine-tuned text generation pipeline."""
    generator = pipeline(
        "text-generation",
        model=model_path,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    )
    return generator

def generate_report_section(generator, prompt, max_length=500, num_return_sequences=1):
    """Generates a report section based on a given prompt."""
    try:
        generated_text = generator(
            prompt,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            eos_token_id=generator.tokenizer.eos_token_id,
            pad_token_id=generator.tokenizer.pad_token_id
        )[0]["generated_text"]
        return generated_text
    except Exception as e:
        print(f"Error generating text: {e}")
        return ""

def format_report_section(text, section_title):
    """Formats the generated text into a structured report section."""
    formatted_text = f"## {section_title}\n\n{text}\n\n"
    return formatted_text

def create_lupin_style_prompt(topic, real_time_data):
    """Creates a prompt that incorporates Lupin's brand tone and real-time data."""
    prompt = f"""
    You are a highly skilled writer for Lupin Limited.
    Write a section for Lupin's Integrated Report 2024-25 on the topic of '{topic}'.
    Maintain a professional, informative, and optimistic tone, reflecting Lupin's commitment to innovation and patient care.
    Incorporate the following recent news and updates: {real_time_data}
    Focus on the India section of the report.
    """
    return prompt

def main(web_scraper): # added web_scraper
    """Main function to generate and format the report section."""
    generator = load_generator()
    scraped_data = web_scraper.get_scraped_data()  # Get data from web_scraper module
    real_time_data_summary = web_scraper.summarize_scraped_data(scraped_data) # Summarize scraped data
    lupin_prompt = create_lupin_style_prompt("India Market Overview", real_time_data_summary) #Add prompt
    generated_section = generate_report_section(generator, lupin_prompt)
    formatted_section = format_report_section(generated_section, "India Market Overview") #Format section
    print(formatted_section)
    return formatted_section

# Example execution:
if __name__ == "__main__":
    import web_scraper #Import web_scraper
    main(web_scraper)

In [None]:
# report_publisher.py

from googleapiclient.discovery import build
from google.oauth2 import service_account
import os
from dotenv import load_dotenv
#from hootsuite import HootSuite # Hootsuite is depricated so avoiding it
import json
import os
from google.oauth2 import credentials

load_dotenv()

def upload_to_google_docs(report_content, document_title="Lupin Integrated Report 2024-25", service_account_file="path/to/your/service_account.json"):
    """Uploads the final report to Google Docs.

    Args:
        report_content (str): The content of the report.
        document_title (str): The title of the Google Docs document.
        service_account_file (str): Path to the service account JSON file.

    Returns:
        str: The URL of the created Google Docs document.
    """
    # Authenticate with Google Docs API using a service account
    SCOPES = ['https://www.googleapis.com/auth/documents']
    creds = service_account.Credentials.from_service_account_file(
        service_account_file, scopes=SCOPES)

    try:
        service = build('docs', 'v1', credentials=creds)
        # Create a new Google Docs document
        document = service.documents().create(body={'title': document_title}).execute()
        doc_id = document.get('documentId')

        # Insert the report content into the document
        requests = [
            {
                'insertText': {
                    'location': {
                        'index': 1
                    },
                    'text': report_content
                }
            }
        ]
        result = service.documents().batchUpdate(documentId=doc_id, body={'requests': requests}).execute()

        doc_url = f"https://docs.google.com/document/d/{doc_id}"
        print(f"Report uploaded to Google Docs: {doc_url}")
        return doc_url

    except Exception as e:
        print(f"Error uploading to Google Docs: {e}")
        return None

# 5.2: Auto-Posting on LinkedIn (LinkedIn API)

# **NOTE**: Posting to LinkedIn automatically requires careful setup with the LinkedIn API. It's complex and requires a LinkedIn Developer account, application setup, and OAuth 2.0 authentication.

def main(report_content):
    """Main function to publish the report.

    Args:
        report_content (str): The full report content to be published.
    """

    # 1. Upload to Google Docs
    google_docs_url = upload_to_google_docs(report_content)

    if google_docs_url:
        print("Report publishing process started...")
    else:
        print("Google Docs upload failed.  Stopping the process.")
        return

    print("Report publishing process completed.")

if __name__ == "__main__":
    # Example usage:  Replace with the actual report content
    example_report_content = "This is a sample report content.  Replace this with the actual generated report."
    main(example_report_content)