In [None]:
!pip install datasets
import os
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Debugging aid: make CUDA kernel launches synchronous so a device-side error
# is reported at the call that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Root folder of each domain. Every root is expected to hold an `input`
# folder and an `output` folder.
DOMAIN_ROOTS = [
    "/content/drive/MyDrive/new dataset/dataset_v2/CLOUD/CLOUD",
    "/content/drive/MyDrive/new dataset/dataset_v2/DL (1)/DL",
    "/content/drive/MyDrive/new dataset/dataset_v2/cybersecurity/cybersecurity",
    "/content/drive/MyDrive/new dataset/dataset_v2/iot/iot/iot/iot",
    "/content/drive/MyDrive/new dataset/dataset_v2/ml",
]

# Both lists are derived from DOMAIN_ROOTS so they always have the same
# length and the same domain order (they are zipped together when loading).
input_directories = [f"{root}/input" for root in DOMAIN_ROOTS]
output_directories = [f"{root}/output" for root in DOMAIN_ROOTS]

# Function to load datasets from multiple domains
def load_domain_datasets(input_directories, output_directories):
    """Read paired documents and summaries from several domain folders.

    Files are paired by position after sorting the names in each folder, so
    the i-th file of an input folder is matched with the i-th file of the
    corresponding output folder.

    Args:
        input_directories: Folders holding the source documents, one per domain.
        output_directories: Folders holding the summaries, in the same domain
            order as ``input_directories``.

    Returns:
        tuple[list[str], list[str]]: ``(input_texts, summaries)`` with one
        entry per file pair, read as UTF-8.

    Raises:
        ValueError: If the two directory lists differ in length, or if a
            domain's input and output folders hold a different number of
            files (pairing by position would silently mis-align them).
    """
    if len(input_directories) != len(output_directories):
        raise ValueError(
            f"Got {len(input_directories)} input directories but "
            f"{len(output_directories)} output directories"
        )

    def _sorted_files(directory):
        # Ignore sub-folders (e.g. notebook checkpoint folders); only regular
        # files can be opened and read as documents.
        return sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

    input_texts = []
    summaries = []

    for input_dir, output_dir in zip(input_directories, output_directories):
        input_files = _sorted_files(input_dir)
        output_files = _sorted_files(output_dir)

        # zip() would quietly drop the surplus files and shift the pairing.
        if len(input_files) != len(output_files):
            raise ValueError(
                f"File count mismatch: {len(input_files)} files in {input_dir!r} "
                f"vs {len(output_files)} files in {output_dir!r}"
            )

        for input_file, output_file in zip(input_files, output_files):
            with open(os.path.join(input_dir, input_file), 'r', encoding='utf-8') as f:
                input_texts.append(f.read())

            with open(os.path.join(output_dir, output_file), 'r', encoding='utf-8') as f:
                summaries.append(f.read())

    return input_texts, summaries

# Load datasets from the specified directories
input_texts, summaries = load_domain_datasets(input_directories, output_directories)

# Load T5 Base Model and Tokenizer
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize data
def tokenize_function(examples):
    """Tokenize ``examples['input_texts']`` to fixed-length sequences.

    Every text is truncated and padded to exactly 512 tokens.
    """
    return tokenizer(examples['input_texts'], truncation=True, padding='max_length', max_length=512)

train_encodings = tokenize_function({'input_texts': input_texts})
train_labels = tokenize_function({'input_texts': summaries})

# Padding positions in the targets must not contribute to the loss. The model's
# cross-entropy ignores label value -100, so every padded position (attention
# mask 0) is replaced by -100; otherwise the 512-long, mostly-padding targets
# make the loss largely a "predict <pad>" objective.
label_ids = [
    [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
    for ids, mask in zip(train_labels['input_ids'], train_labels['attention_mask'])
]

# Create Dataset
dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': label_ids
})

# Split the dataset into 80% training and 20% validation.
# The seed is fixed so the same examples land in each split on every run and
# the per-epoch validation loss is comparable between runs.
SPLIT_SEED = 42
train_test_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# One place for the directory that receives checkpoints, the final model and
# the tokenizer (previously the same literal was repeated three times).
model_output_dir = "/content/drive/MyDrive/new dataset/model_t5_v2"

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,  # Run for 10 epochs
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"  # evaluate on eval_dataset at the end of every epoch
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Use the 20% of the dataset for evaluation
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side so the directory can be reloaded
# with from_pretrained()
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,1.2201,1.408535
2,1.6451,1.350604
3,1.6342,1.32387
4,1.1429,1.308288
5,1.2414,1.3034
6,1.2789,1.296958
7,1.131,1.2971
8,1.2041,1.293723
9,1.4929,1.296238
10,1.0645,1.29575


('/content/drive/MyDrive/new dataset/model_t5_v2/tokenizer_config.json',
 '/content/drive/MyDrive/new dataset/model_t5_v2/special_tokens_map.json',
 '/content/drive/MyDrive/new dataset/model_t5_v2/spiece.model',
 '/content/drive/MyDrive/new dataset/model_t5_v2/added_tokens.json')

In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used by the other cells resolve.
# NOTE(review): this cell sits after the T5 training cell, which already reads
# from Drive — presumably it has to be run first on a fresh runtime; confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber

# Import libraries
import pdfplumber
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import re

# Restore the T5 checkpoint from Drive and place it on the GPU when one is available
model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"  # same directory the T5 training cell saves to
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Extract text from a PDF by reading the left and right half of each page.

    Each page is cut vertically at half its width. The left half is read
    first, then the right half; the two texts are joined with a single space
    and every page is terminated with a newline.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: Concatenated text of all pages.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            half_width = page.width / 2

            # Left column: from the left edge to the vertical midline
            left_text = page.within_bbox((0, 0, half_width, page.height)).extract_text()

            # Right column: from the vertical midline to the right edge
            right_text = page.within_bbox((half_width, 0, page.width, page.height)).extract_text()

            # extract_text() may yield None; treat that as an empty column
            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Strip the reference list and header/footer lines from extracted text.

    Two steps, in this order:

    1. Everything from the last line that consists only of a "References"
       heading (any letter case, optionally numbered such as "7." or "VI.")
       to the end of the text is dropped. This runs first so an all-caps
       "REFERENCES" heading is still present when it is looked for.
    2. Lines that consist only of an all-caps/digit run, "Page <n>",
       "Header" or "Footer" are blanked. Matching whole lines keeps inline
       acronyms (e.g. "SVM") and ordinary sentences intact.

    Args:
        text: Raw text, with lines separated by newlines.

    Returns:
        str: The cleaned text. Blanked lines leave their newline behind.
    """
    # A heading line, not any mention of the word: "References to prior work
    # ..." inside a paragraph must not truncate the document.
    heading = re.compile(
        r'^[ \t]*(?:(?:\d+|[IVX]+)\.?[ \t]+)?(?i:references)[ \t]*$',
        flags=re.MULTILINE,
    )
    headings = list(heading.finditer(text))
    if headings:
        text = text[:headings[-1].start()]

    # Remove header/footer lines (anchored to full lines via MULTILINE ^ and $)
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$',
        '',
        text,
        flags=re.MULTILINE,
    )

    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return the text starting at the first occurrence of ``section_title``.

    The title is matched literally and case-insensitively. The match extends
    lazily until just before a newline that is followed by a run of letters
    and whitespace ending in a colon, or otherwise to the end of ``text``.

    NOTE(review): ``re.IGNORECASE`` applies to the whole pattern, so the
    ``[A-Z]`` classes in the look-ahead also accept lowercase letters; a
    lowercase "label:" line therefore ends a section too — confirm this is
    intended.

    Args:
        text: Text to search.
        section_title: Title to look for; regex metacharacters in it are
            treated as plain characters.

    Returns:
        str: The matched span with surrounding whitespace stripped, or ``''``
        when the title does not occur.
    """
    # Escape the title so characters such as "(", "+" or "." cannot change the
    # meaning of the pattern or make it fail to compile.
    pattern = rf'{re.escape(section_title)}[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text and generate summaries
def chunk_text(text, max_length=512):
    """Split ``text`` into consecutive lists of token ids.

    The text is encoded without special tokens, so no begin/end markers end
    up inside a chunk. Each chunk is sized to leave room for the special
    tokens the tokenizer adds when the chunk is later encoded on its own.
    NOTE(review): decoding and re-encoding a chunk may still yield a slightly
    different token count — confirm the downstream 512-token truncation is
    acceptable for such cases.

    Args:
        text: Text to split.
        max_length: Token budget per chunk, special tokens included.

    Returns:
        list[list[int]]: Token-id chunks in document order; empty for text
        that encodes to no tokens.
    """
    tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    # Reserve space for the markers added on re-encoding; never go below 1
    chunk_size = max(1, max_length - tokenizer.num_special_tokens_to_add())
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

def generate_short_summary(text_chunk):
    """Summarize one piece of text with the loaded model.

    The text is tokenized (truncated to 512 tokens), moved to ``device`` and
    passed to beam search with 4 beams, producing between 30 and 100 tokens.

    Args:
        text_chunk: Text to summarize.

    Returns:
        str: The first generated sequence, decoded without special tokens.
    """
    inputs = tokenizer(text_chunk, return_tensors="pt", max_length=512, truncation=True, padding="longest")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask produced by the tokenizer so padded positions (present
        # whenever padding="longest" actually pads) are ignored by the encoder.
        attention_mask=inputs.get("attention_mask"),
        max_length=100,  # Adjust for shorter summaries
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Function to summarize large text and generate short summary
def summarize_large_text(text):
    """Summarize text of any length by summarizing it chunk by chunk.

    The text is split with ``chunk_text``; each chunk is decoded back to a
    string and summarized with ``generate_short_summary``; the per-chunk
    summaries are joined with single spaces.

    Args:
        text: Text to summarize.

    Returns:
        str: The joined chunk summaries (``''`` when there is nothing to
        summarize).
    """
    short_summaries = []
    for chunk in chunk_text(text):
        # Drop special tokens while decoding: otherwise their literal text
        # (end-of-sequence markers etc.) is fed back into the tokenizer.
        chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
        # A chunk holding only special tokens decodes to nothing; summarizing
        # it would only produce text unrelated to the document.
        if not chunk_str.strip():
            continue
        short_summaries.append(generate_short_summary(chunk_str))

    # Combine chunks into one short summary
    combined_short_summary = ' '.join(short_summaries)

    return combined_short_summary

# Run the pipeline on one paper: extract -> clean -> select sections -> summarize
pdf_path = "/content/drive/MyDrive/I/TESTING PAPERS/ML_ 101.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Pull out the four sections of interest, in the order they are summarized
abstract_text, results_text, methodology_text, conclusion_text = [
    extract_section(cleaned_text, title)
    for title in ('Abstract', 'Results', 'Methodology', 'Conclusion')
]

# One newline-separated text holding all extracted sections
combined_text = "\n".join([abstract_text, results_text, methodology_text, conclusion_text])

# Summarize the combined sections and show the result
combined_short_summary = summarize_large_text(combined_text)
print("\nCombined Short Summary:\n", combined_short_summary)


Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-non




Combined Short Summary:
 This research investigates the use of Bayesian Network Model (BNM) to estimate the presence of estrous cycle in Japanese dairy cattle. Through the experiment with 280 Japanese anestrus Holstein dairy cows, it is found that the estimation for finding out estrous cycle represents almost 55% accuracy while considering all samples. On the contrary, almost 73% accurate estimation could be achieved while using suspended likelihood in sample datasets. The proposed model has more confidence than the estimation accuracy lies The estrous cycle of cattle is the period from one estrus (heat, phase of sexual receptivity) to the next estrus. For the cow and heifer, this period averages 21 days, with a typical range of 18 to 24 days in length [1, 2, 3, 4]. The estrous cycle is the period from one estrus (heat, phase of sexual receptivity) to the next estrus, with a typical range This study aims to find out the optimum factors to have an estrous cycle of bovine using a Bayesi

**Fine-tuning BART**

In [None]:
!pip install datasets
import os
import torch
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset

# Debugging aid: make CUDA kernel launches synchronous so a device-side error
# is reported at the call that caused it.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Root folder of each domain. Every root is expected to hold an `input`
# folder and an `output` folder.
DOMAIN_ROOTS = [
    "/content/drive/MyDrive/new dataset/dataset_v2/CLOUD/CLOUD",
    "/content/drive/MyDrive/new dataset/dataset_v2/DL (1)/DL",
    "/content/drive/MyDrive/new dataset/dataset_v2/cybersecurity/cybersecurity",
    "/content/drive/MyDrive/new dataset/dataset_v2/iot/iot/iot/iot",
    "/content/drive/MyDrive/new dataset/dataset_v2/ml",
]

# Both lists are derived from DOMAIN_ROOTS so they always have the same
# length and the same domain order (they are zipped together when loading).
input_directories = [f"{root}/input" for root in DOMAIN_ROOTS]
output_directories = [f"{root}/output" for root in DOMAIN_ROOTS]

# Function to load datasets from multiple domains
def load_domain_datasets(input_directories, output_directories):
    """Read paired documents and summaries from several domain folders.

    Files are paired by position after sorting the names in each folder, so
    the i-th file of an input folder is matched with the i-th file of the
    corresponding output folder.

    Args:
        input_directories: Folders holding the source documents, one per domain.
        output_directories: Folders holding the summaries, in the same domain
            order as ``input_directories``.

    Returns:
        tuple[list[str], list[str]]: ``(input_texts, summaries)`` with one
        entry per file pair, read as UTF-8.

    Raises:
        ValueError: If the two directory lists differ in length, or if a
            domain's input and output folders hold a different number of
            files (pairing by position would silently mis-align them).
    """
    if len(input_directories) != len(output_directories):
        raise ValueError(
            f"Got {len(input_directories)} input directories but "
            f"{len(output_directories)} output directories"
        )

    def _sorted_files(directory):
        # Ignore sub-folders (e.g. notebook checkpoint folders); only regular
        # files can be opened and read as documents.
        return sorted(
            name for name in os.listdir(directory)
            if os.path.isfile(os.path.join(directory, name))
        )

    input_texts = []
    summaries = []

    for input_dir, output_dir in zip(input_directories, output_directories):
        input_files = _sorted_files(input_dir)
        output_files = _sorted_files(output_dir)

        # zip() would quietly drop the surplus files and shift the pairing.
        if len(input_files) != len(output_files):
            raise ValueError(
                f"File count mismatch: {len(input_files)} files in {input_dir!r} "
                f"vs {len(output_files)} files in {output_dir!r}"
            )

        for input_file, output_file in zip(input_files, output_files):
            with open(os.path.join(input_dir, input_file), 'r', encoding='utf-8') as f:
                input_texts.append(f.read())

            with open(os.path.join(output_dir, output_file), 'r', encoding='utf-8') as f:
                summaries.append(f.read())

    return input_texts, summaries

# Load datasets from the specified directories
input_texts, summaries = load_domain_datasets(input_directories, output_directories)

# Load BART Base Model and Tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Tokenize data
def tokenize_function(examples):
    """Tokenize ``examples['input_texts']`` to fixed-length sequences.

    Every text is truncated and padded to exactly 512 tokens.
    """
    return tokenizer(examples['input_texts'], truncation=True, padding='max_length', max_length=512)

train_encodings = tokenize_function({'input_texts': input_texts})
train_labels = tokenize_function({'input_texts': summaries})

# Padding positions in the targets must not contribute to the loss. The model's
# cross-entropy ignores label value -100, so every padded position (attention
# mask 0) is replaced by -100; otherwise the 512-long, mostly-padding targets
# make the loss largely a "predict <pad>" objective.
label_ids = [
    [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
    for ids, mask in zip(train_labels['input_ids'], train_labels['attention_mask'])
]

# Create Dataset
dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': label_ids
})

# Split the dataset into 80% training and 20% validation.
# The seed is fixed so the same examples land in each split on every run and
# the per-epoch validation loss is comparable between runs.
SPLIT_SEED = 42
train_test_split = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# One place for the directory that receives checkpoints, the final model and
# the tokenizer (previously the same literal was repeated three times).
model_output_dir = "/content/drive/MyDrive/new dataset/model_bart_v2"

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,  # Run for 10 epochs
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"  # evaluate on eval_dataset at the end of every epoch
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # Use the 20% of the dataset for evaluation
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the model and tokenizer side by side so the directory can be reloaded
# with from_pretrained()
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)




FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/new dataset/dataset_v2/CLOUD/CLOUD/input'

**ROUGE evaluation code for the T5 model**

In [None]:
# Import libraries

!pip install rouge-score
!pip install pdfplumber
import pdfplumber
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import re
from rouge_score import rouge_scorer

# Restore the T5 checkpoint from Drive and place it on the GPU when one is available
model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"  # same directory the T5 training cell saves to
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Extract text from a PDF by reading the left and right half of each page.

    Each page is cut vertically at half its width. The left half is read
    first, then the right half; the two texts are joined with a single space
    and every page is terminated with a newline.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: Concatenated text of all pages.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            half_width = page.width / 2

            # Left column: from the left edge to the vertical midline
            left_text = page.within_bbox((0, 0, half_width, page.height)).extract_text()

            # Right column: from the vertical midline to the right edge
            right_text = page.within_bbox((half_width, 0, page.width, page.height)).extract_text()

            # extract_text() may yield None; treat that as an empty column
            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Strip the reference list and header/footer lines from extracted text.

    Two steps, in this order:

    1. Everything from the last line that consists only of a "References"
       heading (any letter case, optionally numbered such as "7." or "VI.")
       to the end of the text is dropped. This runs first so an all-caps
       "REFERENCES" heading is still present when it is looked for.
    2. Lines that consist only of an all-caps/digit run, "Page <n>",
       "Header" or "Footer" are blanked. Matching whole lines keeps inline
       acronyms (e.g. "SVM") and ordinary sentences intact.

    Args:
        text: Raw text, with lines separated by newlines.

    Returns:
        str: The cleaned text. Blanked lines leave their newline behind.
    """
    # A heading line, not any mention of the word: "References to prior work
    # ..." inside a paragraph must not truncate the document.
    heading = re.compile(
        r'^[ \t]*(?:(?:\d+|[IVX]+)\.?[ \t]+)?(?i:references)[ \t]*$',
        flags=re.MULTILINE,
    )
    headings = list(heading.finditer(text))
    if headings:
        text = text[:headings[-1].start()]

    # Remove header/footer lines (anchored to full lines via MULTILINE ^ and $)
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$',
        '',
        text,
        flags=re.MULTILINE,
    )

    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return the text starting at the first occurrence of ``section_title``.

    The title is matched literally and case-insensitively. The match extends
    lazily until just before a newline that is followed by a run of letters
    and whitespace ending in a colon, or otherwise to the end of ``text``.

    NOTE(review): ``re.IGNORECASE`` applies to the whole pattern, so the
    ``[A-Z]`` classes in the look-ahead also accept lowercase letters; a
    lowercase "label:" line therefore ends a section too — confirm this is
    intended.

    Args:
        text: Text to search.
        section_title: Title to look for; regex metacharacters in it are
            treated as plain characters.

    Returns:
        str: The matched span with surrounding whitespace stripped, or ``''``
        when the title does not occur.
    """
    # Escape the title so characters such as "(", "+" or "." cannot change the
    # meaning of the pattern or make it fail to compile.
    pattern = rf'{re.escape(section_title)}[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text and generate summaries
def chunk_text(text, max_length=512):
    """Split ``text`` into consecutive lists of token ids.

    The text is encoded without special tokens, so no begin/end markers end
    up inside a chunk. Each chunk is sized to leave room for the special
    tokens the tokenizer adds when the chunk is later encoded on its own.
    NOTE(review): decoding and re-encoding a chunk may still yield a slightly
    different token count — confirm the downstream 512-token truncation is
    acceptable for such cases.

    Args:
        text: Text to split.
        max_length: Token budget per chunk, special tokens included.

    Returns:
        list[list[int]]: Token-id chunks in document order; empty for text
        that encodes to no tokens.
    """
    tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    # Reserve space for the markers added on re-encoding; never go below 1
    chunk_size = max(1, max_length - tokenizer.num_special_tokens_to_add())
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

def generate_short_summary(text_chunk):
    """Summarize one piece of text with the loaded model.

    The text is tokenized (truncated to 512 tokens), moved to ``device`` and
    passed to beam search with 4 beams, producing between 30 and 100 tokens.

    Args:
        text_chunk: Text to summarize.

    Returns:
        str: The first generated sequence, decoded without special tokens.
    """
    inputs = tokenizer(text_chunk, return_tensors="pt", max_length=512, truncation=True, padding="longest")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask produced by the tokenizer so padded positions (present
        # whenever padding="longest" actually pads) are ignored by the encoder.
        attention_mask=inputs.get("attention_mask"),
        max_length=100,  # Adjust for shorter summaries
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Function to summarize large text and generate short summary
def summarize_large_text(text):
    """Summarize text of any length by summarizing it chunk by chunk.

    The text is split with ``chunk_text``; each chunk is decoded back to a
    string and summarized with ``generate_short_summary``; the per-chunk
    summaries are joined with single spaces.

    Args:
        text: Text to summarize.

    Returns:
        str: The joined chunk summaries (``''`` when there is nothing to
        summarize).
    """
    short_summaries = []
    for chunk in chunk_text(text):
        # Drop special tokens while decoding: otherwise their literal text
        # (end-of-sequence markers etc.) is fed back into the tokenizer.
        chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
        # A chunk holding only special tokens decodes to nothing; summarizing
        # it would only produce text unrelated to the document.
        if not chunk_str.strip():
            continue
        short_summaries.append(generate_short_summary(chunk_str))

    # Combine chunks into one short summary
    combined_short_summary = ' '.join(short_summaries)

    return combined_short_summary

# Function to calculate ROUGE score
def calculate_rouge(predicted_summary, reference_summary):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled. The reference is passed to the scorer as the first
    argument and the prediction as the second.

    Args:
        predicted_summary: Summary produced by the model.
        reference_summary: Summary to compare against.

    Returns:
        Whatever ``RougeScorer.score`` returns for the three metrics.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
        reference_summary, predicted_summary
    )

# Run the pipeline on one paper: extract -> clean -> select sections -> summarize
pdf_path = "/content/drive/MyDrive/CONIT2022Paper0627.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Pull out the four sections of interest, in the order they are summarized
abstract_text, results_text, methodology_text, conclusion_text = [
    extract_section(cleaned_text, title)
    for title in ('Abstract', 'Results', 'Methodology', 'Conclusion')
]

# One newline-separated text holding all extracted sections
combined_text = "\n".join([abstract_text, results_text, methodology_text, conclusion_text])

# Summarize the combined sections and show the result
combined_short_summary = summarize_large_text(combined_text)
print("\nCombined Short Summary:\n", combined_short_summary)

# Hand-written reference summary that the generated summary is scored against.
# It is specific to the PDF processed in this cell; replace it when pdf_path changes.
reference_summary = """
  identifying a genuine user profile on social media has gained significant importance
in the lieu of detecting social media users from cyber criminals.
With this regard, this paper is focussed at developing a
machine learning model that identifies and classifies user
profiles as genuine or not genuine categories. Three different datasets are considered
like facebook, instagram and twitter profiles for the
classification of Genuine user profiles in online social media
and networks. The three effective techniques discussed here
are SVM, Random forest and Neural Network. From the
above algorithms, Random forest achieves higher accuracy
in all three datasets compared with other algorithms.Instagram dataset highest accuracy is achieved which is
96%. Also Random forest gets an average accuracy of 95%
considering all three datasets.The Comparison result of various
classification algorithms like SVM, Neural Network and
Random forest on Instagram, Facebook and Twitter datasets
highlights that Random forest performed better with
accuracy 95%.
"""

# Compare the generated summary with the reference (ROUGE-1, ROUGE-2, ROUGE-L)
rouge_scores = calculate_rouge(combined_short_summary, reference_summary)

# Print the ROUGE scores
print("\nROUGE Scores:\n", rouge_scores)


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5b6a836993771fdd46d6ec72903dc727a5cc1f76969b478e9c330f9290f85ca3
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2





Combined Short Summary:
 The paper addresses the challenge of identifying and classifying genuine user profiles on online social networks (OSNs) as genuine or not genuine categories. The proposed model achieved an average accuracy of 94% in the classification task considering all three datasets. The results indicate that the proposed model achieved an average accuracy of 94% in the classification task considering all three datasets. Future work will focus on improving the classification of genuine user profiles on online social networks (OSNs). Social networking sites have become increasingly popular due to the rapid growth in their use. However, there are some problems with these sites, such as fake profiles and online impersonation. We need an automatic fake profile detection system to make the people using social networking sites feel safe. These are profiles of people or organizations that don’t actually exist or they impersonate other people. The reasons for creating fake profile

**ROUGE evaluation code for the BART model**

In [None]:
# Install necessary libraries
!pip install rouge-score
!pip install pdfplumber
from transformers import BartTokenizer, BartForConditionalGeneration
import pdfplumber
import torch
import re
from rouge_score import rouge_scorer

# Restore the BART checkpoint from Drive and place it on the GPU when one is available
model_path = "/content/drive/MyDrive/new dataset/model_bart_v2"  # same directory the BART training cell saves to
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)

# Function to extract text from multi-column PDF (same as before)
def extract_text_from_columns(pdf_path):
    """Extract text from a PDF by reading the left and right half of each page.

    Each page is cut vertically at half its width. The left half is read
    first, then the right half; the two texts are joined with a single space
    and every page is terminated with a newline.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: Concatenated text of all pages.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            half_width = page.width / 2
            # Left half first, then right half, so reading order is preserved
            left_text = page.within_bbox((0, 0, half_width, page.height)).extract_text()
            right_text = page.within_bbox((half_width, 0, page.width, page.height)).extract_text()
            # extract_text() may yield None; treat that as an empty column
            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    return ''.join(page_texts)

# Function to clean text (same as before)
def clean_text(text):
    """Strip the reference list and header/footer lines from extracted text.

    Two steps, in this order:

    1. Everything from the last line that consists only of a "References"
       heading (any letter case, optionally numbered such as "7." or "VI.")
       to the end of the text is dropped. This runs first so an all-caps
       "REFERENCES" heading is still present when it is looked for.
    2. Lines that consist only of an all-caps/digit run, "Page <n>",
       "Header" or "Footer" are blanked. Matching whole lines keeps inline
       acronyms (e.g. "SVM") and ordinary sentences intact.

    Args:
        text: Raw text, with lines separated by newlines.

    Returns:
        str: The cleaned text. Blanked lines leave their newline behind.
    """
    # A heading line, not any mention of the word: "References to prior work
    # ..." inside a paragraph must not truncate the document.
    heading = re.compile(
        r'^[ \t]*(?:(?:\d+|[IVX]+)\.?[ \t]+)?(?i:references)[ \t]*$',
        flags=re.MULTILINE,
    )
    headings = list(heading.finditer(text))
    if headings:
        text = text[:headings[-1].start()]

    # Remove header/footer lines (anchored to full lines via MULTILINE ^ and $)
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$',
        '',
        text,
        flags=re.MULTILINE,
    )
    return cleaned_text

# Function to extract specific sections from cleaned text (same as before)
def extract_section(text, section_title):
    """Return the text starting at the first occurrence of ``section_title``.

    The title is matched literally and case-insensitively. The match extends
    lazily until just before a newline that is followed by a run of letters
    and whitespace ending in a colon, or otherwise to the end of ``text``.

    NOTE(review): ``re.IGNORECASE`` applies to the whole pattern, so the
    ``[A-Z]`` classes in the look-ahead also accept lowercase letters; a
    lowercase "label:" line therefore ends a section too — confirm this is
    intended.

    Args:
        text: Text to search.
        section_title: Title to look for; regex metacharacters in it are
            treated as plain characters.

    Returns:
        str: The matched span with surrounding whitespace stripped, or ``''``
        when the title does not occur.
    """
    # Escape the title so characters such as "(", "+" or "." cannot change the
    # meaning of the pattern or make it fail to compile.
    pattern = rf'{re.escape(section_title)}[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text and generate summaries
def chunk_text(text, max_length=512):
    """Split ``text`` into consecutive lists of token ids.

    The text is encoded without special tokens, so no begin/end markers end
    up inside a chunk. Each chunk is sized to leave room for the special
    tokens the tokenizer adds when the chunk is later encoded on its own.
    NOTE(review): decoding and re-encoding a chunk may still yield a slightly
    different token count — confirm the downstream 512-token truncation is
    acceptable for such cases.

    Args:
        text: Text to split.
        max_length: Token budget per chunk, special tokens included.

    Returns:
        list[list[int]]: Token-id chunks in document order; empty for text
        that encodes to no tokens.
    """
    tokens = tokenizer.encode(text, truncation=False, add_special_tokens=False)
    # Reserve space for the markers added on re-encoding; never go below 1
    chunk_size = max(1, max_length - tokenizer.num_special_tokens_to_add())
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]
    return chunks

# Function to generate short summaries with BART
def generate_short_summary(text_chunk):
    """Summarize one piece of text with the loaded BART model.

    The text is tokenized (truncated to 512 tokens), moved to ``device`` and
    passed to beam search with 4 beams, producing between 30 and 100 tokens.

    Args:
        text_chunk: Text to summarize.

    Returns:
        str: The first generated sequence, decoded without special tokens.
    """
    inputs = tokenizer(text_chunk, return_tensors="pt", max_length=512, truncation=True, padding="longest")
    inputs = {key: value.to(device) for key, value in inputs.items()}
    summary_ids = model.generate(
        input_ids=inputs["input_ids"],
        # Pass the mask produced by the tokenizer so padded positions (present
        # whenever padding="longest" actually pads) are ignored by the encoder.
        attention_mask=inputs.get("attention_mask"),
        max_length=100,  # Adjust for shorter summaries
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Function to summarize large text and generate short summary (same as before)
def summarize_large_text(text):
    """Summarize text of any length by summarizing it chunk by chunk.

    The text is split with ``chunk_text``; each chunk is decoded back to a
    string and summarized with ``generate_short_summary``; the per-chunk
    summaries are joined with single spaces.

    Args:
        text: Text to summarize.

    Returns:
        str: The joined chunk summaries (``''`` when there is nothing to
        summarize).
    """
    short_summaries = []
    for chunk in chunk_text(text):
        # Drop special tokens while decoding: otherwise their literal text
        # (begin/end-of-sequence markers) is fed back into the tokenizer.
        chunk_str = tokenizer.decode(chunk, skip_special_tokens=True)
        # A chunk holding only special tokens decodes to nothing; summarizing
        # it would only produce text unrelated to the document.
        if not chunk_str.strip():
            continue
        short_summaries.append(generate_short_summary(chunk_str))
    combined_short_summary = ' '.join(short_summaries)
    return combined_short_summary

# Function to calculate ROUGE score (same as before)
def calculate_rouge(predicted_summary, reference_summary):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled. The reference is passed to the scorer as the first
    argument and the prediction as the second.

    Args:
        predicted_summary: Summary produced by the model.
        reference_summary: Summary to compare against.

    Returns:
        Whatever ``RougeScorer.score`` returns for the three metrics.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metric_names, use_stemmer=True).score(
        reference_summary, predicted_summary
    )

# Run the pipeline on one paper: extract -> clean -> select sections -> summarize
pdf_path = "/content/drive/MyDrive/new dataset/mll.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Pull out the four sections of interest, in the order they are summarized
abstract_text, results_text, methodology_text, conclusion_text = [
    extract_section(cleaned_text, title)
    for title in ('Abstract', 'Results', 'Methodology', 'Conclusion')
]

# One newline-separated text holding all extracted sections
combined_text = "\n".join([abstract_text, results_text, methodology_text, conclusion_text])

# Summarize the combined sections and show the result
combined_short_summary = summarize_large_text(combined_text)
print("\nCombined Short Summary:\n", combined_short_summary)

# Reference summary for ROUGE score calculation
# Hand-written reference summary that the generated summary is scored against.
# It is specific to the PDF processed in this cell; replace it when pdf_path changes.
reference_summary = """
   The paper titled "Japanese Dairy Cattle Productivity Analysis using Bayesian Network Model (BNM)" explores the application of a Bayesian Network Model to enhance the productivity of dairy cattle in Japan. The study involved 280 Japanese anestrus Holstein dairy cows and aimed to accurately estimate the presence of estrous cycles, achieving an overall accuracy of approximately 55% with the sample data. The model utilized key parameters such as Body Condition Score (BCS), Postpartum Interval (PPI), and parity to evaluate the estrous cycle's presence, revealing that BCS significantly influences other productivity factors.

The research findings indicate that when BCS is at 2.5, with a PPI of 91-120 days and parity of 3, the probability of detecting an estrous cycle is 80%. The authors propose that the BNM can be further refined by incorporating additional parameters and validating the model with larger datasets, which would enhance its reliability and applicability in the livestock industry. The study emphasizes the importance of objective estimation methods over subjective ones, suggesting that the BNM could lead to improved productivity in the dairy sector not only in Japan but also in other countries.

In conclusion, the research presents a novel approach to understanding and managing dairy cattle productivity through the use of Bayesian networks, highlighting the potential for future advancements in the field.
"""

# Compare the BART-generated summary with the reference (ROUGE-1, ROUGE-2, ROUGE-L)
rouge_scores = calculate_rouge(combined_short_summary, reference_summary)

# Print the ROUGE scores
print("\nROUGE Scores:\n", rouge_scores)



Combined Short Summary:
 This research investigates the estimation of the presence of estrous cycle in Japanese dairy cows using Bayesian Network Model (BayES). It finds that the estimation accuracy of 270 Japanese anestrus Holstein dairy cows is almost 55% while using suspended likelihood, despite the limitations of previous methods. The study highlights the advantages of using BayES with other parameters, such as Body Condition Score and Parity Number, to enhance the accuracy of the estimation. Future research will focus on refining the model and This paper explores the role of estrous synchronization and other non-reproductive technologies in the cow and heifer reproduction process in the dairy industry. The estrous cycle of a cow or heifer is the period from one estrus (heat, phase of sexual activity) to the next, with a typical range of 18 to 24 days in length. It is a crucial factor in determining the progesterone level and the status of the heifer. The study evaluates various f

**Building the hybrid model (Pegasus summaries refined by T5)**

In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber

# Import libraries
import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re

# Load the Pegasus checkpoint stored on Drive
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)

# Both models share one device: the GPU when available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pegasus_model.to(device)

# Load the T5 checkpoint (same directory the T5 training cell saves to)
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path).to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Extract text from a PDF by reading the left and right half of each page.

    Each page is cut vertically at half its width. The left half is read
    first, then the right half; the two texts are joined with a single space
    and every page is terminated with a newline.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        str: Concatenated text of all pages.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            half_width = page.width / 2

            # Left half first, then right half, so reading order is preserved
            left_text = page.within_bbox((0, 0, half_width, page.height)).extract_text()
            right_text = page.within_bbox((half_width, 0, page.width, page.height)).extract_text()

            # extract_text() may yield None; treat that as an empty column
            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Strip the reference list and header/footer lines from extracted text.

    Two steps, in this order:

    1. Everything from the last line that consists only of a "References"
       heading (any letter case, optionally numbered such as "7." or "VI.")
       to the end of the text is dropped. This runs first so an all-caps
       "REFERENCES" heading is still present when it is looked for.
    2. Lines that consist only of an all-caps/digit run, "Page <n>",
       "Header" or "Footer" are blanked. Matching whole lines keeps inline
       acronyms (e.g. "SVM") and ordinary sentences intact.

    Args:
        text: Raw text, with lines separated by newlines.

    Returns:
        str: The cleaned text. Blanked lines leave their newline behind.
    """
    # A heading line, not any mention of the word: "References to prior work
    # ..." inside a paragraph must not truncate the document.
    heading = re.compile(
        r'^[ \t]*(?:(?:\d+|[IVX]+)\.?[ \t]+)?(?i:references)[ \t]*$',
        flags=re.MULTILINE,
    )
    headings = list(heading.finditer(text))
    if headings:
        text = text[:headings[-1].start()]

    # Remove header/footer lines (anchored to full lines via MULTILINE ^ and $)
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$',
        '',
        text,
        flags=re.MULTILINE,
    )
    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return the text from the first occurrence of a section title up to the next heading.

    The title is matched literally and case-insensitively. The section ends at
    the next line made of upper-case letters/whitespace followed by a colon
    (e.g. "\\nMETHODS:"), or at the end of the text when no such line follows.

    Args:
        text: Cleaned document text.
        section_title: Title to look for, e.g. ``'Abstract'``.

    Returns:
        str: The stripped section including its title, or '' when the title is absent.
    """
    # Case-insensitivity is scoped to the title only: applied to the whole
    # pattern it would let the upper-case heading lookahead match any
    # lower-case "words:" line and cut the section short.
    # re.escape keeps titles containing regex metacharacters literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text
def chunk_text(text, max_length=512):
    """Split text into consecutive lists of Pegasus token ids.

    Args:
        text: Text to tokenise.
        max_length: Maximum number of token ids per chunk.

    Returns:
        list[list[int]]: Chunks in document order; empty when ``text`` yields no tokens.
    """
    # Special tokens are left out here: with them, the end-of-sequence id ends
    # up inside the last chunk (and is the only content for empty input), and
    # is then decoded back into literal text that gets summarised.
    tokens = pegasus_tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return chunks

# Function to generate summaries using Pegasus
def generate_summary_pegasus(text_chunk, max_length=100):
    """Summarise one piece of text with the loaded Pegasus model (beam search, 4 beams).

    Args:
        text_chunk: Text to summarise; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **decoding_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to refine summaries using T5
def refine_summary_t5(pegasus_summary, max_length=100):
    """Pass a Pegasus summary through the loaded T5 model (beam search, 4 beams).

    Args:
        pegasus_summary: Summary text to rewrite; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **decoding_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to summarize large text
def summarize_large_text(text):
    """Summarise arbitrarily long text chunk by chunk.

    The text is split into token chunks; each chunk is decoded back to text and
    summarised by Pegasus, every Pegasus summary is then rewritten by T5, and
    the T5 outputs are joined with single spaces.

    Args:
        text: Text to summarise.

    Returns:
        str: The space-joined T5 summaries, in chunk order.
    """
    # Stage 1: one Pegasus draft per chunk.
    drafts = []
    for token_ids in chunk_text(text):
        drafts.append(generate_summary_pegasus(pegasus_tokenizer.decode(token_ids)))

    # Stage 2: T5 rewrites each draft once all drafts exist.
    refined = []
    for draft in drafts:
        refined.append(refine_summary_t5(draft))

    return ' '.join(refined)

# Process PDF and generate summaries
# NOTE(review): hard-coded path under /content/drive — presumably requires
# Google Drive to be mounted first; confirm before running on a fresh runtime.
pdf_path = "/content/drive/MyDrive/Colab Notebooks/test5.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Extract sections
# extract_section returns '' when a title is not found, so a missing section
# simply contributes an empty line to combined_text below.
abstract_text = extract_section(cleaned_text, 'Abstract')
results_text = extract_section(cleaned_text, 'Results')
methodology_text = extract_section(cleaned_text, 'Methodology')
conclusion_text = extract_section(cleaned_text, 'Conclusion')
introduction_text = extract_section(cleaned_text, 'Introduction')

# Combine the extracted sections into a single text
# The sections are stitched introduction-first, which differs from the order
# in which they were extracted above.
combined_text = f"{introduction_text}\n{abstract_text}\n{results_text}\n{methodology_text}\n{conclusion_text}"

# Generate combined summary from both models
# summarize_large_text runs Pegasus on each chunk and then T5 on each Pegasus output.
combined_summary = summarize_large_text(combined_text)

# Print the combined summary
print("\nCombined Summary:\n", combined_summary)


Collecting accelerate
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 

Token indices sequence length is longer than the specified maximum sequence length for this model (1085 > 512). Running this sequence through the model will result in indexing errors



Combined Summary:
 This research paper presents a suite of deep learning-based regression models that yield a very high level of accuracy in stock price prediction. The research paper uses historical stock price data of a well-known company listed in the National Stock Exchange (NSE) of India during the period December 31, 2012 to January 9, 2015. The stock prices are recorded at five minutes time interval during each working day in each week. The proposed models are foural neural networks () and five long- and short The paper addresses the challenge of forecasting future stock prices and stock price movement patterns by proposing a multi-time series regression model based on the gamutal neural network () for predicting financial time series and stock price movements. It highlights the limitations of existing models, which often fail to achieve high accuracy in predicting stock prices. The proposed model is designed on the basis of the gamutal neural network () for predicting financia

Making the summary proper: removing repeated sentences and ending it properly


In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Import libraries
import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re

# Load Pegasus model and tokenizer
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)

# Load T5 model and tokenizer
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model.to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Read a PDF page by page, taking the left half of each page before the right half.

    Every page is split vertically down the middle. The text of the left half
    and the text of the right half are joined with one space, and each page
    contributes one newline-terminated entry to the result.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        str: The concatenated per-page text ('' when the PDF has no pages).
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            middle = page.width / 2
            halves = (
                (0, 0, middle, page.height),           # left column
                (middle, 0, page.width, page.height),  # right column
            )
            # A falsy extraction result (e.g. None) is treated as an empty string.
            left, right = (page.within_bbox(box).extract_text() or '' for box in halves)
            page_texts.append(left + ' ' + right + '\n')
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Drop header/footer lines and everything from the word "References" onward.

    The header/footer pattern is anchored to whole lines. Left unanchored it
    also deleted inline acronyms ("CNN ") and single capitals followed by a
    space ("A model" -> "model") from ordinary sentences, which corrupted the
    text handed to the summarisers.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        str: ``text`` with matching lines blanked and the reference tail removed.
    """
    # A line counts as header/footer when it consists only of an upper-case run
    # (letters, digits, spaces), "Page <n>", "Header" or "Footer".
    header_footer_line = r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$'
    cleaned_text = re.sub(header_footer_line, '', text, flags=re.MULTILINE)
    # re.S lets '.' cross newlines, so this removes up to the end of the document.
    cleaned_text = re.sub(r'\bReferences\b.*$', '', cleaned_text, flags=re.S)
    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return the text from the first occurrence of a section title up to the next heading.

    The title is matched literally and case-insensitively. The section ends at
    the next line made of upper-case letters/whitespace followed by a colon
    (e.g. "\\nMETHODS:"), or at the end of the text when no such line follows.

    Args:
        text: Cleaned document text.
        section_title: Title to look for, e.g. ``'Abstract'``.

    Returns:
        str: The stripped section including its title, or '' when the title is absent.
    """
    # Case-insensitivity is scoped to the title only: applied to the whole
    # pattern it would let the upper-case heading lookahead match any
    # lower-case "words:" line and cut the section short.
    # re.escape keeps titles containing regex metacharacters literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text
def chunk_text(text, max_length=512):
    """Split text into consecutive lists of Pegasus token ids.

    Args:
        text: Text to tokenise.
        max_length: Maximum number of token ids per chunk.

    Returns:
        list[list[int]]: Chunks in document order; empty when ``text`` yields no tokens.
    """
    # Special tokens are left out here: with them, the end-of-sequence id ends
    # up inside the last chunk (and is the only content for empty input), and
    # is then decoded back into literal text that gets summarised.
    tokens = pegasus_tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return chunks

# Function to generate summaries using Pegasus
def generate_summary_pegasus(text_chunk, max_length=100):
    """Summarise one piece of text with the loaded Pegasus model (beam search, 4 beams).

    Args:
        text_chunk: Text to summarise; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **decoding_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to refine summaries using T5
def refine_summary_t5(pegasus_summary, max_length=100):
    """Pass a Pegasus summary through the loaded T5 model (beam search, 4 beams).

    Args:
        pegasus_summary: Summary text to rewrite; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **decoding_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to clean and ensure proper sentence boundaries
def clean_summary(summary):
    """Tidy a summary: terminate every sentence and cap repeats at two occurrences.

    The summary is split with ``sent_tokenize``. Each sentence is stripped,
    given a trailing '.' when it does not already end in '.', '!' or '?', and
    kept only for its first and second occurrence.

    Args:
        summary: Summary text to clean.

    Returns:
        str: The kept sentences joined with single spaces.
    """
    occurrences = {}
    kept = []
    for raw_sentence in sent_tokenize(summary):
        candidate = raw_sentence.strip()
        # Close off fragments that were cut before their punctuation.
        if candidate and candidate[-1] not in '.!?':
            candidate = candidate + '.'

        # Occurrences are counted after normalisation, so "x" and "x." coincide.
        occurrences[candidate] = occurrences.get(candidate, 0) + 1
        if occurrences[candidate] > 2:
            continue  # third and later repeats are dropped
        kept.append(candidate)

    return ' '.join(kept)

# Function to summarize large text
def summarize_large_text(text):
    """Summarise arbitrarily long text chunk by chunk and clean the result.

    The text is split into token chunks; each chunk is decoded back to text and
    summarised by Pegasus, every Pegasus summary is then rewritten by T5, and
    the space-joined T5 outputs are passed through ``clean_summary``.

    Args:
        text: Text to summarise.

    Returns:
        str: The cleaned, combined summary.
    """
    # Stage 1: one Pegasus draft per chunk.
    drafts = []
    for token_ids in chunk_text(text):
        drafts.append(generate_summary_pegasus(pegasus_tokenizer.decode(token_ids)))

    # Stage 2: T5 rewrites each draft once all drafts exist.
    refined = []
    for draft in drafts:
        refined.append(refine_summary_t5(draft))

    # Stage 3: merge, then fix sentence endings and drop excess repetitions.
    return clean_summary(' '.join(refined))

# Process PDF and generate summaries
# NOTE(review): hard-coded path under /content/drive — presumably requires
# Google Drive to be mounted first; confirm before running on a fresh runtime.
pdf_path = "/content/drive/MyDrive/Colab Notebooks/test5.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Extract sections
# extract_section returns '' when a title is not found, so a missing section
# simply contributes an empty line to combined_text below.
abstract_text = extract_section(cleaned_text, 'Abstract')
results_text = extract_section(cleaned_text, 'Results')
methodology_text = extract_section(cleaned_text, 'Methodology')
conclusion_text = extract_section(cleaned_text, 'Conclusion')
introduction_text = extract_section(cleaned_text, 'Introduction')

# Combine the extracted sections into a single text
# The sections are stitched introduction-first, which differs from the order
# in which they were extracted above.
combined_text = f"{introduction_text}\n{abstract_text}\n{results_text}\n{methodology_text}\n{conclusion_text}"

# Generate combined summary from both models
# summarize_large_text already applies clean_summary to the merged output.
combined_summary = summarize_large_text(combined_text)

# Print the cleaned combined summary
print("\nCleaned Combined Summary:\n", combined_summary)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Token indices sequence length is longer than the specified maximum sequence length for this model (1085 > 512). Running this sequence through the model will result in indexing errors



Cleaned Combined Summary:
 This research paper presents a suite of deep learning-based regression models that yield a very high level of accuracy in stock price prediction. The research paper uses historical stock price data of a well-known company listed in the National Stock Exchange (NSE) of India during the period December 31, 2012 to January 9, 2015. The stock prices are recorded at five minutes time interval during each working day in each week. The proposed models are foural neural networks () and five long- and short The paper addresses the challenge of forecasting future stock prices and stock price movement patterns by proposing a multi-time series regression model based on the gamutal neural network () for predicting financial time series and stock price movements. It highlights the limitations of existing models, which often fail to achieve high accuracy in predicting stock prices. The proposed model is designed on the basis of the gamutal neural network () for predicting 

In [None]:
!pip install flask-ngrok
!pip install transformers torch pdfplumber nltk




In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber rouge_score
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Import libraries
import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re
from google.colab import files  # For uploading files in Google Colab
from rouge_score import rouge_scorer

# Load Pegasus model and tokenizer
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)

# Load T5 model and tokenizer
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model.to(device)

# Define the reference summary directly in the code
reference_summary = """
Your reference summary text goes here.
This summary will be used to calculate the ROUGE score against the generated summary.
Ensure this text accurately represents the type of summaries you expect the model to generate.
"""

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Read a PDF page by page, taking the left half of each page before the right half.

    Every page is split vertically down the middle. The text of the left half
    and the text of the right half are joined with one space, and each page
    contributes one newline-terminated entry to the result.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        str: The concatenated per-page text ('' when the PDF has no pages).
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            middle = page.width / 2
            halves = (
                (0, 0, middle, page.height),           # left column
                (middle, 0, page.width, page.height),  # right column
            )
            # A falsy extraction result (e.g. None) is treated as an empty string.
            left, right = (page.within_bbox(box).extract_text() or '' for box in halves)
            page_texts.append(left + ' ' + right + '\n')
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Drop header/footer lines and everything from the word "References" onward.

    The header/footer pattern is anchored to whole lines. Left unanchored it
    also deleted inline acronyms ("CNN ") and single capitals followed by a
    space ("A model" -> "model") from ordinary sentences, which corrupted the
    text handed to the summarisers.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        str: ``text`` with matching lines blanked and the reference tail removed.
    """
    # A line counts as header/footer when it consists only of an upper-case run
    # (letters, digits, spaces), "Page <n>", "Header" or "Footer".
    header_footer_line = r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$'
    cleaned_text = re.sub(header_footer_line, '', text, flags=re.MULTILINE)
    # re.S lets '.' cross newlines, so this removes up to the end of the document.
    cleaned_text = re.sub(r'\bReferences\b.*$', '', cleaned_text, flags=re.S)
    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return the text from the first occurrence of a section title up to the next heading.

    The title is matched literally and case-insensitively. The section ends at
    the next line made of upper-case letters/whitespace followed by a colon
    (e.g. "\\nMETHODS:"), or at the end of the text when no such line follows.

    Args:
        text: Cleaned document text.
        section_title: Title to look for, e.g. ``'Abstract'``.

    Returns:
        str: The stripped section including its title, or '' when the title is absent.
    """
    # Case-insensitivity is scoped to the title only: applied to the whole
    # pattern it would let the upper-case heading lookahead match any
    # lower-case "words:" line and cut the section short.
    # re.escape keeps titles containing regex metacharacters literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text
def chunk_text(text, max_length=512):
    """Split text into consecutive lists of Pegasus token ids.

    Args:
        text: Text to tokenise.
        max_length: Maximum number of token ids per chunk.

    Returns:
        list[list[int]]: Chunks in document order; empty when ``text`` yields no tokens.
    """
    # Special tokens are left out here: with them, the end-of-sequence id ends
    # up inside the last chunk (and is the only content for empty input), and
    # is then decoded back into literal text that gets summarised.
    tokens = pegasus_tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return chunks

# Function to generate summaries using Pegasus
def generate_summary_pegasus(text_chunk, max_length=100):
    """Summarise one piece of text with the loaded Pegasus model (beam search, 4 beams).

    Args:
        text_chunk: Text to summarise; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **decoding_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to refine summaries using T5
def refine_summary_t5(pegasus_summary, max_length=100):
    """Pass a Pegasus summary through the loaded T5 model (beam search, 4 beams).

    Args:
        pegasus_summary: Summary text to rewrite; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **decoding_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to clean and ensure proper sentence boundaries
def clean_summary(summary):
    """Tidy a summary: terminate every sentence and cap repeats at two occurrences.

    The summary is split with ``sent_tokenize``. Each sentence is stripped,
    given a trailing '.' when it does not already end in '.', '!' or '?', and
    kept only for its first and second occurrence.

    Args:
        summary: Summary text to clean.

    Returns:
        str: The kept sentences joined with single spaces.
    """
    occurrences = {}
    kept = []
    for raw_sentence in sent_tokenize(summary):
        candidate = raw_sentence.strip()
        # Close off fragments that were cut before their punctuation.
        if candidate and candidate[-1] not in '.!?':
            candidate = candidate + '.'

        # Occurrences are counted after normalisation, so "x" and "x." coincide.
        occurrences[candidate] = occurrences.get(candidate, 0) + 1
        if occurrences[candidate] > 2:
            continue  # third and later repeats are dropped
        kept.append(candidate)

    return ' '.join(kept)

# Function to summarize large text
def summarize_large_text(text):
    """Summarise arbitrarily long text chunk by chunk and clean the result.

    The text is split into token chunks; each chunk is decoded back to text and
    summarised by Pegasus, every Pegasus summary is then rewritten by T5, and
    the space-joined T5 outputs are passed through ``clean_summary``.

    Args:
        text: Text to summarise.

    Returns:
        str: The cleaned, combined summary.
    """
    # Stage 1: one Pegasus draft per chunk.
    drafts = []
    for token_ids in chunk_text(text):
        drafts.append(generate_summary_pegasus(pegasus_tokenizer.decode(token_ids)))

    # Stage 2: T5 rewrites each draft once all drafts exist.
    refined = []
    for draft in drafts:
        refined.append(refine_summary_t5(draft))

    # Stage 3: merge, then fix sentence endings and drop excess repetitions.
    return clean_summary(' '.join(refined))

# Function to calculate ROUGE score
def calculate_rouge(reference, generated):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference summary text.
        generated: Generated summary text.

    Returns:
        Whatever ``RougeScorer.score(reference, generated)`` returns (stemming enabled).
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, generated)

# Main function to process PDF and summarize
def main():
    """Upload PDFs through the Colab dialog, summarise each one and print ROUGE scores.

    For every uploaded file the text is extracted and cleaned, five named
    sections are pulled out and stitched together introduction-first, the
    result is summarised, and the summary is scored against the module-level
    ``reference_summary``.
    """
    print("Welcome to the PDF Summarizer!")
    uploaded = files.upload()  # User uploads the PDF file

    for pdf_name in uploaded.keys():
        print(f"Processing file: {pdf_name}")
        cleaned_text = clean_text(extract_text_from_columns(pdf_name))

        print("Extracting relevant sections...")
        # Sections are looked up in this order, then assembled in reading order.
        abstract, results, methodology, conclusion, introduction = (
            extract_section(cleaned_text, title)
            for title in ('Abstract', 'Results', 'Methodology', 'Conclusion', 'Introduction')
        )
        combined_text = f"{introduction}\n{abstract}\n{results}\n{methodology}\n{conclusion}"

        print("Generating summary...")
        combined_summary = summarize_large_text(combined_text)
        print("\nFinal Summary:\n", combined_summary)

        # Calculate ROUGE scores
        rouge_scores = calculate_rouge(reference_summary, combined_summary)
        print("\nROUGE Scores:\n", rouge_scores)

if __name__ == "__main__":
    main()


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=47a63b2e383a1fdb42f28d0ecc84bf1db9f3881851799795b02d0a88b0fad5aa
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyboardInterrupt: 

Maximizing the ROUGE score

In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber rouge_score
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Import libraries
import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re
from google.colab import files  # For uploading files in Google Colab
from rouge_score import rouge_scorer

# Load Pegasus model and tokenizer
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)

# Load T5 model and tokenizer
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model.to(device)

# Define the reference summary directly in the code
reference_summary = """
This paper focuses on the growing importance of identifying genuine user profiles on online social networks (OSNs) like Facebook, Instagram, and Twitter due to the rise in cyber frauds through fake accounts. OSNs, while enhancing virtual communication, expose users to threats like fake profiles, phishing, and trolls, which may lead to misuse of personal information or damage reputations.

To address this issue, the paper proposes a machine learning-based classification model that distinguishes between genuine and non-genuine profiles. The model uses datasets from Facebook, Instagram, and Twitter, which undergo preprocessing and feature extraction before being classified using algorithms such as Support Vector Machine (SVM), Neural Network, and Random Forest.

The results show that the Random Forest algorithm outperformed others, achieving the highest accuracy of 95% across all datasets. The model's performance is evaluated using metrics like recall, precision, and accuracy. The study concludes that an effective classification model can help safeguard users from cyber frauds, with Random Forest being the most reliable classifie.
"""

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Read a PDF page by page, taking the left half of each page before the right half.

    Every page is split vertically down the middle. The text of the left half
    and the text of the right half are joined with one space, and each page
    contributes one newline-terminated entry to the result.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        str: The concatenated per-page text ('' when the PDF has no pages).
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            middle = page.width / 2
            halves = (
                (0, 0, middle, page.height),           # left column
                (middle, 0, page.width, page.height),  # right column
            )
            # A falsy extraction result (e.g. None) is treated as an empty string.
            left, right = (page.within_bbox(box).extract_text() or '' for box in halves)
            page_texts.append(left + ' ' + right + '\n')
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Drop header/footer lines and everything from the word "References" onward.

    The header/footer pattern is anchored to whole lines. Left unanchored it
    also deleted inline acronyms ("CNN ") and single capitals followed by a
    space ("A model" -> "model") from ordinary sentences, which corrupted the
    text handed to the summarisers.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        str: ``text`` with matching lines blanked and the reference tail removed.
    """
    # A line counts as header/footer when it consists only of an upper-case run
    # (letters, digits, spaces), "Page <n>", "Header" or "Footer".
    header_footer_line = r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$'
    cleaned_text = re.sub(header_footer_line, '', text, flags=re.MULTILINE)
    # re.S lets '.' cross newlines, so this removes up to the end of the document.
    cleaned_text = re.sub(r'\bReferences\b.*$', '', cleaned_text, flags=re.S)
    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return the text from the first occurrence of a section title up to the next heading.

    The title is matched literally and case-insensitively. The section ends at
    the next line made of upper-case letters/whitespace followed by a colon
    (e.g. "\\nMETHODS:"), or at the end of the text when no such line follows.

    Args:
        text: Cleaned document text.
        section_title: Title to look for, e.g. ``'Abstract'``.

    Returns:
        str: The stripped section including its title, or '' when the title is absent.
    """
    # Case-insensitivity is scoped to the title only: applied to the whole
    # pattern it would let the upper-case heading lookahead match any
    # lower-case "words:" line and cut the section short.
    # re.escape keeps titles containing regex metacharacters literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text
def chunk_text(text, max_length=512):
    """Split text into consecutive lists of Pegasus token ids.

    Args:
        text: Text to tokenise.
        max_length: Maximum number of token ids per chunk.

    Returns:
        list[list[int]]: Chunks in document order; empty when ``text`` yields no tokens.
    """
    # Special tokens are left out here: with them, the end-of-sequence id ends
    # up inside the last chunk (and is the only content for empty input), and
    # is then decoded back into literal text that gets summarised.
    tokens = pegasus_tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return chunks

# Function to generate summaries using Pegasus
def generate_summary_pegasus(text_chunk, max_length=100):
    """Summarise one piece of text with the loaded Pegasus model (beam search, 4 beams).

    Args:
        text_chunk: Text to summarise; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **decoding_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to refine summaries using T5
def refine_summary_t5(pegasus_summary, max_length=100):
    """Pass a Pegasus summary through the loaded T5 model (beam search, 4 beams).

    Args:
        pegasus_summary: Summary text to rewrite; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **decoding_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to clean and ensure proper sentence boundaries
def clean_summary(summary):
    """Tidy a summary: terminate every sentence and cap repeats at two occurrences.

    The summary is split with ``sent_tokenize``. Each sentence is stripped,
    given a trailing '.' when it does not already end in '.', '!' or '?', and
    kept only for its first and second occurrence.

    Args:
        summary: Summary text to clean.

    Returns:
        str: The kept sentences joined with single spaces.
    """
    occurrences = {}
    kept = []
    for raw_sentence in sent_tokenize(summary):
        candidate = raw_sentence.strip()
        # Close off fragments that were cut before their punctuation.
        if candidate and candidate[-1] not in '.!?':
            candidate = candidate + '.'

        # Occurrences are counted after normalisation, so "x" and "x." coincide.
        occurrences[candidate] = occurrences.get(candidate, 0) + 1
        if occurrences[candidate] > 2:
            continue  # third and later repeats are dropped
        kept.append(candidate)

    return ' '.join(kept)

# Function to summarize large text
def summarize_large_text(text):
    """Summarise arbitrarily long text chunk by chunk and clean the result.

    The text is split into token chunks; each chunk is decoded back to text and
    summarised by Pegasus, every Pegasus summary is then rewritten by T5, and
    the space-joined T5 outputs are passed through ``clean_summary``.

    Args:
        text: Text to summarise.

    Returns:
        str: The cleaned, combined summary.
    """
    # Stage 1: one Pegasus draft per chunk.
    drafts = []
    for token_ids in chunk_text(text):
        drafts.append(generate_summary_pegasus(pegasus_tokenizer.decode(token_ids)))

    # Stage 2: T5 rewrites each draft once all drafts exist.
    refined = []
    for draft in drafts:
        refined.append(refine_summary_t5(draft))

    # Stage 3: merge, then fix sentence endings and drop excess repetitions.
    return clean_summary(' '.join(refined))

# Function to calculate ROUGE score
def calculate_rouge(reference, generated):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference summary text.
        generated: Generated summary text.

    Returns:
        Whatever ``RougeScorer.score(reference, generated)`` returns (stemming enabled).
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, generated)

# Main function to process PDF and summarize
def main():
    """Upload PDFs through the Colab dialog, summarise each one and print ROUGE scores.

    For every uploaded file the text is extracted and cleaned, five named
    sections are pulled out and stitched together introduction-first, the
    result is summarised, and the summary is scored against the module-level
    ``reference_summary``.
    """
    print("Welcome to the PDF Summarizer!")
    uploaded = files.upload()  # User uploads the PDF file

    for pdf_name in uploaded.keys():
        print(f"Processing file: {pdf_name}")
        cleaned_text = clean_text(extract_text_from_columns(pdf_name))

        print("Extracting relevant sections...")
        # Sections are looked up in this order, then assembled in reading order.
        abstract, results, methodology, conclusion, introduction = (
            extract_section(cleaned_text, title)
            for title in ('Abstract', 'Results', 'Methodology', 'Conclusion', 'Introduction')
        )
        combined_text = f"{introduction}\n{abstract}\n{results}\n{methodology}\n{conclusion}"

        print("Generating summary...")
        combined_summary = summarize_large_text(combined_text)
        print("\nFinal Summary:\n", combined_summary)

        # Calculate ROUGE scores
        rouge_scores = calculate_rouge(reference_summary, combined_summary)
        print("\nROUGE Scores:\n", rouge_scores)

if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Welcome to the PDF Summarizer!


Saving CONIT2022Paper0627.pdf to CONIT2022Paper0627.pdf
Processing file: CONIT2022Paper0627.pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (8821 > 512). Running this sequence through the model will result in indexing errors


Extracting relevant sections...
Generating summary...

Final Summary:
 This paper presents a machine learning model designed to identify and classify user profiles as genuine or not genuine on online social networks. It aims to address the challenge of detecting social media users from cyber criminals by developing a machine learning model that identifies and classifies user profiles as genuine or not genuine. The proposed model achieved an average accuracy of 94% in the classification task considering all three datasets. The study highlights the importance of identifying genuine user profiles on social media The paper addresses the issue of fake profiles on social networking sites like Facebook, Twitter, LinkedIn, Orkut, MySpace. It highlights the growing problem of fake profiles and the need for an automated system to detect them. The paper proposes a novel algorithm based on machine learning (ML) to detect fake accounts using a combination of text and image recognition techniques. T

final code

In [None]:

!pip install transformers[torch] accelerate -U pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re

pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)


t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model.to(device)


def extract_text_from_columns(pdf_path):
    """Read a PDF page by page, taking the left half of each page before the right half.

    Every page is split vertically down the middle. The text of the left half
    and the text of the right half are joined with one space, and each page
    contributes one newline-terminated entry to the result.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.

    Returns:
        str: The concatenated per-page text ('' when the PDF has no pages).
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            middle = page.width / 2
            halves = (
                (0, 0, middle, page.height),           # left column
                (middle, 0, page.width, page.height),  # right column
            )
            # A falsy extraction result (e.g. None) is treated as an empty string.
            left, right = (page.within_bbox(box).extract_text() or '' for box in halves)
            page_texts.append(left + ' ' + right + '\n')
    return ''.join(page_texts)


def clean_text(text):
    """Drop header/footer lines and everything from the word "References" onward.

    The header/footer pattern is anchored to whole lines. Left unanchored it
    also deleted inline acronyms ("CNN ") and single capitals followed by a
    space ("A model" -> "model") from ordinary sentences, which corrupted the
    text handed to the summarisers.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        str: ``text`` with matching lines blanked and the reference tail removed.
    """
    # A line counts as header/footer when it consists only of an upper-case run
    # (letters, digits, spaces), "Page <n>", "Header" or "Footer".
    header_footer_line = r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$'
    cleaned_text = re.sub(header_footer_line, '', text, flags=re.MULTILINE)
    # re.S lets '.' cross newlines, so this removes up to the end of the document.
    cleaned_text = re.sub(r'\bReferences\b.*$', '', cleaned_text, flags=re.S)
    return cleaned_text


def extract_section(text, section_title):
    """Return the text from the first occurrence of a section title up to the next heading.

    The title is matched literally and case-insensitively. The section ends at
    the next line made of upper-case letters/whitespace followed by a colon
    (e.g. "\\nMETHODS:"), or at the end of the text when no such line follows.

    Args:
        text: Cleaned document text.
        section_title: Title to look for, e.g. ``'Abstract'``.

    Returns:
        str: The stripped section including its title, or '' when the title is absent.
    """
    # Case-insensitivity is scoped to the title only: applied to the whole
    # pattern it would let the upper-case heading lookahead match any
    # lower-case "words:" line and cut the section short.
    # re.escape keeps titles containing regex metacharacters literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''


def chunk_text(text, max_length=512):
    """Split text into consecutive lists of Pegasus token ids.

    Args:
        text: Text to tokenise.
        max_length: Maximum number of token ids per chunk.

    Returns:
        list[list[int]]: Chunks in document order; empty when ``text`` yields no tokens.
    """
    # Special tokens are left out here: with them, the end-of-sequence id ends
    # up inside the last chunk (and is the only content for empty input), and
    # is then decoded back into literal text that gets summarised.
    tokens = pegasus_tokenizer.encode(text, truncation=False, add_special_tokens=False)
    chunks = [tokens[i:i + max_length] for i in range(0, len(tokens), max_length)]
    return chunks


def generate_summary_pegasus(text_chunk, max_length=100):
    """Summarise one piece of text with the loaded Pegasus model (beam search, 4 beams).

    Args:
        text_chunk: Text to summarise; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **decoding_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)


def refine_summary_t5(pegasus_summary, max_length=100):
    """Pass a Pegasus summary through the loaded T5 model (beam search, 4 beams).

    Args:
        pegasus_summary: Summary text to rewrite; tokenised with truncation at 512 tokens.
        max_length: Upper bound passed to ``generate`` for the output length.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor of the encoding onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    decoding_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **decoding_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)


def clean_summary(summary):
    """Normalize sentence endings and limit how often a sentence may repeat.

    Args:
        summary: Text to split into sentences with ``sent_tokenize``.

    Returns:
        The sentences joined by single spaces. Each sentence ends in '.', '!'
        or '?', and any given sentence appears at most twice.
    """
    kept = []
    occurrences = {}

    for raw_sentence in sent_tokenize(summary):
        normalized = raw_sentence.strip()
        # Guarantee terminal punctuation on non-empty sentences.
        if normalized and normalized[-1] not in '.!?':
            normalized = normalized + '.'

        occurrences[normalized] = occurrences.get(normalized, 0) + 1

        # Only the first two occurrences of a sentence are emitted.
        if occurrences[normalized] > 2:
            continue
        kept.append(normalized)

    return ' '.join(kept)


def summarize_large_text(text):
    """Summarize arbitrarily long text chunk by chunk (Pegasus, then T5).

    Args:
        text: Text to summarize.

    Returns:
        The cleaned concatenation of the refined per-chunk summaries, or ''
        when ``text`` is empty or whitespace only.
    """
    # Nothing to summarize: avoid asking the models to generate at least
    # min_length tokens from blank input.
    if not text or not text.strip():
        return ''

    chunks = chunk_text(text)
    # chunk_text returns token ids; decode them back to plain text without
    # special tokens so no literal end-of-sequence marker is fed back into
    # the tokenizer inside generate_summary_pegasus.
    pegasus_summaries = [
        generate_summary_pegasus(pegasus_tokenizer.decode(chunk, skip_special_tokens=True))
        for chunk in chunks
    ]
    refined_summaries = [refine_summary_t5(summary) for summary in pegasus_summaries]

    combined_summary = ' '.join(refined_summaries)
    final_summary = clean_summary(combined_summary)

    return final_summary


def main():
    """Prompt for PDF uploads and print a summary for each uploaded file.

    For every uploaded file the text is extracted, cleaned, reduced to the
    Introduction/Abstract/Results/Methodology/Conclusion sections and
    summarized. If none of those sections can be located, the whole cleaned
    text is summarized instead.
    """
    print("Welcome to the PDF Summarizer!")
    # NOTE(review): `files` is presumably google.colab.files, imported earlier
    # in the notebook - confirm.
    uploaded = files.upload()

    section_titles = ('Introduction', 'Abstract', 'Results', 'Methodology', 'Conclusion')

    for pdf_name in uploaded.keys():
        print(f"Processing file: {pdf_name}")
        document_text = extract_text_from_columns(pdf_name)
        cleaned_text = clean_text(document_text)

        print("Extracting relevant sections...")
        # extract_section returns '' for a missing title; keep only real hits.
        found_sections = [
            section
            for section in (extract_section(cleaned_text, title) for title in section_titles)
            if section
        ]

        if found_sections:
            combined_text = "\n".join(found_sections)
        else:
            # Without this fallback the models would be asked to summarize a
            # string consisting only of newlines.
            print("No recognizable section headings found; summarizing the full text instead.")
            combined_text = cleaned_text

        print("Generating summary...")
        combined_summary = summarize_large_text(combined_text)

        print("\nFinal Summary:\n", combined_summary)

if __name__ == "__main__":
    main()




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Welcome to the PDF Summarizer!


Saving sodapdf-converted (1).pdf to sodapdf-converted (1).pdf
Processing file: sodapdf-converted (1).pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (1940 > 512). Running this sequence through the model will result in indexing errors


Extracting relevant sections...
Generating summary...

Final Summary:
 The paper addresses the challenge of detecting fake profiles on online social networks (OSNs) by developing a machine learning model that focuses on identifying genuine users rather than fake categories. The study highlights the importance of online social networks in changing people's views on social life during times of pandemics and global lockdowns. It proposes a machine learning-based approach to identify fake profiles, which are often used by hackers to gain access to sensitive information. The The paper addresses the challenge of detecting genuine user profiles on social networking sites, which have become increasingly popular due to their ease of use. It proposes a model based on Bayesian inference, which aims to identify and classify user profiles as genuine or ws. The proposed model achieved an average of all three datasets. The study highlights the importance of detecting genuine user profiles on social n

new

In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Import libraries
import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re

# Load Pegasus model and tokenizer
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)

# Load T5 model and tokenizer
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model.to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Extract text from a two-column PDF, left half of each page first.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A single string holding, for every page, the left-half text, a space
        and the right-half text, terminated by a newline. A half without
        extractable text contributes ''.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Split the page's own bounding box rather than assuming its
            # origin is (0, 0); a crop box outside the page bounds is rejected
            # by pdfplumber.
            x0, top, x1, bottom = page.bbox
            middle = (x0 + x1) / 2

            left_text = page.within_bbox((x0, top, middle, bottom)).extract_text()
            right_text = page.within_bbox((middle, top, x1, bottom)).extract_text()

            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    # Join once instead of growing a string page by page.
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Strip header/footer lines and the trailing reference list from PDF text.

    Only whole lines are treated as headers or footers, so in-sentence
    acronyms such as "(BNM)" and leading capitals such as "A " are kept.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        The text with header/footer lines removed and everything from the
        "References" heading onwards dropped.
    """
    # Cut the bibliography first, while an all-caps "REFERENCES" heading is
    # still present. The heading must sit on its own line (optionally numbered
    # or followed by a colon) so that a sentence merely containing the word
    # does not truncate the document.
    cleaned_text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]+)?(?:References|REFERENCES)[ \t]*:?[ \t]*$.*',
        '',
        text,
        flags=re.MULTILINE | re.DOTALL,
    )
    # Remove lines made up solely of an all-caps banner, a page marker or a
    # literal Header/Footer tag, together with their line break.
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$\n?',
        '',
        cleaned_text,
        flags=re.MULTILINE,
    )
    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return one section of the document, from its title to the next heading.

    The title is matched literally and case-insensitively. The section ends
    at the next line that looks like an upper-case heading followed by a
    colon (e.g. "RESULTS:"), or at the end of the text.

    Args:
        text: Document text to search.
        section_title: Heading to look for, e.g. 'Abstract'.

    Returns:
        The matched section with surrounding whitespace stripped, or '' when
        the title does not occur.
    """
    # Case-insensitivity is scoped to the title with (?i:...). Applying
    # re.IGNORECASE to the whole pattern would let the [A-Z] heading lookahead
    # match any lower-case "word:" line and cut the section short.
    # re.escape keeps regex metacharacters in the title literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text by sentence
def chunk_text(text, max_length=512):
    """Group whole sentences into chunks that stay under the token limit.

    Sentences are appended to the current chunk while the Pegasus encoding of
    the chunk plus the next sentence stays below ``max_length``.

    Args:
        text: Text to split; sentences come from ``sent_tokenize``.
        max_length: Token budget per chunk.

    Returns:
        A list of non-empty chunk strings. A single sentence that already
        reaches the limit becomes a chunk of its own.
    """
    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        candidate = current_chunk + sentence
        if len(pegasus_tokenizer.encode(candidate)) < max_length:
            current_chunk = candidate + " "
            continue
        # Flush only when there is something to flush: if the first sentence
        # alone reaches the limit, current_chunk is still empty and must not
        # be emitted as a blank chunk.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

# Function to generate summaries using Pegasus
def generate_summary_pegasus(text_chunk, max_length=100):
    """Summarize one chunk of text with the Pegasus model.

    Args:
        text_chunk: Text to summarize; tokenized with truncation at 512 tokens.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor the tokenizer produced onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    beam_search_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **beam_search_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to refine summaries using T5
def refine_summary_t5(pegasus_summary, max_length=100):
    """Pass a Pegasus summary through the T5 model for a second rewrite.

    Args:
        pegasus_summary: Summary text to refine; tokenized with truncation at 512 tokens.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor the tokenizer produced onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    beam_search_options = dict(
        max_length=max_length,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **beam_search_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to clean and ensure proper sentence boundaries
def clean_summary(summary):
    """Normalize sentence endings and drop repeated sentences.

    Args:
        summary: Text to split into sentences with ``sent_tokenize``.

    Returns:
        The sentences joined by single spaces, each ending in '.', '!' or
        '?', with only the first occurrence of every sentence kept.
    """
    already_emitted = set()
    kept = []

    for raw_sentence in sent_tokenize(summary):
        normalized = raw_sentence.strip()
        # Guarantee terminal punctuation on non-empty sentences.
        if normalized and normalized[-1] not in '.!?':
            normalized = normalized + '.'
        # Compare after normalization so "X" and "X." count as the same sentence.
        if normalized in already_emitted:
            continue
        already_emitted.add(normalized)
        kept.append(normalized)

    return ' '.join(kept)

# Function to summarize large text
def summarize_large_text(text):
    """Summarize long text chunk by chunk: Pegasus first, then T5 refinement.

    Args:
        text: Text to summarize.

    Returns:
        The refined per-chunk summaries joined by spaces and passed through
        ``clean_summary``.
    """
    # First pass over all chunks with Pegasus, then a second pass with T5.
    drafts = list(map(generate_summary_pegasus, chunk_text(text)))
    polished = list(map(refine_summary_t5, drafts))
    return clean_summary(' '.join(polished))

# Process PDF and generate summaries: extract the two-column text from the
# PDF on Google Drive, then strip headers, footers and the reference list.
pdf_path = "/content/drive/MyDrive/new dataset/mll.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Extract sections (extract_section returns '' when a title is not found)
abstract_text = extract_section(cleaned_text, 'Abstract')
results_text = extract_section(cleaned_text, 'Results')
methodology_text = extract_section(cleaned_text, 'Methodology')
conclusion_text = extract_section(cleaned_text, 'Conclusion')
introduction_text = extract_section(cleaned_text, 'Introduction')

# Summarize each section individually, in this order
section_summaries = []
sections = [
    ('Abstract', abstract_text),
    ('Introduction', introduction_text),
    ('Methodology', methodology_text),
    ('Results', results_text),
    ('Conclusion', conclusion_text)
]

# section_title is not used inside the loop; sections that were not found are
# passed to summarize_large_text as empty strings.
for section_title, text in sections:
    section_summary = summarize_large_text(text)
    section_summaries.append(section_summary)

# Combine the section-wise summaries into a single comprehensive summary
combined_summary = " ".join(section_summaries)

# Clean the combined summary to remove sentences repeated across sections
final_combined_summary = clean_summary(combined_summary)

# Print the final combined summary
print("\nCombined Comprehensive Summary:\n")
print(final_combined_summary)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


FINAL CODE


In [None]:
# Install necessary libraries (runs before the imports below)
!pip install transformers[torch] accelerate -U pdfplumber rouge-score
import nltk
from nltk.tokenize import sent_tokenize

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')  # Without this resource the tokenizer lookup fails with LookupError


# Import libraries
import pdfplumber
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
import torch
import re
from rouge_score import rouge_scorer  # For ROUGE score

# Load Pegasus model and tokenizer from Google Drive
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)

# Load T5 model and tokenizer from Google Drive
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)
t5_model.to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Extract text from a two-column PDF, left half of each page first.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A single string holding, for every page, the left-half text, a space
        and the right-half text, terminated by a newline. A half without
        extractable text contributes ''.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Split the page's own bounding box rather than assuming its
            # origin is (0, 0); a crop box outside the page bounds is rejected
            # by pdfplumber.
            x0, top, x1, bottom = page.bbox
            middle = (x0 + x1) / 2

            left_text = page.within_bbox((x0, top, middle, bottom)).extract_text()
            right_text = page.within_bbox((middle, top, x1, bottom)).extract_text()

            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    # Join once instead of growing a string page by page.
    return ''.join(page_texts)

# Function to clean text by removing headers, footers, and references
def clean_text(text):
    """Strip header/footer lines and the trailing reference list from PDF text.

    Only whole lines are treated as headers or footers, so in-sentence
    acronyms such as "(BNM)" and leading capitals such as "A " are kept.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        The text with header/footer lines removed and everything from the
        "References" heading onwards dropped.
    """
    # Cut the bibliography first, while an all-caps "REFERENCES" heading is
    # still present. The heading must sit on its own line (optionally numbered
    # or followed by a colon) so that a sentence merely containing the word
    # does not truncate the document.
    cleaned_text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]+)?(?:References|REFERENCES)[ \t]*:?[ \t]*$.*',
        '',
        text,
        flags=re.MULTILINE | re.DOTALL,
    )
    # Remove lines made up solely of an all-caps banner, a page marker or a
    # literal Header/Footer tag, together with their line break.
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$\n?',
        '',
        cleaned_text,
        flags=re.MULTILINE,
    )
    return cleaned_text

# Function to extract sections from the cleaned text
def extract_section(text, section_title):
    """Return one section of the document, from its title to the next heading.

    The title is matched literally and case-insensitively. The section ends
    at the next line that looks like an upper-case heading followed by a
    colon (e.g. "RESULTS:"), or at the end of the text.

    Args:
        text: Document text to search.
        section_title: Heading to look for, e.g. 'Abstract'.

    Returns:
        The matched section with surrounding whitespace stripped, or '' when
        the title does not occur.
    """
    # Case-insensitivity is scoped to the title with (?i:...). Applying
    # re.IGNORECASE to the whole pattern would let the [A-Z] heading lookahead
    # match any lower-case "word:" line and cut the section short.
    # re.escape keeps regex metacharacters in the title literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    if match:
        return match.group().strip()
    return ''

# Function to chunk text by sentence
def chunk_text(text, max_length=512):
    """Group whole sentences into chunks that stay under the token limit.

    Sentences are appended to the current chunk while the Pegasus encoding of
    the chunk plus the next sentence stays below ``max_length``.

    Args:
        text: Text to split; sentences come from ``sent_tokenize``.
        max_length: Token budget per chunk.

    Returns:
        A list of non-empty chunk strings. A single sentence that already
        reaches the limit becomes a chunk of its own.
    """
    chunks = []
    current_chunk = ""

    for sentence in sent_tokenize(text):
        candidate = current_chunk + sentence
        if len(pegasus_tokenizer.encode(candidate)) < max_length:
            current_chunk = candidate + " "
            continue
        # Flush only when there is something to flush: if the first sentence
        # alone reaches the limit, current_chunk is still empty and must not
        # be emitted as a blank chunk.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "

    if current_chunk.strip():
        chunks.append(current_chunk.strip())

    return chunks

# Function to generate summaries using Pegasus
def generate_summary_pegasus(text_chunk, max_length=150):
    """Summarize one chunk of text with the Pegasus model.

    Args:
        text_chunk: Text to summarize; tokenized with truncation at 512 tokens.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor the tokenizer produced onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    beam_search_options = dict(
        max_length=max_length,
        min_length=80,  # lower bound on the generated length
        length_penalty=2.0,
        num_beams=5,  # one beam more than the earlier version of this cell
        early_stopping=True,
    )
    generated = pegasus_model.generate(on_device["input_ids"], **beam_search_options)
    return pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to refine summaries using T5
def refine_summary_t5(pegasus_summary, max_length=150):
    """Pass a Pegasus summary through the T5 model for a second rewrite.

    Args:
        pegasus_summary: Summary text to refine; tokenized with truncation at 512 tokens.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = t5_tokenizer(
        pegasus_summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor the tokenizer produced onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    beam_search_options = dict(
        max_length=max_length,
        min_length=80,  # lower bound so the refined summary keeps enough detail
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
    )
    generated = t5_model.generate(on_device["input_ids"], **beam_search_options)
    return t5_tokenizer.decode(generated[0], skip_special_tokens=True)

# Function to clean and ensure proper sentence boundaries
def clean_summary(summary):
    """Normalize sentence endings and drop repeated sentences.

    Args:
        summary: Text to split into sentences with ``sent_tokenize``.

    Returns:
        The sentences joined by single spaces, each ending in '.', '!' or
        '?', with only the first occurrence of every sentence kept.
    """
    already_emitted = set()
    kept = []

    for raw_sentence in sent_tokenize(summary):
        normalized = raw_sentence.strip()
        # Guarantee terminal punctuation on non-empty sentences.
        if normalized and normalized[-1] not in '.!?':
            normalized = normalized + '.'
        # Compare after normalization so "X" and "X." count as the same sentence.
        if normalized in already_emitted:
            continue
        already_emitted.add(normalized)
        kept.append(normalized)

    return ' '.join(kept)

# Function to summarize large text
def summarize_large_text(text):
    """Summarize long text chunk by chunk: Pegasus first, then T5 refinement.

    Args:
        text: Text to summarize.

    Returns:
        The refined per-chunk summaries joined by spaces and passed through
        ``clean_summary``.
    """
    # First pass over all chunks with Pegasus, then a second pass with T5.
    drafts = list(map(generate_summary_pegasus, chunk_text(text)))
    polished = list(map(refine_summary_t5, drafts))
    return clean_summary(' '.join(polished))

# Function to calculate ROUGE score
def calculate_rouge(reference_summary, generated_summary):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference_summary: Reference text, passed as the first argument to ``score``.
        generated_summary: Summary to evaluate, passed as the second argument.

    Returns:
        The value returned by ``RougeScorer.score`` (stemming enabled).
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(
        reference_summary, generated_summary
    )

# Process PDF and generate summaries: extract the two-column text from the
# PDF on Google Drive, then strip headers, footers and the reference list.
pdf_path = "/content/drive/MyDrive/new dataset/mll.pdf"
document_text = extract_text_from_columns(pdf_path)
cleaned_text = clean_text(document_text)

# Extract sections (extract_section returns '' when a title is not found)
abstract_text = extract_section(cleaned_text, 'Abstract')
results_text = extract_section(cleaned_text, 'Results')
methodology_text = extract_section(cleaned_text, 'Methodology')
conclusion_text = extract_section(cleaned_text, 'Conclusion')
introduction_text = extract_section(cleaned_text, 'Introduction')

# Summarize each section individually, in this order
section_summaries = []
sections = [
    ('Abstract', abstract_text),
    ('Introduction', introduction_text),
    ('Methodology', methodology_text),
    ('Results', results_text),
    ('Conclusion', conclusion_text)
]

for section_title, text in sections:
    # Skip sections that were not found in the document
    if text:
        section_summary = summarize_large_text(text)
        section_summaries.append(section_summary)

# Combine the section-wise summaries into a single comprehensive summary
combined_summary = " ".join(section_summaries)

# Clean the combined summary to remove sentences repeated across sections
final_combined_summary = clean_summary(combined_summary)

# Print the final combined summary
print("\nCombined Comprehensive Summary:\n")
print(final_combined_summary)

# Reference summary for evaluation. The value below is a placeholder; paste
# the real reference text here before relying on the scores.
reference_summary = '''.'''

# Calculate and print ROUGE score. A reference without any letters or digits
# (such as the "." placeholder) cannot produce meaningful overlap scores, so
# scoring is skipped until a real reference is supplied.
if any(character.isalnum() for character in reference_summary):
    rouge_scores = calculate_rouge(reference_summary, final_combined_summary)
    print("\nROUGE Scores:\n")
    print(rouge_scores)
else:
    rouge_scores = None
    print("\nROUGE Scores: skipped (no reference summary provided).\n")




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors



Combined Comprehensive Summary:

This research explores the use of Bayesian Network Model () for the estimation of the presence of estrous cycle in Japanese dairy cows. Through an experiment with 280 Japanese anestrus Holstein dairy cows, it was found that the model achieved high accuracy in finding out the presence of estrous cycle while using suspended likelihood in sample datasets. The study highlights the advantages of Bayesian Network Model () over subjective methods for finding out the presence of estrous cycle in dairy cows. The research also reveals the optimum factors to find out the presence of estrous cycle among the 270 individual dairy cows. The findings suggest that the Bayesian Network Model () with the inclusion of Body Condition Para. This paper presents a novel method for identifying the presence or absence of estrous cycle in cattle using k Model. The method is based on the assumption that the cow has a normal estrous cycle, which is the period from one estrus (heat

In [None]:
# Install necessary libraries
!pip install transformers[torch] accelerate -U pdfplumber rouge-score

# Import libraries
import os
import re
import nltk
import torch
import pdfplumber
import ipywidgets as widgets
from IPython.display import display
from nltk.tokenize import sent_tokenize
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer
from google.colab import drive

# Download necessary NLTK data
nltk.download('punkt')

# Mount Google Drive
drive.mount('/content/drive')

# Load Pegasus model and tokenizer
pegasus_model_path = "/content/drive/MyDrive/new dataset/model_v2"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_path)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_path)

# Load T5 model and tokenizer
t5_model_path = "/content/drive/MyDrive/new dataset/model_t5_v2"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_path)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_path)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pegasus_model.to(device)
t5_model.to(device)

# Function to extract text from multi-column PDF
def extract_text_from_columns(pdf_path):
    """Extract text from a two-column PDF, left half of each page first.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A single string holding, for every page, the left-half text, a space
        and the right-half text, terminated by a newline. A half without
        extractable text contributes ''.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Split the page's own bounding box rather than assuming its
            # origin is (0, 0); a crop box outside the page bounds is rejected
            # by pdfplumber.
            x0, top, x1, bottom = page.bbox
            middle = (x0 + x1) / 2
            left_text = page.within_bbox((x0, top, middle, bottom)).extract_text()
            right_text = page.within_bbox((middle, top, x1, bottom)).extract_text()
            page_texts.append((left_text or '') + ' ' + (right_text or '') + '\n')
    # Join once instead of growing a string page by page.
    return ''.join(page_texts)

# Function to clean text
def clean_text(text):
    """Strip header/footer lines and the trailing reference list from PDF text.

    Only whole lines are treated as headers or footers, so in-sentence
    acronyms such as "(BNM)" and leading capitals such as "A " are kept.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        The text with header/footer lines removed and everything from the
        "References" heading onwards dropped.
    """
    # Cut the bibliography first, while an all-caps "REFERENCES" heading is
    # still present. The heading must sit on its own line (optionally numbered
    # or followed by a colon) so that a sentence merely containing the word
    # does not truncate the document.
    cleaned_text = re.sub(
        r'^[ \t]*(?:\d+\.?[ \t]+)?(?:References|REFERENCES)[ \t]*:?[ \t]*$.*',
        '',
        text,
        flags=re.MULTILINE | re.DOTALL,
    )
    # Remove lines made up solely of an all-caps banner, a page marker or a
    # literal Header/Footer tag, together with their line break.
    cleaned_text = re.sub(
        r'^[ \t]*(?:[A-Z][A-Z0-9 ]+|Page \d+|Header|Footer)[ \t]*$\n?',
        '',
        cleaned_text,
        flags=re.MULTILINE,
    )
    return cleaned_text

# Function to extract sections
def extract_section(text, section_title):
    """Return one section of the document, from its title to the next heading.

    The title is matched literally and case-insensitively. The section ends
    at the next line that looks like an upper-case heading followed by a
    colon (e.g. "RESULTS:"), or at the end of the text.

    Args:
        text: Document text to search.
        section_title: Heading to look for, e.g. 'Abstract'.

    Returns:
        The matched section with surrounding whitespace stripped, or '' when
        the title does not occur.
    """
    # Case-insensitivity is scoped to the title with (?i:...). Applying
    # re.IGNORECASE to the whole pattern would let the [A-Z] heading lookahead
    # match any lower-case "word:" line and cut the section short.
    # re.escape keeps regex metacharacters in the title literal.
    pattern = rf'(?i:{re.escape(section_title)})[\s\S]*?(?=\n[A-Z][A-Z\s]+:|$)'
    match = re.search(pattern, text)
    return match.group().strip() if match else ''

# Chunk text into token-length suitable for Pegasus
def chunk_text(text, max_length=512):
    """Group whole sentences into chunks that stay under the token limit.

    Sentences are appended to the current chunk while the Pegasus encoding of
    the chunk plus the next sentence stays below ``max_length``.

    Args:
        text: Text to split; sentences come from ``sent_tokenize``.
        max_length: Token budget per chunk.

    Returns:
        A list of non-empty chunk strings. A single sentence that already
        reaches the limit becomes a chunk of its own.
    """
    chunks, current_chunk = [], ""
    for sentence in sent_tokenize(text):
        candidate = current_chunk + sentence
        if len(pegasus_tokenizer.encode(candidate)) < max_length:
            current_chunk = candidate + " "
            continue
        # Flush only when there is something to flush: if the first sentence
        # alone reaches the limit, current_chunk is still empty and must not
        # be emitted as a blank chunk.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

# Generate summary using Pegasus
def generate_summary_pegasus(text_chunk, max_length=150):
    """Summarize one chunk of text with the Pegasus model.

    Args:
        text_chunk: Text to summarize; tokenized with truncation at 512 tokens.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = pegasus_tokenizer(
        text_chunk,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor the tokenizer produced onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    generated = pegasus_model.generate(
        on_device["input_ids"],
        max_length=max_length,
        min_length=80,
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return pegasus_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Refine summary using T5
def refine_summary_t5(summary, max_length=150):
    """Pass a summary through the T5 model for a second rewrite.

    Args:
        summary: Summary text to refine; tokenized with truncation at 512 tokens.
        max_length: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    encoded = t5_tokenizer(
        summary,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="longest",
    )
    # Move every tensor the tokenizer produced onto the model's device.
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
    generated = t5_model.generate(
        on_device["input_ids"],
        max_length=max_length,
        min_length=80,
        length_penalty=2.0,
        num_beams=5,
        early_stopping=True,
    )
    first_sequence = generated[0]
    return t5_tokenizer.decode(first_sequence, skip_special_tokens=True)

# Clean final summary
def clean_summary(summary):
    """Normalize sentence endings and drop repeated sentences.

    Args:
        summary: Text to split into sentences with ``sent_tokenize``.

    Returns:
        The non-empty sentences joined by single spaces, each ending in '.',
        '!' or '?', with only the first occurrence of every sentence kept.
    """
    sentences = sent_tokenize(summary)
    seen, cleaned = set(), []
    for s in sentences:
        s = s.strip()
        if not s:
            continue
        # Add the missing full stop BEFORE the duplicate check; otherwise "X"
        # and "X." are treated as different sentences and both end up in the
        # output as "X. X.".
        if s[-1] not in ".!?":
            s += "."
        if s in seen:
            continue
        seen.add(s)
        cleaned.append(s)
    return ' '.join(cleaned)

# Summarize large text by chunking
def summarize_large_text(text):
    """Summarize long text chunk by chunk: Pegasus first, then T5 refinement.

    Args:
        text: Text to summarize.

    Returns:
        The refined per-chunk summaries joined by spaces and passed through
        ``clean_summary``.
    """
    # First pass over all chunks with Pegasus, then a second pass with T5.
    drafts = list(map(generate_summary_pegasus, chunk_text(text)))
    polished = list(map(refine_summary_t5, drafts))
    joined = ' '.join(polished)
    return clean_summary(joined)

# Calculate ROUGE
def calculate_rouge(reference, generated):
    """Score a generated summary against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference text, passed as the first argument to ``score``.
        generated: Summary to evaluate, passed as the second argument.

    Returns:
        The value returned by ``RougeScorer.score`` (stemming enabled).
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(metrics, use_stemmer=True)
    result = rouge.score(reference, generated)
    return result

# Path where papers are stored in Drive
papers_folder = "/content/drive/MyDrive/Papers"

# Create the folder on first use so the listing below does not fail
if not os.path.exists(papers_folder):
    os.makedirs(papers_folder)
    print(f"Created folder: {papers_folder}. Please upload PDFs to this folder via Google Drive.")

# Match the extension case-insensitively so files such as "paper.PDF" are
# offered too, and sort the names so the dropdown order is stable
# (os.listdir returns entries in arbitrary order).
pdf_files = sorted(f for f in os.listdir(papers_folder) if f.lower().endswith(".pdf"))

# UI Elements: file picker, trigger button and an area that captures printed output
file_dropdown = widgets.Dropdown(options=pdf_files, description='Paper:')
summarize_button = widgets.Button(description="Summarize", button_style="success")
output_area = widgets.Output()
# Summarization handler
def on_summarize_clicked(b):
    """Button callback: summarize the PDF chosen in the dropdown.

    Clears the output area, then extracts, cleans and summarizes the
    Abstract/Introduction/Methodology/Results/Conclusion sections of the
    selected file and prints the combined, de-duplicated summary into the
    output area.

    Args:
        b: The clicked button; not used by the handler.
    """
    output_area.clear_output()
    with output_area:
        chosen_name = file_dropdown.value
        if not chosen_name:
            print("Please select a PDF file.")
            return

        file_path = os.path.join(papers_folder, chosen_name)
        print(f"Processing file: {file_path}")
        cleaned = clean_text(extract_text_from_columns(file_path))

        # Extract every section up front; a missing section comes back as ''.
        section_titles = ('Abstract', 'Introduction', 'Methodology', 'Results', 'Conclusion')
        sections = [(title, extract_section(cleaned, title)) for title in section_titles]

        summaries = []
        for title, content in sections:
            if not content:
                continue
            print(f"Summarizing {title}...")
            summaries.append(summarize_large_text(content))

        # A final clean-up pass removes sentences repeated across sections.
        final_summary = clean_summary(" ".join(summaries))
        print("\n===== Final Summary =====\n")
        print(final_summary)

# Run on_summarize_clicked whenever the Summarize button is pressed
summarize_button.on_click(on_summarize_clicked)

# Display UI: label, file dropdown, button and output area stacked vertically
display(widgets.VBox([
    widgets.Label("Select a research paper from your Google Drive folder (MyDrive/Papers):"),
    file_dropdown,
    summarize_button,
    output_area
]))




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


VBox(children=(Label(value='Select a research paper from your Google Drive folder (MyDrive/Papers):'), Dropdow…