Step 1: Install the Required Libraries

In [None]:
! pip install transformers torch
! pip install pdfminer.six

Step 2: Extract Text from PDF

In [None]:
from pdfminer.high_level import extract_text_to_fp
from io import StringIO

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Extract the text of an inclusive, 1-based page range from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        start_page: First page to extract, counted from 1.
        end_page: Last page to extract, counted from 1 (inclusive).

    Returns:
        The text pdfminer produced for the selected pages, as one string.

    Raises:
        ValueError: If ``start_page`` is below 1 or ``end_page`` is before
            ``start_page``.
    """
    if start_page < 1 or end_page < start_page:
        # Reject the range up front: an empty ``range`` is falsy, and pdfminer
        # only filters pages when ``page_numbers`` is truthy, so a reversed
        # range would otherwise silently return the whole document.
        raise ValueError(
            f"Invalid page range {start_page}-{end_page}: pages are 1-based "
            "and end_page must not be before start_page"
        )

    output = StringIO()
    with open(pdf_path, 'rb') as f:
        # pdfminer numbers pages from 0, hence the shift of start_page.
        extract_text_to_fp(f, output, page_numbers=range(start_page - 1, end_page))
    return output.getvalue()

In [None]:
# Source PDF and the page span that holds Chapter 1.
# Edit all three values to match your own document.
pdf_path = "looma_sample_book.pdf"
start_page, end_page = 3, 4

In [None]:
# Pull the configured page span out of the PDF and show the raw text
chapter_text = extract_text_from_pdf(
    pdf_path=pdf_path,
    start_page=start_page,
    end_page=end_page,
)
print(chapter_text)

Step 3: Summarize Text Using Transformers

In [None]:
from transformers import pipeline

from functools import lru_cache


@lru_cache(maxsize=1)
def _get_summarizer():
    """Build the BART summarization pipeline once and reuse it across calls."""
    return pipeline("summarization", model="facebook/bart-large-cnn")


def summarize_text(text):
    """Summarize ``text`` with the facebook/bart-large-cnn pipeline.

    Args:
        text: The text to summarize.

    Returns:
        The generated summary string, or an empty string when ``text`` is
        empty or whitespace-only (nothing is sent to the model in that case).
    """
    if not text or not text.strip():
        return ""

    # truncation=True clips input that exceeds the model's maximum length
    # instead of letting the forward pass fail on an over-long chapter.
    summary_text = _get_summarizer()(
        text, max_length=130, min_length=30, do_sample=False, truncation=True
    )
    return summary_text[0]['summary_text']

Step 4: Putting It All Together

In [None]:
# End-to-end run: choose the PDF and page span, extract the text, summarize it.
pdf_path = "looma_sample_book.pdf"  # point this at your own PDF
start_page, end_page = 1, 2  # first and last page of Chapter 1

chapter_text = extract_text_from_pdf(pdf_path, start_page, end_page)
chapter_summary = summarize_text(chapter_text)
print(chapter_summary)

In [None]:
# ! pip install transformers datasets

In [None]:
from datasets import load_dataset

# Fetch configuration "3.0.0" of the CNN/DailyMail summarization corpus
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
from transformers import T5Tokenizer

# Tokenizer for the t5-small checkpoint; preprocess_function below reads it as a global
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-small')

# Tokenize the data
def preprocess_function(examples):
    """Tokenize a batch of articles and highlights for T5 summarization.

    Args:
        examples: Batch mapping with an 'article' list and a 'highlights'
            list of strings.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, with the
        token ids of the highlights stored under 'labels'.
    """
    inputs = ["summarize: " + doc for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the summaries as targets.
    labels = tokenizer(text_target=examples['highlights'], max_length=150, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the dataset in batches with preprocess_function
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


In [None]:
from transformers import T5Tokenizer

# Tokenizer for the t5-small checkpoint; preprocess_function below reads it as a global
tokenizer = T5Tokenizer.from_pretrained(pretrained_model_name_or_path='t5-small')

# Tokenize the data
def preprocess_function(examples):
    """Tokenize a batch of articles and highlights for T5 summarization.

    Args:
        examples: Batch mapping with an 'article' list and a 'highlights'
            list of strings.

    Returns:
        The tokenizer output for the "summarize: "-prefixed articles, with the
        token ids of the highlights stored under 'labels'.
    """
    inputs = ["summarize: " + doc for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for tokenizing the summaries as targets.
    labels = tokenizer(text_target=examples['highlights'], max_length=150, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize the dataset in batches with preprocess_function
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)


In [None]:
from pdfminer.high_level import extract_text_to_fp
from io import StringIO

def extract_text_from_pdf(pdf_path, start_page, end_page):
    """Extract the text of an inclusive, 1-based page range from a PDF.

    Args:
        pdf_path: Path to the PDF file.
        start_page: First page to extract, counted from 1.
        end_page: Last page to extract, counted from 1 (inclusive).

    Returns:
        The text pdfminer produced for the selected pages, as one string.

    Raises:
        ValueError: If ``start_page`` is below 1 or ``end_page`` is before
            ``start_page``.
    """
    if start_page < 1 or end_page < start_page:
        # Reject the range up front: an empty ``range`` is falsy, and pdfminer
        # only filters pages when ``page_numbers`` is truthy, so a reversed
        # range would otherwise silently return the whole document.
        raise ValueError(
            f"Invalid page range {start_page}-{end_page}: pages are 1-based "
            "and end_page must not be before start_page"
        )

    output = StringIO()
    with open(pdf_path, 'rb') as f:
        # pdfminer numbers pages from 0, hence the shift of start_page.
        extract_text_to_fp(f, output, page_numbers=range(start_page - 1, end_page))
    return output.getvalue()

# Source PDF and the page span that holds Chapter 1; edit to match your document
pdf_path = "looma_sample_book.pdf"
start_page, end_page = 1, 2

# Extract that span and print it under a heading
chapter_text = extract_text_from_pdf(pdf_path, start_page, end_page)
print("Extracted Chapter Text:", chapter_text, sep="\n")


In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Directory expected to hold a fine-tuned T5 checkpoint (weights plus tokenizer files).
# NOTE(review): no visible cell writes to this directory -- confirm it exists before running.
model_path = "./results/t5_fine_tuned"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

def generate_summary(chapter_text):
    """Summarize ``chapter_text`` with the globally loaded model and tokenizer.

    The text is prefixed with "summarize: ", tokenized with truncation at 512
    tokens, decoded with 4-beam search, and the first returned sequence is
    converted back to a string.
    """
    prompt = "summarize: " + chapter_text
    encoded = tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True)

    generation_options = dict(
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = model.generate(encoded['input_ids'], **generation_options)

    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Summarize the chapter text extracted earlier and print it under a heading
summary = generate_summary(chapter_text)
print("Summary:", summary, sep="\n")


In [None]:
! pip install pymupdf pytesseract pillow


In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
import os

import shutil

# Prefer the `tesseract` binary found on PATH; fall back to the previous
# hard-coded location only when none is found there.
pytesseract.pytesseract.tesseract_cmd = shutil.which('tesseract') or r'/usr/local/bin/tesseract'  # Update this path if necessary

def extract_text_and_images_from_pdf(pdf_path, image_output_folder):
    """Extract per-page text and dump every embedded image of a PDF to disk.

    Args:
        pdf_path: Path of the PDF to read.
        image_output_folder: Directory that receives the image files; it is
            created when missing. Files are named
            ``page_<page>_img_<index>.<ext>`` with 0-based page and index.

    Returns:
        A list with one ``{'page': <0-based page number>, 'text': <page text>}``
        dict per page, in document order.
    """
    os.makedirs(image_output_folder, exist_ok=True)
    extracted_texts = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document):
            extracted_texts.append({'page': page_num, 'text': page.get_text()})

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_path = os.path.join(
                    image_output_folder,
                    f"page_{page_num}_img_{img_index}.{base_image['ext']}",
                )
                # Write the embedded bytes as-is. Re-encoding through Pillow
                # with format=ext.upper() fails whenever the extension string
                # is not a Pillow format name, and needlessly recompresses.
                with open(image_path, 'wb') as image_file:
                    image_file.write(base_image["image"])

    return extracted_texts

# Example usage
pdf_path = 'path_to_your_pdf_file.pdf'
image_output_folder = 'path_to_output_images_folder'

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
os.makedirs(image_output_folder, exist_ok=True)

extracted_texts = extract_text_and_images_from_pdf(pdf_path, image_output_folder)

In [None]:
def ocr_image(image_path):
    """Run Tesseract OCR on one image file.

    Args:
        image_path: Path of the image to read.

    Returns:
        The recognized text, or None when the image could not be opened or
        processed. The error is printed rather than raised so that one bad
        file does not abort a batch.
    """
    try:
        # The context manager releases the image's file handle after OCR.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img)
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

# This cell builds a DataFrame, but the notebook only imports pandas in a
# later cell; import it here so the cell runs on a fresh kernel.
import pandas as pd

IMAGE_SUFFIXES = (".png", ".jpg", ".jpeg")

# Apply OCR to all images in the folder
ocr_texts = []

# sorted() keeps the row order reproducible; os.listdir order is arbitrary.
for filename in sorted(os.listdir(image_output_folder)):
    if not filename.lower().endswith(IMAGE_SUFFIXES):
        continue
    image_path = os.path.join(image_output_folder, filename)
    text = ocr_image(image_path)
    # Skip failed reads (None) and images where OCR found only whitespace.
    if text and text.strip():
        ocr_texts.append({'image_filename': filename, 'text': text})

# Explicit columns keep the frame (and the CSV header) well-formed even when
# no image produced any text.
ocr_df = pd.DataFrame(ocr_texts, columns=['image_filename', 'text'])
print(ocr_df.head())

# Save the OCR texts to a CSV file
ocr_df.to_csv('ocr_extracted_texts.csv', index=False)


In [None]:
# pandas is only imported in a later cell of the notebook; import it here so
# this cell runs on a fresh kernel.
import pandas as pd

# Load OCR extracted texts; a CSV written from an empty OCR run has no header
try:
    ocr_df = pd.read_csv('ocr_extracted_texts.csv')
except pd.errors.EmptyDataError:
    ocr_df = pd.DataFrame(columns=['image_filename', 'text'])

# Combine extracted texts and OCR texts, one row per PDF page
combined_texts = []

for page_entry in extracted_texts:
    page_num = page_entry['page']

    # Image files are named page_<n>_img_<k>.<ext>; match the literal prefix
    # instead of going through str.contains' regex engine.
    on_this_page = ocr_df['image_filename'].astype(str).str.startswith(f"page_{page_num}_img_")
    # dropna/astype guard against cells that read_csv parsed as NaN, which
    # would make " ".join raise a TypeError.
    page_ocr = ocr_df.loc[on_this_page, 'text'].dropna().astype(str)
    ocr_text_content = " ".join(page_ocr)

    combined_text = page_entry['text'] + " " + ocr_text_content
    combined_texts.append({'page': page_num, 'text': combined_text})

combined_df = pd.DataFrame(combined_texts, columns=['page', 'text'])
print(combined_df.head())

# Save the combined texts to a CSV file
combined_df.to_csv('combined_texts.csv', index=False)


In [None]:
# Build train/validation datasets from the combined per-page text and tokenize them.
#
# NOTE(review): this cell uses four names that no earlier visible cell defines
# or imports: `clean_text`, `train_test_split`, `Dataset` and `DatasetDict`
# (`Dataset` is only imported in a later cell). On a fresh kernel it raises
# NameError until those are provided -- TODO confirm where they should come from.

# Clean the combined text
combined_df['cleaned_text'] = combined_df['text'].apply(clean_text)

# Split the data into training and validation sets (80/20, fixed seed)
train_df, val_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset, keeping only the page number and cleaned text
train_dataset = Dataset.from_pandas(train_df[['page', 'cleaned_text']])
val_dataset = Dataset.from_pandas(val_df[['page', 'cleaned_text']])

dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

# Tokenize the data
# NOTE(review): the `preprocess_function` defined in the earlier cells reads
# the 'article' and 'highlights' columns, while these datasets only carry
# 'page' and 'cleaned_text' -- verify which preprocessing is intended here.
tokenized_datasets = dataset_dict.map(preprocess_function, batched=True)
print(tokenized_datasets)


In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

# The CSV is expected to provide 'chapter' and 'text' columns; wrap the
# resulting frame as a Hugging Face Dataset.
df = pd.read_csv('grade10_science_textbook.csv')
dataset = Dataset.from_pandas(df=df)

# Tokenization
from transformers import AutoTokenizer

# Rebinds the global `tokenizer` used by the functions in this notebook.
# NOTE(review): verify that 'meta-llama/llama-7b' is a checkpoint id you can
# actually download (name and access rights) -- TODO confirm.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/llama-7b')

def preprocess_function(examples):
    """Tokenize the 'text' column of a batch, truncating at 1024 tokens.

    Returns the tokenizer output unchanged; no 'labels' entry is added.
    """
    return tokenizer(examples['text'], max_length=1024, truncation=True)

# Tokenize the textbook dataset in batches
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)


In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load the model, configure training and run a Seq2SeqTrainer fine-tuning loop.
#
# NOTE(review): LLaMA checkpoints are decoder-only; confirm that
# AutoModelForSeq2SeqLM can load this checkpoint, or switch to a causal-LM
# class or a seq2seq checkpoint -- TODO confirm.
model = AutoModelForSeq2SeqLM.from_pretrained('meta-llama/llama-7b')

# Evaluate once per epoch, keep at most three checkpoints, and generate
# sequences during evaluation (predict_with_generate) for text metrics.
# NOTE(review): recent transformers releases spell `evaluation_strategy` as
# `eval_strategy` -- check against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

# NOTE(review): in the cell directly above, `tokenized_datasets` is produced
# from a single `Dataset.from_pandas(df)` with no train/validation split, and
# its preprocess_function adds no 'labels'. The 'train'/'validation' lookups
# below and the loss computation both depend on those -- verify the data prep.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
)

# Runs the fine-tuning loop; checkpoints go to output_dir ('./results').
trainer.train()


In [None]:
# Example evaluation using ROUGE
from datasets import load_metric

# ROUGE scorer used by compute_metrics below.
# NOTE(review): `datasets.load_metric` is deprecated in favour of the separate
# `evaluate` package -- confirm it still exists in the installed datasets version.
rouge = load_metric("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures for a batch of generated summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.

    Returns:
        A dict mapping each ROUGE key returned by ``rouge.compute`` to the
        ``mid.fmeasure`` of its aggregate score.
    """
    import numpy as np

    predictions, labels = eval_pred

    # -100 is the ignore index used to pad ids during evaluation; it is not a
    # real token id, so swap it for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(np.asarray(predictions) != -100, predictions, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {key: value.mid.fmeasure for key, value in result.items()}

# The trainer was constructed without a compute_metrics callback, so attach
# the ROUGE function before evaluating; otherwise it is never called.
trainer.compute_metrics = compute_metrics
results = trainer.evaluate()
print(results)


In [None]:
from transformers import pipeline

# Wrap the already-loaded model and tokenizer in a summarization pipeline
summarizer = pipeline(task="summarization", model=model, tokenizer=tokenizer)

# Example usage: swap in real chapter text
text = "Your textbook chapter text here."
summary = summarizer(
    text,
    max_length=150,
    min_length=40,
    do_sample=False,
)
print(summary)


In [None]:
from flask import Flask, request, jsonify
app = Flask(__name__)


@app.route('/summarize', methods=['POST'])
def summarize():
    """Summarize the 'text' field of a JSON POST body.

    Returns:
        The summarizer output as JSON, or a 400 response with an 'error'
        message when the body is not a JSON object holding a non-empty
        'text' string.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request ends in the same explicit 400 below.
    content = request.get_json(silent=True)
    text = content.get('text') if isinstance(content, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': "Request body must be JSON with a non-empty 'text' string."}), 400

    summary = summarizer(text, max_length=150, min_length=40, do_sample=False)
    return jsonify(summary)

# Start Flask's built-in server when this code runs as the main module.
if __name__ == '__main__':
    # NOTE(review): host='0.0.0.0' listens on every network interface, which
    # exposes the endpoint beyond this machine -- confirm that is intended.
    app.run(host='0.0.0.0', port=5000)
