**PREPROCESSING**

In [None]:
!pip install nltk scikit-learn numpy


In [None]:
import json
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below: the sentence/word tokenizer models,
# the English stopword list, and the WordNet data behind the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because NLTK 3.9+ loads its
# tokenizer models from that package instead of the pickled 'punkt' one.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def load_json(file_path):
    """Read a JSON document and return the text stored under its 'body' key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value of the top-level 'body' entry of the parsed document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the parsed document has no 'body' key.
    """
    # Decode explicitly as UTF-8 so non-ASCII article text is read the same
    # way on every platform instead of depending on the locale's default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data['body']  # Assuming the text is under the key 'body'

def preprocess_text(text):
    """Split *text* into sentences and build a normalized copy of each one.

    Args:
        text: The document to process.

    Returns:
        A ``(sentences, preprocessed_sentences)`` pair of equal-length lists:
        the sentences as produced by ``sent_tokenize`` and, for each of them,
        its lower-cased, purely alphabetic, non-stopword tokens after
        lemmatization, joined by single spaces.
    """
    original_sentences = sent_tokenize(text)

    lemmatize = WordNetLemmatizer().lemmatize
    english_stopwords = set(stopwords.words('english'))

    def normalize(sentence):
        """Return the cleaned, space-joined tokens of one sentence."""
        kept = []
        for token in word_tokenize(sentence.lower()):
            # Drop punctuation/numbers and stopwords before lemmatizing.
            if not token.isalpha() or token in english_stopwords:
                continue
            kept.append(lemmatize(token))
        return ' '.join(kept)

    return original_sentences, [normalize(s) for s in original_sentences]

def rank_sentences(sentences, preprocessed_sentences, top_n=3):
    """Build an extractive summary from the highest-scoring sentences.

    Every preprocessed sentence is scored by the sum of its TF-IDF weights,
    with the vectorizer fitted on the sentences of this document only. The
    ``top_n`` best-scoring sentences are returned in their original order.

    Args:
        sentences: Original sentences, used for the output text.
        preprocessed_sentences: Cleaned sentences, index-aligned with
            ``sentences``, used for scoring.
        top_n: Maximum number of sentences to keep.

    Returns:
        The selected original sentences joined by single spaces, or an empty
        string when there are no sentences or ``top_n`` is not positive.

    Raises:
        ValueError: NOTE(review): TfidfVectorizer is expected to raise this
            when no sentence contains a usable term — confirm.
    """
    # Nothing to rank. The explicit top_n check matters: a slice of [-0:]
    # would select every sentence instead of none.
    if top_n <= 0 or not sentences or not preprocessed_sentences:
        return ''

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

    # Compute sentence scores (one row of the matrix per sentence)
    sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

    # Take the top N indices, then sort them to restore document order
    top_sentence_indices = np.sort(np.argsort(sentence_scores)[-top_n:])

    return ' '.join(sentences[i] for i in top_sentence_indices)

def main():
    """Run the extractive pipeline on 'bbcsample.json' and print the result."""
    article = load_json('bbcsample.json')

    # Keep the raw sentences for output and the cleaned ones for scoring.
    raw_sentences, cleaned_sentences = preprocess_text(article)

    # Ten best sentences, in document order.
    summary = rank_sentences(raw_sentences, cleaned_sentences, top_n=10)

    print("Original Text:\n", article)
    print("\nExtractive Summary:\n", summary)


if __name__ == "__main__":
    main()


In [None]:
import json
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

# Download the NLTK resources used below: the sentence/word tokenizer models,
# the English stopword list, and the WordNet data behind the lemmatizer.
# 'punkt_tab' is requested alongside 'punkt' because NLTK 3.9+ loads its
# tokenizer models from that package instead of the pickled 'punkt' one.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def load_json(file_path):
    """Read a JSON document and return the text stored under its 'body' key.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value of the top-level 'body' entry of the parsed document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the parsed document has no 'body' key.
    """
    # Decode explicitly as UTF-8 so non-ASCII article text is read the same
    # way on every platform instead of depending on the locale's default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data['body']  # Assuming the text is under the key 'body'

def preprocess_text(text):
    """Tokenize *text* into sentences and produce a cleaned form of each.

    Args:
        text: The document to process.

    Returns:
        A tuple ``(sentences, preprocessed_sentences)``. The first list holds
        the sentences returned by ``sent_tokenize``; the second holds, at the
        same index, that sentence lower-cased, reduced to alphabetic
        non-stopword tokens, lemmatized, and re-joined with single spaces.
    """
    raw_sentences = sent_tokenize(text)

    wordnet = WordNetLemmatizer()
    ignored = set(stopwords.words('english'))

    cleaned_sentences = [
        ' '.join(
            wordnet.lemmatize(token)
            for token in word_tokenize(raw.lower())
            if token.isalpha() and token not in ignored
        )
        for raw in raw_sentences
    ]

    return raw_sentences, cleaned_sentences

def rank_sentences(sentences, preprocessed_sentences, top_n=30):
    """Build an extractive summary from the highest-scoring sentences.

    Every preprocessed sentence is scored by the sum of its TF-IDF weights,
    with the vectorizer fitted on the sentences of this document only. The
    ``top_n`` best-scoring sentences are returned in their original order.

    Args:
        sentences: Original sentences, used for the output text.
        preprocessed_sentences: Cleaned sentences, index-aligned with
            ``sentences``, used for scoring.
        top_n: Maximum number of sentences to keep.

    Returns:
        The selected original sentences joined by single spaces, or an empty
        string when there are no sentences or ``top_n`` is not positive.

    Raises:
        ValueError: NOTE(review): TfidfVectorizer is expected to raise this
            when no sentence contains a usable term — confirm.
    """
    # Nothing to rank. The explicit top_n check matters: a slice of [-0:]
    # would select every sentence instead of none.
    if top_n <= 0 or not sentences or not preprocessed_sentences:
        return ''

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(preprocessed_sentences)

    # Compute sentence scores (one row of the matrix per sentence)
    sentence_scores = np.sum(tfidf_matrix.toarray(), axis=1)

    # Take the top N indices, then sort them to restore document order
    top_sentence_indices = np.sort(np.argsort(sentence_scores)[-top_n:])

    return ' '.join(sentences[i] for i in top_sentence_indices)

def generate_abstractive_summary(extractive_summary):
    """Rewrite an extractive summary with the BigBird-Pegasus arXiv checkpoint.

    The model and tokenizer are loaded from the pretrained checkpoint on
    every call.

    Args:
        extractive_summary: Text to summarize; it is tokenized with
            truncation at 4096 tokens.

    Returns:
        The decoded text of the first generated sequence (beam search with
        4 beams, ``max_length=500``), with special tokens removed.
    """
    checkpoint = "google/bigbird-pegasus-large-arxiv"
    model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    encoded = tokenizer(
        [extractive_summary],
        max_length=4096,
        return_tensors="pt",
        truncation=True,
    )

    # Beam-search decoding over the encoded input
    generated = model.generate(encoded["input_ids"], num_beams=4, max_length=500)
    decoded = tokenizer.batch_decode(
        generated,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]

def main():
    """Summarize 'bbcsample.json' extractively, then abstractively.

    Prints the original text and both summaries, and writes the two
    summaries to 'summary_output.json'.
    """
    article = load_json('bbcsample.json')

    # Raw sentences are kept for output, cleaned ones are used for scoring.
    raw_sentences, cleaned_sentences = preprocess_text(article)

    # The extractive pass picks the input for the abstractive model.
    extractive_summary = rank_sentences(raw_sentences, cleaned_sentences, top_n=30)
    abstractive_summary = generate_abstractive_summary(extractive_summary)

    print("Original Text:\n", article)
    print("\nExtractive Summary:\n", extractive_summary)
    print("\nAbstractive Summary:\n", abstractive_summary)

    # Persist both summaries so later cells can reload them.
    with open('summary_output.json', 'w') as outfile:
        json.dump(
            {
                "extractive_summary": extractive_summary,
                "abstractive_summary": abstractive_summary,
            },
            outfile,
        )


if __name__ == "__main__":
    main()


In [None]:
import json

# Reload the saved summaries and keep the abstractive one for the image steps
with open('summary_output.json', 'r') as infile:
    data = json.loads(infile.read())

abstractive_summary = data['abstractive_summary']
print(abstractive_summary)

In [None]:
import os
from PIL import Image

# Path to the images folder
images_folder = 'images'

# Collect the image paths. os.listdir returns entries in arbitrary order, so
# the names are sorted to make every index derived from this list
# reproducible. Lower-casing the name also picks up files such as 'X.JPG'.
image_files = [
    os.path.join(images_folder, img)
    for img in sorted(os.listdir(images_folder))
    if img.lower().endswith(('.png', '.jpg', '.jpeg'))
]

# Decode each image inside a context manager so no file handle stays open.
# convert("RGB") returns an independent, fully loaded copy, which also gives
# every image the same mode before it is handed to a model.
images = []
for img_file in image_files:
    with Image.open(img_file) as opened:
        images.append(opened.convert("RGB"))


In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

if not images:
    raise ValueError("No images were loaded; cannot rank images against the summary")

# Prepare CLIP inputs. One copy of the summary is enough: repeating it once
# per image would make every text column identical, so a softmax across the
# texts would be uniform and the resulting "ranking" meaningless.
texts = [abstractive_summary]

# Process images and texts (the text is truncated to CLIP's 77-token limit)
image_inputs = processor(images=images, return_tensors="pt", padding=True)
text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77)

# Move tensors to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}

# Calculate image-text similarity scores
with torch.no_grad():  # Disable gradient calculation
    outputs = model(**{**image_inputs, **text_inputs})

# logits_per_text has one row per text and one column per image, so the
# softmax over dim=1 distributes the summary's probability across the images.
logits_per_text = outputs.logits_per_text
probs = logits_per_text.softmax(dim=1)

# Expected shape: (1, number of images)
print(probs.shape)

# Extract top 5 image indices (fewer if fewer images are available)
num_top_images = min(5, probs.shape[1])
top_indices = torch.topk(probs, num_top_images, dim=1).indices[0].cpu().numpy()

# Print the indices (into image_files) of the best-matching images
print(f"Top indices: {top_indices}")


In [None]:
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

# Load BLIP-2 model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision is only used on the GPU; on CPU the model stays in float32,
# where float16 kernels may be missing or very slow.
model_dtype = torch.float16 if device == "cuda" else torch.float32
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=model_dtype)
model.to(device)

# Function to generate caption
def generate_caption(image_path):
    """Generate a short caption for the image stored at *image_path*.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption (at most 20 newly generated tokens), with
        special tokens removed and surrounding whitespace stripped.
    """
    # convert() returns a loaded copy, so the file can be closed right away.
    with Image.open(image_path) as opened:
        image = opened.convert("RGB")

    # Cast the inputs to whatever dtype the model was loaded in, so the two
    # always agree regardless of device.
    inputs = processor(image, return_tensors="pt").to(device, model.dtype)

    generated_ids = model.generate(**inputs, max_new_tokens=20)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return generated_text

# Map the ranked indices back to image file paths
top_images = [image_files[index] for index in top_indices]
print(top_images)

# Caption every selected image, keeping the ranking order
captions = list(map(generate_caption, top_images))

# Merge all captions into one space-separated string
concatenated_captions = " ".join(captions)

print(concatenated_captions)


In [None]:
from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

# Load the BigBird-Pegasus arXiv checkpoint (model and matching tokenizer)
bigbird_model = BigBirdPegasusForConditionalGeneration.from_pretrained(
    "google/bigbird-pegasus-large-arxiv"
)
bigbird_tokenizer = AutoTokenizer.from_pretrained(
    "google/bigbird-pegasus-large-arxiv"
)

# Encode the concatenated captions, truncating at 4096 tokens
inputs = bigbird_tokenizer(
    [concatenated_captions],
    max_length=4096,
    return_tensors="pt",
    truncation=True,
)

# Beam-search a summary of the captions and decode the first sequence
summary_ids = bigbird_model.generate(
    inputs["input_ids"],
    num_beams=4,
    max_length=200,
)
summary = bigbird_tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

# Show the caption summary
print(summary)


In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import os

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

if not top_images:
    raise ValueError("No candidate images available; cannot rank images against the summary")

# Prepare CLIP inputs. One copy of the summary is enough: repeating it once
# per image would make every text column identical, so a softmax across the
# texts would be uniform and the resulting "ranking" meaningless.
texts = [summary]
text_inputs = clip_processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77)

# Decode each candidate inside a context manager so its file handle is
# released; convert("RGB") returns an independent, fully loaded copy.
candidate_images = []
for img_path in top_images:
    with Image.open(img_path) as opened:
        candidate_images.append(opened.convert("RGB"))
image_inputs = clip_processor(images=candidate_images, return_tensors="pt", padding=True)

# Move tensors to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model.to(device)
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}

# Calculate image-text similarity scores
with torch.no_grad():  # Disable gradient calculation
    outputs = clip_model(**{**image_inputs, **text_inputs})

# logits_per_text has one row per text and one column per image, so the
# softmax over dim=1 distributes the summary's probability across the images.
logits_per_text = outputs.logits_per_text
probs = logits_per_text.softmax(dim=1)

# Extract top 2 image indices (fewer if fewer images are available)
num_top_images = min(2, probs.shape[1])
top_indices = torch.topk(probs, num_top_images, dim=1).indices[0].cpu().numpy()

# Print the indices (into top_images) of the best-matching images
print(f"Top indices: {top_indices}")

# Load top 2 image paths
top_2_images = [top_images[i] for i in top_indices]

# Print or save the top 2 images
for idx, img_path in enumerate(top_2_images):
    print(f"Top image {idx + 1}: {img_path}")
