**PREPROCESSING**

In [None]:
!pip install nltk scikit-learn numpy


In [None]:
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download the NLTK data files used by the preprocessing functions below:
#   punkt / punkt_tab -> tokenizer data used by word_tokenize
#   stopwords         -> source of the English stop-word list
#   wordnet           -> data used by WordNetLemmatizer
# 'punkt_tab' is the resource recent NLTK releases look up for word_tokenize;
# 'punkt' is kept so older releases keep working.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

def load_tweets(file_path):
    """Load the tweets stored in a JSON file.

    Args:
        file_path: Path to a JSON document whose top-level object has a
            'tweets' key.

    Returns:
        The value stored under the 'tweets' key (the rest of this notebook
        treats it as a list of tweet strings).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the document has no 'tweets' key.
    """
    # Read as UTF-8 explicitly: tweets routinely contain emoji and accented
    # characters, and the platform default encoding is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data['tweets']

# Lazily-built NLTK helpers shared by every preprocess_tweet() call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (lemmatizer, stop_words) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_RESOURCES['lemmatizer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_tweet(tweet):
    """Clean a single tweet and return it as a space-joined string of lemmas.

    Steps:
        1. Strip URLs (``http...``), mentions (``@...``) and hashtags (``#...``).
        2. Lowercase and tokenize with ``word_tokenize``.
        3. Keep only purely alphabetic tokens that are not English stop words.
        4. Lemmatize the surviving tokens.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # Remove URLs, mentions and hashtags (the whole hashtag token is dropped)
    tweet = re.sub(r'http\S+|@\S+|#\S+', '', tweet)

    # Tokenize and lowercase
    words = word_tokenize(tweet.lower())

    # The lemmatizer and stop-word set do not depend on the tweet, so they are
    # created once and reused instead of being rebuilt for every tweet.
    lemmatizer, stop_words = _get_preprocess_resources()

    # Lemmatize and remove stopwords and non-alphabetic tokens
    words = [
        lemmatizer.lemmatize(word)
        for word in words
        if word.isalpha() and word not in stop_words
    ]

    return ' '.join(words)

def preprocess_tweets(tweets):
    """Apply ``preprocess_tweet`` to every tweet.

    Args:
        tweets: Iterable of raw tweet strings.

    Returns:
        A 2-tuple ``(tweets, cleaned)``: the input object as received, and a
        new list holding the preprocessed text of each tweet in the same order.
    """
    cleaned = list(map(preprocess_tweet, tweets))
    return tweets, cleaned

def main():
    """Load the sample tweets, preprocess them, and print both versions."""
    # Read the raw tweets from the sample file
    raw_tweets = load_tweets('tweetsample.json')

    # Clean every tweet; the originals are handed back alongside the cleaned text
    tweets, preprocessed_tweets = preprocess_tweets(raw_tweets)

    print("Original Tweets:\n", tweets)
    print("\nPreprocessed Tweets:\n", preprocessed_tweets)


if __name__ == "__main__":
    main()


**EXTRACTIVE PHASE**

In [None]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load spaCy's small English pipeline once at module level; get_bigrams()
# below calls `nlp(...)` to tokenize each preprocessed tweet.
# NOTE(review): assumes the 'en_core_web_sm' model is already installed in the
# kernel's environment - no visible cell downloads it.
nlp = spacy.load('en_core_web_sm')

def get_bigrams(preprocessed_tweets):
    """Collect the adjacent-word pairs of every tweet into one flat list.

    Each tweet is tokenized with the module-level ``nlp`` pipeline, tokens
    that are not alphabetic are dropped, and every pair of neighbouring
    remaining tokens is emitted as the string ``"first second"``.

    Args:
        preprocessed_tweets: Iterable of cleaned tweet strings.

    Returns:
        A list of bigram strings, in tweet order and then token order
        (duplicates are kept).
    """
    collected = []

    for text in preprocessed_tweets:
        alpha_tokens = [tok.text for tok in nlp(text) if tok.is_alpha]

        # Pair each token with its right-hand neighbour
        collected.extend(
            f'{left} {right}' for left, right in zip(alpha_tokens, alpha_tokens[1:])
        )

    return collected

def calculate_tfidf_bigrams(preprocessed_tweets, top_k=10):
    """Return the ``top_k`` bigrams with the highest total TF-IDF weight.

    Every tweet is one TF-IDF document and every bigram produced by
    ``get_bigrams`` for that tweet is one term. A bigram's score is the sum of
    its TF-IDF weight over all tweets.

    Args:
        preprocessed_tweets: Iterable of cleaned tweet strings.
        top_k: Maximum number of bigrams to return.

    Returns:
        A list of at most ``top_k`` bigram strings ordered by ascending score
        (the highest-scoring bigram is last). Returns ``[]`` when ``top_k`` is
        not positive or when no tweet yields a bigram.
    """
    if top_k <= 0:
        return []

    # Extract bigrams tweet by tweet so each tweet stays a separate document.
    bigrams_per_tweet = [get_bigrams([tweet]) for tweet in preprocessed_tweets]
    if not any(bigrams_per_tweet):
        # Nothing to score; fitting a vectorizer would raise on an empty vocabulary.
        return []

    # The identity analyzer makes the vectorizer use each bigram string as a
    # single term. The default analyzer would split "w1 w2" back into the two
    # separate words and rank unigrams instead of bigrams.
    vectorizer = TfidfVectorizer(analyzer=lambda bigrams: bigrams)
    tfidf_matrix = vectorizer.fit_transform(bigrams_per_tweet)

    # Column sums taken directly on the matrix (no dense copy needed)
    bigram_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    feature_names = vectorizer.get_feature_names_out()

    # argsort is ascending, so the last top_k positions are the best-scoring terms
    top_bigram_indices = np.argsort(bigram_scores)[-top_k:]
    return [feature_names[i] for i in top_bigram_indices]

def retrieve_tweets_with_bigrams(tweets, preprocessed_tweets, top_bigrams):
    """Select the original tweets whose cleaned text contains a top bigram.

    ``tweets`` and ``preprocessed_tweets`` are walked in parallel; a tweet is
    kept when at least one entry of ``top_bigrams`` occurs as a substring of
    its preprocessed text.

    Args:
        tweets: Original tweet strings.
        preprocessed_tweets: Cleaned text of each tweet, in the same order.
        top_bigrams: Bigram strings to look for.

    Returns:
        A list of the matching original tweets, in input order, each included
        at most once.
    """
    return [
        original
        for original, cleaned in zip(tweets, preprocessed_tweets)
        # any() stops at the first hit, so a tweet is never added twice
        if any(bigram in cleaned for bigram in top_bigrams)
    ]

def rank_tweets(tweets, preprocessed_tweets, top_k=10):
    """Build an extractive summary from the tweets that contain key bigrams.

    Args:
        tweets: Original tweet strings.
        preprocessed_tweets: Cleaned text of each tweet, in the same order.
        top_k: Number of top bigrams requested from ``calculate_tfidf_bigrams``.

    Returns:
        The selected original tweets joined by single spaces.
    """
    # 1) Key bigrams according to TF-IDF
    key_bigrams = calculate_tfidf_bigrams(preprocessed_tweets, top_k)

    # 2) Original tweets that mention at least one of them
    matching = retrieve_tweets_with_bigrams(tweets, preprocessed_tweets, key_bigrams)

    # 3) The summary is simply the concatenation of those tweets
    return ' '.join(matching)

def main():
    """Load and preprocess the sample tweets, then print an extractive summary."""
    # Read the raw tweets from the sample file
    raw_tweets = load_tweets('tweetsample.json')

    # Clean every tweet; the originals are handed back alongside the cleaned text
    tweets, preprocessed_tweets = preprocess_tweets(raw_tweets)

    # Summarise using the tweets that contain the 10 top-ranked bigrams
    extractive_summary = rank_tweets(tweets, preprocessed_tweets, top_k=10)

    print("\nExtractive Summary of Tweets:\n", extractive_summary)


if __name__ == "__main__":
    main()


**ABSTRACTIVE PHASE**

In [None]:
import json

# Load the JSON file to get the abstractive summary.
# NOTE(review): 'summary_output.json' is not written by any visible cell -
# confirm where it is produced.
# Read as UTF-8 explicitly so non-ASCII characters in the summary decode the
# same way on every platform.
with open('summary_output.json', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

# Text of the summary; later cells use it as the CLIP text query
abstractive_summary = data['abstractive_summary']
print(abstractive_summary)

In [None]:
import os
from PIL import Image

# Path to the images folder
images_folder = 'images'


def _load_rgb_image(path):
    """Open ``path``, decode it as an RGB image, and close the underlying file."""
    with Image.open(path) as img:
        # convert() decodes the pixel data and returns a new image, so the
        # file handle can be released as soon as the `with` block ends.
        return img.convert("RGB")


# Collect the image paths.
# - sorted(): os.listdir() returns entries in arbitrary order; sorting keeps the
#   index -> file mapping used by later cells stable between runs.
# - lower(): also accept upper-case extensions such as '.JPG'.
image_files = sorted(
    os.path.join(images_folder, name)
    for name in os.listdir(images_folder)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# Load every image as RGB (same mode the later CLIP/BLIP cells convert to)
images = [_load_rgb_image(img_file) for img_file in image_files]


In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Load CLIP model and processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Prepare CLIP inputs: ONE text query (the summary) scored against every image.
# Repeating the summary once per image would only create identical text columns,
# and a softmax/top-k across those columns ranks the duplicated texts, not the images.
texts = [abstractive_summary]

# Process images and texts (the text is truncated to 77 tokens via max_length)
image_inputs = processor(images=images, return_tensors="pt", padding=True)
text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77)

# Move tensors to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}

# Calculate image-text similarity scores
with torch.no_grad():  # Disable gradient calculation
    outputs = model(**{**image_inputs, **text_inputs})

logits_per_image = outputs.logits_per_image  # shape (num_images, num_texts) = (num_images, 1)
logits_per_text = outputs.logits_per_text  # shape (num_texts, num_images) = (1, num_images)

# Softmax across the image axis: how strongly the summary matches each image
# relative to the other images.
probs = logits_per_text.softmax(dim=1)

# Check the shape of probs: expected (1, num_images)
print(probs.shape)

# Extract top 10 image indices (positions into `images` / `image_files`)
num_top_images = min(10, probs.shape[1])  # Ensure we only take as many as are available
top_indices = torch.topk(probs, num_top_images, dim=1).indices[0].cpu().numpy()

# Print the indices of the top images, best match first
print(f"Top indices: {top_indices}")


In [None]:
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

# Load BLIP-2 model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"

# Use half precision only on the GPU. On CPU fall back to float32, because
# float16 inference is not reliably supported there.
blip_dtype = torch.float16 if device == "cuda" else torch.float32

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=blip_dtype)
model.to(device)

# Function to generate caption
def generate_caption(image_path):
    """Generate a short caption for the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption text with special tokens removed and surrounding
        whitespace stripped (at most 20 newly generated tokens).
    """
    # Decode as RGB inside a `with` block so the file handle is released promptly
    with Image.open(image_path) as img:
        image = img.convert("RGB")

    # Inputs must live on the same device and use the same dtype as the model
    inputs = processor(image, return_tensors="pt").to(device, blip_dtype)

    generated_ids = model.generate(**inputs, max_new_tokens=20)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return generated_text

# Load top 10 image paths (top_indices comes from the CLIP ranking cell)
top_images = [image_files[i] for i in top_indices]
print(top_images)

# Generate captions for top 10 images
captions = [generate_caption(img_path) for img_path in top_images]

# Concatenate captions into one space-separated string
concatenated_captions = " ".join(captions)

print(concatenated_captions)


In [None]:
from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

# BigBird-Pegasus tokenizer and model, both from the same checkpoint
bigbird_tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
bigbird_model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv")

# Encode the concatenated captions as a batch of one, truncated to 4096 tokens
inputs = bigbird_tokenizer(
    [concatenated_captions],
    max_length=4096,
    return_tensors="pt",
    truncation=True,
)

# Beam-search generation (4 beams, up to 200 tokens)
summary_ids = bigbird_model.generate(
    inputs["input_ids"],
    num_beams=4,
    max_length=200,
)

# Decode the single generated sequence back to text
summary = bigbird_tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)[0]

# Show the summary
print(summary)


In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch
from PIL import Image
import os

# Load CLIP model and processor
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Prepare CLIP inputs: ONE text query (the caption-based summary) scored against
# every candidate image. Repeating the text once per image would only create
# identical text columns, and a softmax/top-k across those columns ranks the
# duplicated texts, not the images.
texts = [summary]
text_inputs = clip_processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77)
image_inputs = clip_processor(images=[Image.open(img_path).convert("RGB") for img_path in top_images], return_tensors="pt", padding=True)

# Move tensors to the appropriate device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
clip_model.to(device)
text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
image_inputs = {k: v.to(device) for k, v in image_inputs.items()}

# Calculate image-text similarity scores
with torch.no_grad():  # Disable gradient calculation
    outputs = clip_model(**{**image_inputs, **text_inputs})

logits_per_image = outputs.logits_per_image  # shape (num_images, num_texts) = (num_images, 1)
logits_per_text = outputs.logits_per_text  # shape (num_texts, num_images) = (1, num_images)

# Softmax across the image axis: how strongly the summary matches each image
# relative to the other candidates.
probs = logits_per_text.softmax(dim=1)

# Extract top 4 image indices (positions into `top_images`)
num_top_images = min(4, probs.shape[1])  # Ensure we only take as many as are available
top_indices = torch.topk(probs, num_top_images, dim=1).indices[0].cpu().numpy()

# Print the indices of the top 4 images, best match first
print(f"Top indices: {top_indices}")

# Look up the paths of the top 4 images
top_4_images = [top_images[i] for i in top_indices]

# Print the top 4 image paths
for idx, img_path in enumerate(top_4_images):
    print(f"Top image {idx + 1}: {img_path}")
